diff options
Diffstat (limited to 'fs/ceph')
| -rw-r--r-- | fs/ceph/Kconfig | 26 | ||||
| -rw-r--r-- | fs/ceph/Makefile | 25 | ||||
| -rw-r--r-- | fs/ceph/acl.c | 194 | ||||
| -rw-r--r-- | fs/ceph/addr.c | 673 | ||||
| -rw-r--r-- | fs/ceph/cache.c | 402 | ||||
| -rw-r--r-- | fs/ceph/cache.h | 182 | ||||
| -rw-r--r-- | fs/ceph/caps.c | 1065 | ||||
| -rw-r--r-- | fs/ceph/debugfs.c | 27 | ||||
| -rw-r--r-- | fs/ceph/dir.c | 612 | ||||
| -rw-r--r-- | fs/ceph/export.c | 266 | ||||
| -rw-r--r-- | fs/ceph/file.c | 1019 | ||||
| -rw-r--r-- | fs/ceph/inode.c | 892 | ||||
| -rw-r--r-- | fs/ceph/ioctl.c | 180 | ||||
| -rw-r--r-- | fs/ceph/ioctl.h | 60 | ||||
| -rw-r--r-- | fs/ceph/locks.c | 271 | ||||
| -rw-r--r-- | fs/ceph/mds_client.c | 801 | ||||
| -rw-r--r-- | fs/ceph/mds_client.h | 67 | ||||
| -rw-r--r-- | fs/ceph/mdsmap.c | 62 | ||||
| -rw-r--r-- | fs/ceph/snap.c | 88 | ||||
| -rw-r--r-- | fs/ceph/strings.c | 7 | ||||
| -rw-r--r-- | fs/ceph/super.c | 274 | ||||
| -rw-r--r-- | fs/ceph/super.h | 288 | ||||
| -rw-r--r-- | fs/ceph/xattr.c | 581 | 
23 files changed, 5547 insertions, 2515 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 9eb134ea6eb..264e9bf83ff 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,6 +1,6 @@  config CEPH_FS -        tristate "Ceph distributed file system (EXPERIMENTAL)" -	depends on INET && EXPERIMENTAL +	tristate "Ceph distributed file system" +	depends on INET  	select CEPH_LIB  	select LIBCRC32C  	select CRYPTO_AES @@ -16,3 +16,25 @@ config CEPH_FS  	  If unsure, say N. +if CEPH_FS +config CEPH_FSCACHE +	bool "Enable Ceph client caching support" +	depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y +	help +	  Choose Y here to enable persistent, read-only local +	  caching support for Ceph clients using FS-Cache + +endif + +config CEPH_FS_POSIX_ACL +	bool "Ceph POSIX Access Control Lists" +	depends on CEPH_FS +	select FS_POSIX_ACL +	help +	  POSIX Access Control Lists (ACLs) support permissions for users and +	  groups beyond the owner/group/world scheme. + +	  To learn more about Access Control Lists, visit the POSIX ACLs for +	  Linux website <http://acl.bestbits.at/>. + +	  If you don't know what Access Control Lists are, say N diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 9e6c4f2e8ff..85a4230b9bf 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -2,31 +2,12 @@  # Makefile for CEPH filesystem.  # -ifneq ($(KERNELRELEASE),) -  obj-$(CONFIG_CEPH_FS) += ceph.o -ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \  	export.o caps.o snap.o xattr.o \  	mds_client.o mdsmap.o strings.o ceph_frag.o \  	debugfs.o -else -#Otherwise we were called directly from the command -# line; invoke the kernel build system. 
- -KERNELDIR ?= /lib/modules/$(shell uname -r)/build -PWD := $(shell pwd) - -default: all - -all: -	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules - -modules_install: -	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install - -clean: -	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean - -endif +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 00000000000..469f2e8657e --- /dev/null +++ b/fs/ceph/acl.c @@ -0,0 +1,194 @@ +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, +					int type, struct posix_acl *acl) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); + +	spin_lock(&ci->i_ceph_lock); +	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) +		set_cached_acl(inode, type, acl); +	spin_unlock(&ci->i_ceph_lock); +} + +static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, +							int type) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct posix_acl *acl = ACL_NOT_CACHED; + +	spin_lock(&ci->i_ceph_lock); +	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) +		acl = get_cached_acl(inode, type); +	spin_unlock(&ci->i_ceph_lock); + +	return acl; +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ +	int size; +	const char *name; +	char *value = NULL; +	struct posix_acl *acl; + +	switch (type) { +	case ACL_TYPE_ACCESS: +		name = POSIX_ACL_XATTR_ACCESS; +		break; +	case ACL_TYPE_DEFAULT: +		name = POSIX_ACL_XATTR_DEFAULT; +		break; +	default: +		BUG(); +	} + +	size = __ceph_getxattr(inode, name, "", 0); +	if (size > 0) { +		value = kzalloc(size, GFP_NOFS); +		if (!value) +			return ERR_PTR(-ENOMEM); +		size = __ceph_getxattr(inode, name, value, size); +	} + +	if (size > 0) +		acl = posix_acl_from_xattr(&init_user_ns, value, size); +	else if (size == -ERANGE || size == -ENODATA || size == 0) +		acl = NULL; +	else +		acl = ERR_PTR(-EIO); + +	kfree(value); + +	if (!IS_ERR(acl)) +		ceph_set_cached_acl(inode, type, acl); + +	return acl; +} + +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ +	int ret = 0, size = 0; +	const char *name = NULL; +	char *value = NULL; +	struct iattr newattrs; +	umode_t new_mode = inode->i_mode, old_mode = 
inode->i_mode; +	struct dentry *dentry; + +	switch (type) { +	case ACL_TYPE_ACCESS: +		name = POSIX_ACL_XATTR_ACCESS; +		if (acl) { +			ret = posix_acl_equiv_mode(acl, &new_mode); +			if (ret < 0) +				goto out; +			if (ret == 0) +				acl = NULL; +		} +		break; +	case ACL_TYPE_DEFAULT: +		if (!S_ISDIR(inode->i_mode)) { +			ret = acl ? -EINVAL : 0; +			goto out; +		} +		name = POSIX_ACL_XATTR_DEFAULT; +		break; +	default: +		ret = -EINVAL; +		goto out; +	} + +	if (acl) { +		size = posix_acl_xattr_size(acl->a_count); +		value = kmalloc(size, GFP_NOFS); +		if (!value) { +			ret = -ENOMEM; +			goto out; +		} + +		ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); +		if (ret < 0) +			goto out_free; +	} + +	dentry = d_find_alias(inode); +	if (new_mode != old_mode) { +		newattrs.ia_mode = new_mode; +		newattrs.ia_valid = ATTR_MODE; +		ret = ceph_setattr(dentry, &newattrs); +		if (ret) +			goto out_dput; +	} + +	ret = __ceph_setxattr(dentry, name, value, size, 0); +	if (ret) { +		if (new_mode != old_mode) { +			newattrs.ia_mode = old_mode; +			newattrs.ia_valid = ATTR_MODE; +			ceph_setattr(dentry, &newattrs); +		} +		goto out_dput; +	} + +	ceph_set_cached_acl(inode, type, acl); + +out_dput: +	dput(dentry); +out_free: +	kfree(value); +out: +	return ret; +} + +int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) +{ +	struct posix_acl *default_acl, *acl; +	int error; + +	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); +	if (error) +		return error; + +	if (!default_acl && !acl) +		cache_no_acl(inode); + +	if (default_acl) { +		error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); +		posix_acl_release(default_acl); +	} +	if (acl) { +		if (!error) +			error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); +		posix_acl_release(acl); +	} +	return error; +} diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e9c874abc9e..90b3954d48e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -11,6 +11,7 @@  #include "super.h"  
#include "mds_client.h" +#include "cache.h"  #include <linux/ceph/osd_client.h>  /* @@ -24,7 +25,7 @@   * context needs to be associated with the osd write during writeback.   *   * Similarly, struct ceph_inode_info maintains a set of counters to - * count dirty pages on the inode.  In the absense of snapshots, + * count dirty pages on the inode.  In the absence of snapshots,   * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.   *   * When a snapshot is taken (that is, when the client receives @@ -54,7 +55,12 @@  	(CONGESTION_ON_THRESH(congestion_kb) -				\  	 (CONGESTION_ON_THRESH(congestion_kb) >> 2)) - +static inline struct ceph_snap_context *page_snap_context(struct page *page) +{ +	if (PagePrivate(page)) +		return (void *)page->private; +	return NULL; +}  /*   * Dirty a page.  Optimistically adjust accounting, on the assumption @@ -65,15 +71,16 @@ static int ceph_set_page_dirty(struct page *page)  	struct address_space *mapping = page->mapping;  	struct inode *inode;  	struct ceph_inode_info *ci; -	int undo = 0;  	struct ceph_snap_context *snapc; +	int ret;  	if (unlikely(!mapping))  		return !TestSetPageDirty(page); -	if (TestSetPageDirty(page)) { +	if (PageDirty(page)) {  		dout("%p set_page_dirty %p idx %lu -- already dirty\n",  		     mapping->host, page, page->index); +		BUG_ON(!PagePrivate(page));  		return 0;  	} @@ -87,12 +94,12 @@ static int ceph_set_page_dirty(struct page *page)  	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);  	/* dirty the head */ -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	if (ci->i_head_snapc == NULL)  		ci->i_head_snapc = ceph_get_snap_context(snapc);  	++ci->i_wrbuffer_ref_head;  	if (ci->i_wrbuffer_ref == 0) -		igrab(inode); +		ihold(inode);  	++ci->i_wrbuffer_ref;  	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "  	     "snapc %p seq %lld (%d snaps)\n", @@ -100,37 +107,21 @@ static int ceph_set_page_dirty(struct page *page)  	     ci->i_wrbuffer_ref-1, 
ci->i_wrbuffer_ref_head-1,  	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,  	     snapc, snapc->seq, snapc->num_snaps); -	spin_unlock(&inode->i_lock); - -	/* now adjust page */ -	spin_lock_irq(&mapping->tree_lock); -	if (page->mapping) {	/* Race with truncate? */ -		WARN_ON_ONCE(!PageUptodate(page)); -		account_page_dirtied(page, page->mapping); -		radix_tree_tag_set(&mapping->page_tree, -				page_index(page), PAGECACHE_TAG_DIRTY); - -		/* -		 * Reference snap context in page->private.  Also set -		 * PagePrivate so that we get invalidatepage callback. -		 */ -		page->private = (unsigned long)snapc; -		SetPagePrivate(page); -	} else { -		dout("ANON set_page_dirty %p (raced truncate?)\n", page); -		undo = 1; -	} - -	spin_unlock_irq(&mapping->tree_lock); +	spin_unlock(&ci->i_ceph_lock); -	if (undo) -		/* whoops, we failed to dirty the page */ -		ceph_put_wrbuffer_cap_refs(ci, 1, snapc); +	/* +	 * Reference snap context in page->private.  Also set +	 * PagePrivate so that we get invalidatepage callback. +	 */ +	BUG_ON(PagePrivate(page)); +	page->private = (unsigned long)snapc; +	SetPagePrivate(page); -	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES); +	ret = __set_page_dirty_nobuffers(page); +	WARN_ON(!PageLocked(page)); +	WARN_ON(!page->mapping); -	BUG_ON(!PageDirty(page)); -	return 1; +	return ret;  }  /* @@ -138,18 +129,26 @@ static int ceph_set_page_dirty(struct page *page)   * dirty page counters appropriately.  Only called if there is private   * data on the page.   
*/ -static void ceph_invalidatepage(struct page *page, unsigned long offset) +static void ceph_invalidatepage(struct page *page, unsigned int offset, +				unsigned int length)  {  	struct inode *inode;  	struct ceph_inode_info *ci; -	struct ceph_snap_context *snapc = (void *)page->private; - -	BUG_ON(!PageLocked(page)); -	BUG_ON(!page->private); -	BUG_ON(!PagePrivate(page)); -	BUG_ON(!page->mapping); +	struct ceph_snap_context *snapc = page_snap_context(page);  	inode = page->mapping->host; +	ci = ceph_inode(inode); + +	if (offset != 0 || length != PAGE_CACHE_SIZE) { +		dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", +		     inode, page, page->index, offset, length); +		return; +	} + +	ceph_invalidate_fscache_page(inode, page); + +	if (!PagePrivate(page)) +		return;  	/*  	 * We can get non-dirty pages here due to races between @@ -159,32 +158,28 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)  	if (!PageDirty(page))  		pr_err("%p invalidatepage %p page not dirty\n", inode, page); -	if (offset == 0) -		ClearPageChecked(page); +	ClearPageChecked(page); -	ci = ceph_inode(inode); -	if (offset == 0) { -		dout("%p invalidatepage %p idx %lu full dirty page %lu\n", -		     inode, page, page->index, offset); -		ceph_put_wrbuffer_cap_refs(ci, 1, snapc); -		ceph_put_snap_context(snapc); -		page->private = 0; -		ClearPagePrivate(page); -	} else { -		dout("%p invalidatepage %p idx %lu partial dirty page\n", -		     inode, page, page->index); -	} +	dout("%p invalidatepage %p idx %lu full dirty page\n", +	     inode, page, page->index); + +	ceph_put_wrbuffer_cap_refs(ci, 1, snapc); +	ceph_put_snap_context(snapc); +	page->private = 0; +	ClearPagePrivate(page);  } -/* just a sanity check */  static int ceph_releasepage(struct page *page, gfp_t g)  {  	struct inode *inode = page->mapping ? 
page->mapping->host : NULL;  	dout("%p releasepage %p idx %lu\n", inode, page, page->index);  	WARN_ON(PageDirty(page)); -	WARN_ON(page->private); -	WARN_ON(PagePrivate(page)); -	return 0; + +	/* Can we release the page from the cache? */ +	if (!ceph_release_fscache_page(page, g)) +		return 0; + +	return !PagePrivate(page);  }  /* @@ -192,29 +187,39 @@ static int ceph_releasepage(struct page *page, gfp_t g)   */  static int readpage_nounlock(struct file *filp, struct page *page)  { -	struct inode *inode = filp->f_dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_osd_client *osdc =  +	struct ceph_osd_client *osdc =  		&ceph_inode_to_client(inode)->client->osdc;  	int err = 0;  	u64 len = PAGE_CACHE_SIZE; +	err = ceph_readpage_from_fscache(inode, page); + +	if (err == 0) +		goto out; +  	dout("readpage inode %p file %p page %p index %lu\n",  	     inode, filp, page, page->index);  	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, -				  page->index << PAGE_CACHE_SHIFT, &len, +				  (u64) page_offset(page), &len,  				  ci->i_truncate_seq, ci->i_truncate_size, -				  &page, 1); +				  &page, 1, 0);  	if (err == -ENOENT)  		err = 0;  	if (err < 0) {  		SetPageError(page); +		ceph_fscache_readpage_cancel(inode, page);  		goto out; -	} else if (err < PAGE_CACHE_SIZE) { +	} +	if (err < PAGE_CACHE_SIZE)  		/* zero fill remainder of page */  		zero_user_segment(page, err, PAGE_CACHE_SIZE); -	} +	else +		flush_dcache_page(page); +  	SetPageUptodate(page); +	ceph_readpage_to_fscache(inode, page);  out:  	return err < 0 ? err : 0; @@ -228,102 +233,180 @@ static int ceph_readpage(struct file *filp, struct page *page)  }  /* - * Build a vector of contiguous pages from the provided page list. + * Finish an async read(ahead) op.   
*/ -static struct page **page_vector_from_list(struct list_head *page_list, -					   unsigned *nr_pages) +static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)  { -	struct page **pages; -	struct page *page; -	int next_index, contig_pages = 0; +	struct inode *inode = req->r_inode; +	struct ceph_osd_data *osd_data; +	int rc = req->r_result; +	int bytes = le32_to_cpu(msg->hdr.data_len); +	int num_pages; +	int i; -	/* build page vector */ -	pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); -	if (!pages) -		return ERR_PTR(-ENOMEM); +	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); -	BUG_ON(list_empty(page_list)); -	next_index = list_entry(page_list->prev, struct page, lru)->index; -	list_for_each_entry_reverse(page, page_list, lru) { -		if (page->index == next_index) { -			dout("readpages page %d %p\n", contig_pages, page); -			pages[contig_pages] = page; -			contig_pages++; -			next_index++; -		} else { -			break; +	/* unlock all pages, zeroing any data we didn't read */ +	osd_data = osd_req_op_extent_osd_data(req, 0); +	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); +	num_pages = calc_pages_for((u64)osd_data->alignment, +					(u64)osd_data->length); +	for (i = 0; i < num_pages; i++) { +		struct page *page = osd_data->pages[i]; + +		if (rc < 0) +			goto unlock; +		if (bytes < (int)PAGE_CACHE_SIZE) { +			/* zero (remainder of) page */ +			int s = bytes < 0 ? 
0 : bytes; +			zero_user_segment(page, s, PAGE_CACHE_SIZE);  		} + 		dout("finish_read %p uptodate %p idx %lu\n", inode, page, +		     page->index); +		flush_dcache_page(page); +		SetPageUptodate(page); +		ceph_readpage_to_fscache(inode, page); +unlock: +		unlock_page(page); +		page_cache_release(page); +		bytes -= PAGE_CACHE_SIZE;  	} -	*nr_pages = contig_pages; -	return pages; +	kfree(osd_data->pages); +} + +static void ceph_unlock_page_vector(struct page **pages, int num_pages) +{ +	int i; + +	for (i = 0; i < num_pages; i++) +		unlock_page(pages[i]);  }  /* - * Read multiple pages.  Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. + * start an async read(ahead) operation.  return nr_pages we submitted + * a read for on success, or negative error code.   */ -static int ceph_readpages(struct file *file, struct address_space *mapping, -			  struct list_head *page_list, unsigned nr_pages) +static int start_read(struct inode *inode, struct list_head *page_list, int max)  { -	struct inode *inode = file->f_dentry->d_inode; -	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_osd_client *osdc =  		&ceph_inode_to_client(inode)->client->osdc; -	int rc = 0; -	struct page **pages; -	loff_t offset; +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct page *page = list_entry(page_list->prev, struct page, lru); +	struct ceph_vino vino; +	struct ceph_osd_request *req; +	u64 off;  	u64 len; +	int i; +	struct page **pages; +	pgoff_t next_index; +	int nr_pages = 0; +	int ret; -	dout("readpages %p file %p nr_pages %d\n", -	     inode, file, nr_pages); - -	pages = page_vector_from_list(page_list, &nr_pages); -	if (IS_ERR(pages)) -		return PTR_ERR(pages); +	off = (u64) page_offset(page); -	/* guess read extent */ -	offset = pages[0]->index << PAGE_CACHE_SHIFT; +	/* count pages */ +	next_index = page->index; +	list_for_each_entry_reverse(page, page_list, lru) { +		if (page->index != next_index) +			break; +		nr_pages++; +		
next_index++; +		if (max && nr_pages == max) +			break; +	}  	len = nr_pages << PAGE_CACHE_SHIFT; -	rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, -				 offset, &len, -				 ci->i_truncate_seq, ci->i_truncate_size, -				 pages, nr_pages); -	if (rc == -ENOENT) -		rc = 0; -	if (rc < 0) -		goto out; - -	for (; !list_empty(page_list) && len > 0; -	     rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { -		struct page *page = -			list_entry(page_list->prev, struct page, lru); +	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, +	     off, len); +	vino = ceph_vino(inode); +	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, +				    1, CEPH_OSD_OP_READ, +				    CEPH_OSD_FLAG_READ, NULL, +				    ci->i_truncate_seq, ci->i_truncate_size, +				    false); +	if (IS_ERR(req)) +		return PTR_ERR(req); +	/* build page vector */ +	nr_pages = calc_pages_for(0, len); +	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); +	ret = -ENOMEM; +	if (!pages) +		goto out; +	for (i = 0; i < nr_pages; ++i) { +		page = list_entry(page_list->prev, struct page, lru); +		BUG_ON(PageLocked(page));  		list_del(&page->lru); -		if (rc < (int)PAGE_CACHE_SIZE) { -			/* zero (remainder of) page */ -			int s = rc < 0 ? 
0 : rc; -			zero_user_segment(page, s, PAGE_CACHE_SIZE); -		} - -		if (add_to_page_cache_lru(page, mapping, page->index, + 		dout("start_read %p adding %p idx %lu\n", inode, page, +		     page->index); +		if (add_to_page_cache_lru(page, &inode->i_data, page->index,  					  GFP_NOFS)) { +			ceph_fscache_uncache_page(inode, page);  			page_cache_release(page); -			dout("readpages %p add_to_page_cache failed %p\n", +			dout("start_read %p add_to_page_cache failed %p\n",  			     inode, page); -			continue; +			nr_pages = i; +			goto out_pages;  		} -		dout("readpages %p adding %p idx %lu\n", inode, page, -		     page->index); -		flush_dcache_page(page); -		SetPageUptodate(page); -		unlock_page(page); -		page_cache_release(page); +		pages[i] = page;  	} -	rc = 0; +	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); +	req->r_callback = finish_read; +	req->r_inode = inode; + +	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + +	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); +	ret = ceph_osdc_start_request(osdc, req, false); +	if (ret < 0) +		goto out_pages; +	ceph_osdc_put_request(req); +	return nr_pages; +out_pages: +	ceph_unlock_page_vector(pages, nr_pages); +	ceph_release_page_vector(pages, nr_pages);  out: -	kfree(pages); +	ceph_osdc_put_request(req); +	return ret; +} + + +/* + * Read multiple pages.  Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. 
+ */ +static int ceph_readpages(struct file *file, struct address_space *mapping, +			  struct list_head *page_list, unsigned nr_pages) +{ +	struct inode *inode = file_inode(file); +	struct ceph_fs_client *fsc = ceph_inode_to_client(inode); +	int rc = 0; +	int max = 0; + +	rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, +					 &nr_pages); + +	if (rc == 0) +		goto out; + +	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) +		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) +			>> PAGE_SHIFT; + +	dout("readpages %p file %p nr_pages %d max %d\n", inode, +		file, nr_pages, +	     max); +	while (!list_empty(page_list)) { +		rc = start_read(inode, page_list, max); +		if (rc < 0) +			goto out; +		BUG_ON(rc == 0); +	} +out: +	ceph_fscache_readpages_cancel(inode, page_list); + +	dout("readpages %p file %p ret %d\n", inode, file, rc);  	return rc;  } @@ -338,7 +421,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,  	struct ceph_snap_context *snapc = NULL;  	struct ceph_cap_snap *capsnap = NULL; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {  		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,  		     capsnap->context, capsnap->dirty_pages); @@ -354,7 +437,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,  		dout(" head snapc %p has %d dirty pages\n",  		     snapc, ci->i_wrbuffer_ref_head);  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return snapc;  } @@ -370,13 +453,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)  	struct ceph_inode_info *ci;  	struct ceph_fs_client *fsc;  	struct ceph_osd_client *osdc; -	loff_t page_off = page->index << PAGE_CACHE_SHIFT; -	int len = PAGE_CACHE_SIZE; -	loff_t i_size; -	int err = 0;  	struct ceph_snap_context *snapc, *oldest; -	u64 snap_size = 0; +	loff_t page_off = page_offset(page);  	long writeback_stat; +	u64 
truncate_size, snap_size = 0; +	u32 truncate_seq; +	int err = 0, len = PAGE_CACHE_SIZE;  	dout("writepage %p idx %lu\n", page, page->index); @@ -390,7 +472,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)  	osdc = &fsc->client->osdc;  	/* verify this is a writeable snap context */ -	snapc = (void *)page->private; +	snapc = page_snap_context(page);  	if (snapc == NULL) {  		dout("writepage %p page %p not dirty?\n", inode, page);  		goto out; @@ -398,7 +480,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)  	oldest = get_oldest_context(inode, &snap_size);  	if (snapc->seq > oldest->seq) {  		dout("writepage %p page %p snapc %p not writeable - noop\n", -		     inode, page, (void *)page->private); +		     inode, page, snapc);  		/* we should only noop if called by kswapd */  		WARN_ON((current->flags & PF_MEMALLOC) == 0);  		ceph_put_snap_context(oldest); @@ -406,13 +488,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)  	}  	ceph_put_snap_context(oldest); +	spin_lock(&ci->i_ceph_lock); +	truncate_seq = ci->i_truncate_seq; +	truncate_size = ci->i_truncate_size; +	if (!snap_size) +		snap_size = i_size_read(inode); +	spin_unlock(&ci->i_ceph_lock); +  	/* is this a partial page at end of file? 
*/ -	if (snap_size) -		i_size = snap_size; -	else -		i_size = i_size_read(inode); -	if (i_size < page_off + len) -		len = i_size - page_off; +	if (page_off >= snap_size) { +		dout("%p page eof %llu\n", page, snap_size); +		goto out; +	} +	if (snap_size < page_off + len) +		len = snap_size - page_off;  	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",  	     inode, page, page->index, page_off, len, snapc); @@ -422,13 +511,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)  	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))  		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); +	ceph_readpage_to_fscache(inode, page); +  	set_page_writeback(page);  	err = ceph_osdc_writepages(osdc, ceph_vino(inode),  				   &ci->i_layout, snapc,  				   page_off, len, -				   ci->i_truncate_seq, ci->i_truncate_size, -				   &inode->i_mtime, -				   &page, 1, 0, 0, true); +				   truncate_seq, truncate_size, +				   &inode->i_mtime, &page, 1);  	if (err < 0) {  		dout("writepage setting page/mapping error %d %p\n", err, page);  		SetPageError(page); @@ -453,7 +543,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)  	int err;  	struct inode *inode = page->mapping->host;  	BUG_ON(!inode); -	igrab(inode); +	ihold(inode);  	err = writepage_nounlock(page, wbc);  	unlock_page(page);  	iput(inode); @@ -478,7 +568,6 @@ static void ceph_release_pages(struct page **pages, int num)  	pagevec_release(&pvec);  } -  /*   * async writeback completion handler.   
* @@ -489,27 +578,24 @@ static void writepages_finish(struct ceph_osd_request *req,  			      struct ceph_msg *msg)  {  	struct inode *inode = req->r_inode; -	struct ceph_osd_reply_head *replyhead; -	struct ceph_osd_op *op;  	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_osd_data *osd_data;  	unsigned wrote;  	struct page *page; +	int num_pages;  	int i;  	struct ceph_snap_context *snapc = req->r_snapc;  	struct address_space *mapping = inode->i_mapping; -	__s32 rc = -EIO; -	u64 bytes = 0; +	int rc = req->r_result; +	u64 bytes = req->r_ops[0].extent.length;  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	long writeback_stat;  	unsigned issued = ceph_caps_issued(ci); -	/* parse reply */ -	replyhead = msg->front.iov_base; -	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); -	op = (void *)(replyhead + 1); -	rc = le32_to_cpu(replyhead->result); -	bytes = le64_to_cpu(op->extent.length); - +	osd_data = osd_req_op_extent_osd_data(req, 0); +	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); +	num_pages = calc_pages_for((u64)osd_data->alignment, +					(u64)osd_data->length);  	if (rc >= 0) {  		/*  		 * Assume we wrote the pages we originally sent.  The @@ -517,7 +603,7 @@ static void writepages_finish(struct ceph_osd_request *req,  		 * raced with a truncation and was adjusted at the osd,  		 * so don't believe the reply.  		 
*/ -		wrote = req->r_num_pages; +		wrote = num_pages;  	} else {  		wrote = 0;  		mapping_set_error(mapping, rc); @@ -526,8 +612,8 @@ static void writepages_finish(struct ceph_osd_request *req,  	     inode, rc, bytes, wrote);  	/* clean all pages */ -	for (i = 0; i < req->r_num_pages; i++) { -		page = req->r_pages[i]; +	for (i = 0; i < num_pages; i++) { +		page = osd_data->pages[i];  		BUG_ON(!page);  		WARN_ON(!PageUptodate(page)); @@ -538,7 +624,7 @@ static void writepages_finish(struct ceph_osd_request *req,  			clear_bdi_congested(&fsc->backing_dev_info,  					    BLK_RW_ASYNC); -		ceph_put_snap_context((void *)page->private); +		ceph_put_snap_context(page_snap_context(page));  		page->private = 0;  		ClearPagePrivate(page);  		dout("unlocking %d %p\n", i, page); @@ -556,35 +642,18 @@ static void writepages_finish(struct ceph_osd_request *req,  		unlock_page(page);  	}  	dout("%p wrote+cleaned %d pages\n", inode, wrote); -	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); +	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); -	ceph_release_pages(req->r_pages, req->r_num_pages); -	if (req->r_pages_from_pool) -		mempool_free(req->r_pages, +	ceph_release_pages(osd_data->pages, num_pages); +	if (osd_data->pages_from_pool) +		mempool_free(osd_data->pages,  			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);  	else -		kfree(req->r_pages); +		kfree(osd_data->pages);  	ceph_osdc_put_request(req);  }  /* - * allocate a page vec, either directly, or if necessary, via a the - * mempool.  we avoid the mempool if we can because req->r_num_pages - * may be less than the maximum write size. 
- */ -static void alloc_page_vec(struct ceph_fs_client *fsc, -			   struct ceph_osd_request *req) -{ -	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, -			       GFP_NOFS); -	if (!req->r_pages) { -		req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); -		req->r_pages_from_pool = 1; -		WARN_ON(!req->r_pages); -	} -} - -/*   * initiate async writeback   */  static int ceph_writepages_start(struct address_space *mapping, @@ -592,7 +661,8 @@ static int ceph_writepages_start(struct address_space *mapping,  {  	struct inode *inode = mapping->host;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_fs_client *fsc; +	struct ceph_fs_client *fsc = ceph_inode_to_client(inode); +	struct ceph_vino vino = ceph_vino(inode);  	pgoff_t index, start, end;  	int range_whole = 0;  	int should_loop = 1; @@ -604,24 +674,24 @@ static int ceph_writepages_start(struct address_space *mapping,  	unsigned wsize = 1 << inode->i_blkbits;  	struct ceph_osd_request *req = NULL;  	int do_sync; -	u64 snap_size = 0; +	u64 truncate_size, snap_size; +	u32 truncate_seq;  	/*  	 * Include a 'sync' in the OSD request if this is a data  	 * integrity write (e.g., O_SYNC write or fsync()), or if our  	 * cap is being revoked.  	 */ -	do_sync = wbc->sync_mode == WB_SYNC_ALL; -	if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) +	if ((wbc->sync_mode == WB_SYNC_ALL) || +		ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))  		do_sync = 1;  	dout("writepages_start %p dosync=%d (mode=%s)\n",  	     inode, do_sync,  	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :  	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); -	fsc = ceph_inode_to_client(inode);  	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { -		pr_warning("writepage_start %p on forced umount\n", inode); +		pr_warn("writepage_start %p on forced umount\n", inode);  		return -EIO; /* we're in a forced umount, don't write! 
*/  	}  	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) @@ -650,6 +720,7 @@ static int ceph_writepages_start(struct address_space *mapping,  retry:  	/* find oldest snap context with dirty data */  	ceph_put_snap_context(snapc); +	snap_size = 0;  	snapc = get_oldest_context(inode, &snap_size);  	if (!snapc) {  		/* hmm, why does writepages get called when there @@ -657,8 +728,18 @@ retry:  		dout(" no snap context with dirty data?\n");  		goto out;  	} +	if (snap_size == 0) +		snap_size = i_size_read(inode);  	dout(" oldest snapc is %p seq %lld (%d snaps)\n",  	     snapc, snapc->seq, snapc->num_snaps); + +	spin_lock(&ci->i_ceph_lock); +	truncate_seq = ci->i_truncate_seq; +	truncate_size = ci->i_truncate_size; +	if (!snap_size) +		snap_size = i_size_read(inode); +	spin_unlock(&ci->i_ceph_lock); +  	if (last_snapc && snapc != last_snapc) {  		/* if we switched to a newer snapc, restart our scan at the  		 * start of the original file range. */ @@ -669,15 +750,16 @@ retry:  	last_snapc = snapc;  	while (!done && index <= end) { +		int num_ops = do_sync ? 2 : 1;  		unsigned i;  		int first;  		pgoff_t next;  		int pvec_pages, locked_pages; +		struct page **pages = NULL; +		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */  		struct page *page;  		int want;  		u64 offset, len; -		struct ceph_osd_request_head *reqhead; -		struct ceph_osd_op *op;  		long writeback_stat;  		next = 0; @@ -726,11 +808,8 @@ get_more_pages:  				dout("waiting on writeback %p\n", page);  				wait_on_page_writeback(page);  			} -			if ((snap_size && page_offset(page) > snap_size) || -			    (!snap_size && -			     page_offset(page) > i_size_read(inode))) { -				dout("%p page eof %llu\n", page, snap_size ? 
-				     snap_size : i_size_read(inode)); +			if (page_offset(page) >= snap_size) { +				dout("%p page eof %llu\n", page, snap_size);  				done = 1;  				unlock_page(page);  				break; @@ -742,7 +821,7 @@ get_more_pages:  			}  			/* only if matching snap context */ -			pgsnapc = (void *)page->private; +			pgsnapc = page_snap_context(page);  			if (pgsnapc->seq > snapc->seq) {  				dout("page snapc %p %lld > oldest %p %lld\n",  				     pgsnapc, pgsnapc->seq, snapc, snapc->seq); @@ -758,28 +837,42 @@ get_more_pages:  				break;  			} -			/* ok */ +			/* +			 * We have something to write.  If this is +			 * the first locked page this time through, +			 * allocate an osd request and a page array +			 * that it will use. +			 */  			if (locked_pages == 0) { +				BUG_ON(pages);  				/* prepare async write request */ -				offset = (unsigned long long)page->index -					<< PAGE_CACHE_SHIFT; +				offset = (u64)page_offset(page);  				len = wsize;  				req = ceph_osdc_new_request(&fsc->client->osdc, -					    &ci->i_layout, -					    ceph_vino(inode), -					    offset, &len, -					    CEPH_OSD_OP_WRITE, -					    CEPH_OSD_FLAG_WRITE | -						    CEPH_OSD_FLAG_ONDISK, -					    snapc, do_sync, -					    ci->i_truncate_seq, -					    ci->i_truncate_size, -					    &inode->i_mtime, true, 1); -				max_pages = req->r_num_pages; - -				alloc_page_vec(fsc, req); +							&ci->i_layout, vino, +							offset, &len, num_ops, +							CEPH_OSD_OP_WRITE, +							CEPH_OSD_FLAG_WRITE | +							CEPH_OSD_FLAG_ONDISK, +							snapc, truncate_seq, +							truncate_size, true); +				if (IS_ERR(req)) { +					rc = PTR_ERR(req); +					unlock_page(page); +					break; +				} +  				req->r_callback = writepages_finish;  				req->r_inode = inode; + +				max_pages = calc_pages_for(0, (u64)len); +				pages = kmalloc(max_pages * sizeof (*pages), +						GFP_NOFS); +				if (!pages) { +					pool = fsc->wb_pagevec_pool; +					pages = mempool_alloc(pool, GFP_NOFS); +					BUG_ON(!pages); +				}  			}  			/* 
note position of first page in pvec */ @@ -797,7 +890,7 @@ get_more_pages:  			}  			set_page_writeback(page); -			req->r_pages[locked_pages] = page; +			pages[locked_pages] = page;  			locked_pages++;  			next = page->index + 1;  		} @@ -826,22 +919,30 @@ get_more_pages:  			pvec.nr -= i-first;  		} -		/* submit the write */ -		offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; -		len = min((snap_size ? snap_size : i_size_read(inode)) - offset, +		/* Format the osd request message and submit the write */ + +		offset = page_offset(pages[0]); +		len = min(snap_size - offset,  			  (u64)locked_pages << PAGE_CACHE_SHIFT);  		dout("writepages got %d pages at %llu~%llu\n",  		     locked_pages, offset, len); -		/* revise final length, page count */ -		req->r_num_pages = locked_pages; -		reqhead = req->r_request->front.iov_base; -		op = (void *)(reqhead + 1); -		op->extent.length = cpu_to_le64(len); -		op->payload_len = cpu_to_le32(len); -		req->r_request->hdr.data_len = cpu_to_le32(len); +		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, +							!!pool, false); + +		pages = NULL;	/* request message now owns the pages array */ +		pool = NULL; + +		/* Update the write op length in case we changed it */ -		ceph_osdc_start_request(&fsc->client->osdc, req, true); +		osd_req_op_extent_update(req, 0, len); + +		vino = ceph_vino(inode); +		ceph_osdc_build_request(req, offset, snapc, vino.snap, +					&inode->i_mtime); + +		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); +		BUG_ON(rc);  		req = NULL;  		/* continue? 
*/ @@ -873,8 +974,6 @@ release_pvec_pages:  out:  	if (req)  		ceph_osdc_put_request(req); -	if (rc > 0) -		rc = 0;  /* vfs expects us to return 0 */  	ceph_put_snap_context(snapc);  	dout("writepages done, rc = %d\n", rc);  	return rc; @@ -907,7 +1006,7 @@ static int ceph_update_writeable_page(struct file *file,  			    loff_t pos, unsigned len,  			    struct page *page)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;  	loff_t page_off = pos & PAGE_CACHE_MASK; @@ -925,7 +1024,7 @@ retry_locked:  	BUG_ON(!ci->i_snap_realm);  	down_read(&mdsc->snap_rwsem);  	BUG_ON(!ci->i_snap_realm->cached_context); -	snapc = (void *)page->private; +	snapc = page_snap_context(page);  	if (snapc && snapc != ci->i_head_snapc) {  		/*  		 * this page is already dirty in another (older) snap @@ -1016,7 +1115,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,  			    loff_t pos, unsigned len, unsigned flags,  			    struct page **pagep, void **fsdata)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct page *page;  	pgoff_t index = pos >> PAGE_CACHE_SHIFT;  	int r; @@ -1046,7 +1145,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,  			  loff_t pos, unsigned len, unsigned copied,  			  struct page *page, void *fsdata)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	struct ceph_mds_client *mdsc = fsc->mdsc;  	unsigned from = pos & (PAGE_CACHE_SIZE - 1); @@ -1085,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,   * never get called.   
*/  static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, -			      const struct iovec *iov, -			      loff_t pos, unsigned long nr_segs) +			      struct iov_iter *iter, +			      loff_t pos)  {  	WARN_ON(1);  	return -EINVAL; @@ -1109,27 +1208,83 @@ const struct address_space_operations ceph_aops = {  /*   * vm ops   */ +static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +	struct inode *inode = file_inode(vma->vm_file); +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_file_info *fi = vma->vm_file->private_data; +	loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; +	int want, got, ret; + +	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", +	     inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); +	if (fi->fmode & CEPH_FILE_MODE_LAZY) +		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; +	else +		want = CEPH_CAP_FILE_CACHE; +	while (1) { +		got = 0; +		ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); +		if (ret == 0) +			break; +		if (ret != -ERESTARTSYS) { +			WARN_ON(1); +			return VM_FAULT_SIGBUS; +		} +	} +	dout("filemap_fault %p %llu~%zd got cap refs on %s\n", +	     inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); + +	ret = filemap_fault(vma, vmf); + +	dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", +	     inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); +	ceph_put_cap_refs(ci, got); + +	return ret; +}  /*   * Reuse write_begin here for simplicity.   
*/  static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  { -	struct inode *inode = vma->vm_file->f_dentry->d_inode; -	struct page *page = vmf->page; +	struct inode *inode = file_inode(vma->vm_file); +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_file_info *fi = vma->vm_file->private_data;  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; -	loff_t off = page->index << PAGE_CACHE_SHIFT; -	loff_t size, len; -	int ret; +	struct page *page = vmf->page; +	loff_t off = page_offset(page); +	loff_t size = i_size_read(inode); +	size_t len; +	int want, got, ret; -	size = i_size_read(inode);  	if (off + PAGE_CACHE_SIZE <= size)  		len = PAGE_CACHE_SIZE;  	else  		len = size & ~PAGE_CACHE_MASK; -	dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, -	     off, len, page, page->index); +	dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", +	     inode, ceph_vinop(inode), off, len, size); +	if (fi->fmode & CEPH_FILE_MODE_LAZY) +		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; +	else +		want = CEPH_CAP_FILE_BUFFER; +	while (1) { +		got = 0; +		ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); +		if (ret == 0) +			break; +		if (ret != -ERESTARTSYS) { +			WARN_ON(1); +			return VM_FAULT_SIGBUS; +		} +	} +	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", +	     inode, off, len, ceph_cap_string(got)); + +	/* Update time before taking page lock */ +	file_update_time(vma->vm_file);  	lock_page(page); @@ -1151,15 +1306,28 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  			ret = VM_FAULT_SIGBUS;  	}  out: -	dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); -	if (ret != VM_FAULT_LOCKED) +	if (ret != VM_FAULT_LOCKED) {  		unlock_page(page); +	} else { +		int dirty; +		spin_lock(&ci->i_ceph_lock); +		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); +		spin_unlock(&ci->i_ceph_lock); +		if (dirty) +			__mark_inode_dirty(inode, 
dirty); +	} + +	dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", +	     inode, off, len, ceph_cap_string(got), ret); +	ceph_put_cap_refs(ci, got); +  	return ret;  }  static struct vm_operations_struct ceph_vmops = { -	.fault		= filemap_fault, +	.fault		= ceph_filemap_fault,  	.page_mkwrite	= ceph_page_mkwrite, +	.remap_pages	= generic_file_remap_pages,  };  int ceph_mmap(struct file *file, struct vm_area_struct *vma) @@ -1170,6 +1338,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)  		return -ENOEXEC;  	file_accessed(file);  	vma->vm_ops = &ceph_vmops; -	vma->vm_flags |= VM_CAN_NONLINEAR;  	return 0;  } diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c new file mode 100644 index 00000000000..834f9f3723f --- /dev/null +++ b/fs/ceph/cache.c @@ -0,0 +1,402 @@ +/* + * Ceph cache definitions. + * + *  Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + *  Written by Milosz Tanski (milosz@adfin.com) + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License version 2 + *  as published by the Free Software Foundation. + * + *  This program is distributed in the hope that it will be useful, + *  but WITHOUT ANY WARRANTY; without even the implied warranty of + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *  GNU General Public License for more details. 
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include "super.h"
+#include "cache.h"
+
+struct ceph_aux_inode {
+	struct timespec	mtime;
+	loff_t          size;
+};
+
+struct fscache_netfs ceph_cache_netfs = {
+	.name		= "ceph",
+	.version	= 0,
+};
+
+static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
+					     void *buffer, uint16_t maxbuf)
+{
+	const struct ceph_fs_client* fsc = cookie_netfs_data;
+	uint16_t klen;
+
+	klen = sizeof(fsc->client->fsid);
+	if (klen > maxbuf)
+		return 0;
+
+	memcpy(buffer, &fsc->client->fsid, klen);
+	return klen;
+}
+
+static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
+	.name		= "CEPH.fsid",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= ceph_fscache_session_get_key,
+};
+
+int ceph_fscache_register(void)
+{
+	return fscache_register_netfs(&ceph_cache_netfs);
+}
+
+void ceph_fscache_unregister(void)
+{
+	fscache_unregister_netfs(&ceph_cache_netfs);
+}
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
+{
+	fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
+					      &ceph_fscache_fsid_object_def,
+					      fsc, true);
+
+	if (fsc->fscache == NULL) {
+		pr_err("Unable to register fsid: %p fscache cookie", fsc);
+		return 0;
+	}
+
+	fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
+	if (fsc->revalidate_wq == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
+					   void *buffer, uint16_t maxbuf)
+{
+	const struct ceph_inode_info* ci = cookie_netfs_data;
+	uint16_t klen;
+
+	/* use ceph virtual inode (id + snapshot) */
+	klen = sizeof(ci->i_vino);
+	if (klen > maxbuf)
+		return 0;
+
+	memcpy(buffer, &ci->i_vino, klen);
+	return klen;
+}
+
+static uint16_t 
ceph_fscache_inode_get_aux(const void *cookie_netfs_data, +					   void *buffer, uint16_t bufmax) +{ +	struct ceph_aux_inode aux; +	const struct ceph_inode_info* ci = cookie_netfs_data; +	const struct inode* inode = &ci->vfs_inode; + +	memset(&aux, 0, sizeof(aux)); +	aux.mtime = inode->i_mtime; +	aux.size = inode->i_size; + +	memcpy(buffer, &aux, sizeof(aux)); + +	return sizeof(aux); +} + +static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, +					uint64_t *size) +{ +	const struct ceph_inode_info* ci = cookie_netfs_data; +	const struct inode* inode = &ci->vfs_inode; + +	*size = inode->i_size; +} + +static enum fscache_checkaux ceph_fscache_inode_check_aux( +	void *cookie_netfs_data, const void *data, uint16_t dlen) +{ +	struct ceph_aux_inode aux; +	struct ceph_inode_info* ci = cookie_netfs_data; +	struct inode* inode = &ci->vfs_inode; + +	if (dlen != sizeof(aux)) +		return FSCACHE_CHECKAUX_OBSOLETE; + +	memset(&aux, 0, sizeof(aux)); +	aux.mtime = inode->i_mtime; +	aux.size = inode->i_size; + +	if (memcmp(data, &aux, sizeof(aux)) != 0) +		return FSCACHE_CHECKAUX_OBSOLETE; + +	dout("ceph inode 0x%p cached okay", ci); +	return FSCACHE_CHECKAUX_OKAY; +} + +static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) +{ +	struct ceph_inode_info* ci = cookie_netfs_data; +	struct pagevec pvec; +	pgoff_t first; +	int loop, nr_pages; + +	pagevec_init(&pvec, 0); +	first = 0; + +	dout("ceph inode 0x%p now uncached", ci); + +	while (1) { +		nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, +					  PAGEVEC_SIZE - pagevec_count(&pvec)); + +		if (!nr_pages) +			break; + +		for (loop = 0; loop < nr_pages; loop++) +			ClearPageFsCache(pvec.pages[loop]); + +		first = pvec.pages[nr_pages - 1]->index + 1; + +		pvec.nr = nr_pages; +		pagevec_release(&pvec); +		cond_resched(); +	} +} + +static const struct fscache_cookie_def ceph_fscache_inode_object_def = { +	.name		= "CEPH.inode", +	.type		= FSCACHE_COOKIE_TYPE_DATAFILE, +	.get_key	= 
ceph_fscache_inode_get_key,
+	.get_attr	= ceph_fscache_inode_get_attr,
+	.get_aux	= ceph_fscache_inode_get_aux,
+	.check_aux	= ceph_fscache_inode_check_aux,
+	.now_uncached	= ceph_fscache_inode_now_uncached,
+};
+
+void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
+					struct ceph_inode_info* ci)
+{
+	struct inode* inode = &ci->vfs_inode;
+
+	/* No caching for filesystem */
+	if (fsc->fscache == NULL)
+		return;
+
+	/* Only cache for regular files that are read only */
+	if ((ci->vfs_inode.i_mode & S_IFREG) == 0)
+		return;
+
+	/* Avoid multiple racing open requests */
+	mutex_lock(&inode->i_mutex);
+
+	if (ci->fscache)
+		goto done;
+
+	ci->fscache = fscache_acquire_cookie(fsc->fscache,
+					     &ceph_fscache_inode_object_def,
+					     ci, true);
+	fscache_check_consistency(ci->fscache);
+done:
+	mutex_unlock(&inode->i_mutex);
+
+}
+
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+	struct fscache_cookie* cookie;
+
+	if ((cookie = ci->fscache) == NULL)
+		return;
+
+	ci->fscache = NULL;
+
+	fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode);
+	fscache_relinquish_cookie(cookie, 0);
+}
+
+static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
+{
+	if (!error)
+		SetPageUptodate(page);
+}
+
+static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
+{
+	if (!error)
+		SetPageUptodate(page);
+
+	unlock_page(page);
+}
+
+static inline int cache_valid(struct ceph_inode_info *ci)
+{
+	return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
+		(ci->i_fscache_gen == ci->i_rdcache_gen));
+}
+
+
+/* Attempt to read from the fscache,
+ *
+ * This function is called from the readpage_nounlock context. DO NOT attempt to
+ * unlock the page here (or in the callback).
+ */ +int ceph_readpage_from_fscache(struct inode *inode, struct page *page) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	int ret; + +	if (!cache_valid(ci)) +		return -ENOBUFS; + +	ret = fscache_read_or_alloc_page(ci->fscache, page, +					 ceph_vfs_readpage_complete, NULL, +					 GFP_KERNEL); + +	switch (ret) { +		case 0: /* Page found */ +			dout("page read submitted\n"); +			return 0; +		case -ENOBUFS: /* Pages were not found, and can't be */ +		case -ENODATA: /* Pages were not found */ +			dout("page/inode not in cache\n"); +			return ret; +		default: +			dout("%s: unknown error ret = %i\n", __func__, ret); +			return ret; +	} +} + +int ceph_readpages_from_fscache(struct inode *inode, +				  struct address_space *mapping, +				  struct list_head *pages, +				  unsigned *nr_pages) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	int ret; + +	if (!cache_valid(ci)) +		return -ENOBUFS; + +	ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, +					  ceph_vfs_readpage_complete_unlock, +					  NULL, mapping_gfp_mask(mapping)); + +	switch (ret) { +		case 0: /* All pages found */ +			dout("all-page read submitted\n"); +			return 0; +		case -ENOBUFS: /* Some pages were not found, and can't be */ +		case -ENODATA: /* some pages were not found */ +			dout("page/inode not in cache\n"); +			return ret; +		default: +			dout("%s: unknown error ret = %i\n", __func__, ret); +			return ret; +	} +} + +void ceph_readpage_to_fscache(struct inode *inode, struct page *page) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	int ret; + +	if (!PageFsCache(page)) +		return; + +	if (!cache_valid(ci)) +		return; + +	ret = fscache_write_page(ci->fscache, page, GFP_KERNEL); +	if (ret) +		 fscache_uncache_page(ci->fscache, page); +} + +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); + +	if (!PageFsCache(page)) +		return; + +	fscache_wait_on_page_write(ci->fscache, page); +	
fscache_uncache_page(ci->fscache, page);
+}
+
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+	if (fsc->revalidate_wq)
+		destroy_workqueue(fsc->revalidate_wq);
+
+	fscache_relinquish_cookie(fsc->fscache, 0);
+	fsc->fscache = NULL;
+}
+
+static void ceph_revalidate_work(struct work_struct *work)
+{
+	int issued;
+	u32 orig_gen;
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_revalidate_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	spin_lock(&ci->i_ceph_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	orig_gen = ci->i_rdcache_gen;
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (!(issued & CEPH_CAP_FILE_CACHE)) {
+		dout("revalidate_work lost cache before validation %p\n",
+		     inode);
+		goto out;
+	}
+
+	if (!fscache_check_consistency(ci->fscache))
+		fscache_invalidate(ci->fscache);
+
+	spin_lock(&ci->i_ceph_lock);
+	/* Update the new valid generation (backwards sanity check too) */
+	if (orig_gen > ci->i_fscache_gen) {
+		ci->i_fscache_gen = orig_gen;
+	}
+	spin_unlock(&ci->i_ceph_lock);
+
+out:
+	iput(&ci->vfs_inode);
+}
+
+void ceph_queue_revalidate(struct inode *inode)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	if (fsc->revalidate_wq == NULL || ci->fscache == NULL)
+		return;
+
+	ihold(inode);
+
+	if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
+		       &ci->i_revalidate_work)) {
+		dout("ceph_queue_revalidate %p\n", inode);
+	} else {
+		dout("ceph_queue_revalidate %p failed\n", inode);
+		iput(inode);
+	}
+}
+
+void ceph_fscache_inode_init(struct ceph_inode_info *ci)
+{
+	ci->fscache = NULL;
+	/* The first load is verified at cookie open time */
+	ci->i_fscache_gen = 1;
+	INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);
+}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
new file mode 100644
index 00000000000..5ac591bd012
--- /dev/null
+++ b/fs/ceph/cache.h
@@ -0,0 +1,182 @@
+/*
+ * Ceph cache definitions.
+ * + *  Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + *  Written by Milosz Tanski (milosz@adfin.com) + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License version 2 + *  as published by the Free Software Foundation. + * + *  This program is distributed in the hope that it will be useful, + *  but WITHOUT ANY WARRANTY; without even the implied warranty of + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *  GNU General Public License for more details. + * + *  You should have received a copy of the GNU General Public License + *  along with this program; if not, write to: + *  Free Software Foundation + *  51 Franklin Street, Fifth Floor + *  Boston, MA  02111-1301  USA + * + */ + +#ifndef _CEPH_CACHE_H +#define _CEPH_CACHE_H + +#ifdef CONFIG_CEPH_FSCACHE + +extern struct fscache_netfs ceph_cache_netfs; + +int ceph_fscache_register(void); +void ceph_fscache_unregister(void); + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); + +void ceph_fscache_inode_init(struct ceph_inode_info *ci); +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, +					struct ceph_inode_info* ci); +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); + +int ceph_readpage_from_fscache(struct inode *inode, struct page *page); +int ceph_readpages_from_fscache(struct inode *inode, +				struct address_space *mapping, +				struct list_head *pages, +				unsigned *nr_pages); +void ceph_readpage_to_fscache(struct inode *inode, struct page *page); +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); +void ceph_queue_revalidate(struct inode *inode); + +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	fscache_attr_changed(ci->fscache); +} + +static inline void 
ceph_fscache_invalidate(struct inode *inode) +{ +	fscache_invalidate(ceph_inode(inode)->fscache); +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, +					     struct page *page) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	return fscache_uncache_page(ci->fscache, page); +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ +	struct inode* inode = page->mapping->host; +	struct ceph_inode_info *ci = ceph_inode(inode); +	return fscache_maybe_release_page(ci->fscache, page, gfp); +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, +						struct page *page) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) +		__fscache_uncache_page(ci->fscache, page); +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, +						 struct list_head *pages) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	return fscache_readpages_cancel(ci->fscache, pages); +} + +#else + +static inline int ceph_fscache_register(void) +{ +	return 0; +} + +static inline void ceph_fscache_unregister(void) +{ +} + +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ +	return 0; +} + +static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ +} + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ +} + +static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, +						      struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, +					     struct page *pages) +{ +} + +static inline int ceph_readpage_from_fscache(struct inode* inode, +					     struct page *page) +{ +	return -ENOBUFS; +} + +static inline int ceph_readpages_from_fscache(struct inode *inode, +					      struct address_space *mapping, +					      struct list_head *pages, +					      unsigned *nr_pages) +{ +	return -ENOBUFS; +} + +static 
inline void ceph_readpage_to_fscache(struct inode *inode, +					    struct page *page) +{ +} + +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ +} + +static inline void ceph_invalidate_fscache_page(struct inode *inode, +						struct page *page) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ +	return 1; +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, +						struct page *page) +{ +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, +						 struct list_head *pages) +{ +} + +static inline void ceph_queue_revalidate(struct inode *inode) +{ +} + +#endif + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 98ab13e2b71..1fde164b74b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -10,6 +10,7 @@  #include "super.h"  #include "mds_client.h" +#include "cache.h"  #include <linux/ceph/decode.h>  #include <linux/ceph/messenger.h> @@ -147,7 +148,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)  	spin_unlock(&mdsc->caps_list_lock);  } -int ceph_reserve_caps(struct ceph_mds_client *mdsc, +void ceph_reserve_caps(struct ceph_mds_client *mdsc,  		      struct ceph_cap_reservation *ctx, int need)  {  	int i; @@ -155,7 +156,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,  	int have;  	int alloc = 0;  	LIST_HEAD(newcaps); -	int ret = 0;  	dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -174,14 +174,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,  	for (i = have; i < need; i++) {  		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); -		if (!cap) { -			ret = -ENOMEM; -			goto out_alloc_count; -		} +		if (!cap) +			break;  		list_add(&cap->caps_item, &newcaps);  		alloc++;  	} -	BUG_ON(have + alloc != need); +	/* we didn't manage to reserve as much as we 
needed */ +	if (have + alloc != need) +		pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", +			ctx, need, have + alloc);  	spin_lock(&mdsc->caps_list_lock);  	mdsc->caps_total_count += alloc; @@ -197,13 +198,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,  	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",  	     ctx, mdsc->caps_total_count, mdsc->caps_use_count,  	     mdsc->caps_reserve_count, mdsc->caps_avail_count); -	return 0; - -out_alloc_count: -	/* we didn't manage to reserve as much as we needed */ -	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", -		   ctx, need, have); -	return ret;  }  int ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -227,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc,  	return 0;  } -static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, -				struct ceph_cap_reservation *ctx) +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, +			      struct ceph_cap_reservation *ctx)  {  	struct ceph_cap *cap = NULL; @@ -236,8 +230,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,  	if (!ctx) {  		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);  		if (cap) { +			spin_lock(&mdsc->caps_list_lock);  			mdsc->caps_use_count++;  			mdsc->caps_total_count++; +			spin_unlock(&mdsc->caps_list_lock);  		}  		return cap;  	} @@ -309,7 +305,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,  /*   * Find ceph_cap for given mds, if any.   * - * Called with i_lock held. + * Called with i_ceph_lock held.   
*/  static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)  { @@ -332,9 +328,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)  {  	struct ceph_cap *cap; -	spin_lock(&ci->vfs_inode.i_lock); +	spin_lock(&ci->i_ceph_lock);  	cap = __get_cap_for_mds(ci, mds); -	spin_unlock(&ci->vfs_inode.i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return cap;  } @@ -361,15 +357,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci)  int ceph_get_cap_mds(struct inode *inode)  { +	struct ceph_inode_info *ci = ceph_inode(inode);  	int mds; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	mds = __ceph_get_cap_mds(ceph_inode(inode)); -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return mds;  }  /* - * Called under i_lock. + * Called under i_ceph_lock.   */  static void __insert_cap_node(struct ceph_inode_info *ci,  			      struct ceph_cap *new) @@ -415,7 +412,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,   *   * If I_FLUSH is set, leave the inode at the front of the list.   * - * Caller holds i_lock + * Caller holds i_ceph_lock   *    -> we take mdsc->cap_delay_lock   */  static void __cap_delay_requeue(struct ceph_mds_client *mdsc, @@ -457,7 +454,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,  /*   * Cancel delayed work on cap.   * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock.   */  static void __cap_delay_cancel(struct ceph_mds_client *mdsc,  			       struct ceph_inode_info *ci) @@ -483,11 +480,12 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,  	 * i_rdcache_gen.  	 
*/  	if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && -	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) +	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {  		ci->i_rdcache_gen++; +	}  	/* -	 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we +	 * if we are newly issued FILE_SHARED, mark dir not complete; we  	 * don't know what happened to this directory while we didn't  	 * have the cap.  	 */ @@ -496,7 +494,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,  		ci->i_shared_gen++;  		if (S_ISDIR(ci->vfs_inode.i_mode)) {  			dout(" marking %p NOT complete\n", &ci->vfs_inode); -			ci->i_ceph_flags &= ~CEPH_I_COMPLETE; +			__ceph_dir_clear_complete(ci);  		}  	}  } @@ -510,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,   * it is < 0.  (This is so we can atomically add the cap and add an   * open file reference to it.)   */ -int ceph_add_cap(struct inode *inode, -		 struct ceph_mds_session *session, u64 cap_id, -		 int fmode, unsigned issued, unsigned wanted, -		 unsigned seq, unsigned mseq, u64 realmino, int flags, -		 struct ceph_cap_reservation *caps_reservation) +void ceph_add_cap(struct inode *inode, +		  struct ceph_mds_session *session, u64 cap_id, +		  int fmode, unsigned issued, unsigned wanted, +		  unsigned seq, unsigned mseq, u64 realmino, int flags, +		  struct ceph_cap **new_cap)  {  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_cap *new_cap = NULL;  	struct ceph_cap *cap;  	int mds = session->s_mds;  	int actual_wanted; @@ -533,42 +530,44 @@ int ceph_add_cap(struct inode *inode,  	if (fmode >= 0)  		wanted |= ceph_caps_for_mode(fmode); -retry: -	spin_lock(&inode->i_lock);  	cap = __get_cap_for_mds(ci, mds);  	if (!cap) { -		if (new_cap) { -			cap = new_cap; -			new_cap = NULL; -		} else { -			spin_unlock(&inode->i_lock); -			new_cap = 
get_cap(mdsc, caps_reservation); -			if (new_cap == NULL) -				return -ENOMEM; -			goto retry; -		} +		cap = *new_cap; +		*new_cap = NULL;  		cap->issued = 0;  		cap->implemented = 0;  		cap->mds = mds;  		cap->mds_wanted = 0; +		cap->mseq = 0;  		cap->ci = ci;  		__insert_cap_node(ci, cap); -		/* clear out old exporting info?  (i.e. on cap import) */ -		if (ci->i_cap_exporting_mds == mds) { -			ci->i_cap_exporting_issued = 0; -			ci->i_cap_exporting_mseq = 0; -			ci->i_cap_exporting_mds = -1; -		} -  		/* add to session cap list */  		cap->session = session;  		spin_lock(&session->s_cap_lock);  		list_add_tail(&cap->session_caps, &session->s_caps);  		session->s_nr_caps++;  		spin_unlock(&session->s_cap_lock); +	} else { +		/* +		 * auth mds of the inode changed. we received the cap export +		 * message, but still haven't received the cap import message. +		 * handle_cap_export() updated the new auth MDS' cap. +		 * +		 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing +		 * a message that was send before the cap import message. So +		 * don't remove caps. 
+		 */ +		if (ceph_seq_cmp(seq, cap->seq) <= 0) { +			WARN_ON(cap != ci->i_auth_cap); +			WARN_ON(cap->cap_id != cap_id); +			seq = cap->seq; +			mseq = cap->mseq; +			issued |= cap->issued; +			flags |= CEPH_CAP_FLAG_AUTH; +		}  	}  	if (!ci->i_snap_realm) { @@ -607,10 +606,15 @@ retry:  		__cap_delay_requeue(mdsc, ci);  	} -	if (flags & CEPH_CAP_FLAG_AUTH) -		ci->i_auth_cap = cap; -	else if (ci->i_auth_cap == cap) -		ci->i_auth_cap = NULL; +	if (flags & CEPH_CAP_FLAG_AUTH) { +		if (ci->i_auth_cap == NULL || +		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { +			ci->i_auth_cap = cap; +			cap->mds_wanted = wanted; +		} +	} else { +		WARN_ON(ci->i_auth_cap == cap); +	}  	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",  	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued), @@ -618,7 +622,10 @@ retry:  	cap->cap_id = cap_id;  	cap->issued = issued;  	cap->implemented |= issued; -	cap->mds_wanted |= wanted; +	if (ceph_seq_cmp(mseq, cap->mseq) > 0) +		cap->mds_wanted = wanted; +	else +		cap->mds_wanted |= wanted;  	cap->seq = seq;  	cap->issue_seq = seq;  	cap->mseq = mseq; @@ -626,9 +633,6 @@ retry:  	if (fmode >= 0)  		__ceph_get_fmode(ci, fmode); -	spin_unlock(&inode->i_lock); -	wake_up_all(&ci->i_cap_wq); -	return 0;  }  /* @@ -641,10 +645,10 @@ static int __cap_is_valid(struct ceph_cap *cap)  	unsigned long ttl;  	u32 gen; -	spin_lock(&cap->session->s_cap_lock); +	spin_lock(&cap->session->s_gen_ttl_lock);  	gen = cap->session->s_cap_gen;  	ttl = cap->session->s_cap_ttl; -	spin_unlock(&cap->session->s_cap_lock); +	spin_unlock(&cap->session->s_gen_ttl_lock);  	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {  		dout("__cap_is_valid %p cap %p issued %s " @@ -663,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap)   */  int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)  { -	int have = ci->i_snap_caps | ci->i_cap_exporting_issued; +	int have = ci->i_snap_caps;  	struct ceph_cap *cap;  	struct rb_node 
*p;
@@ -679,6 +683,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
 		if (implemented)
 			*implemented |= cap->implemented;
 	}
+	/*
+	 * exclude caps issued by non-auth MDS, but are being revoked
+	 * by the auth MDS. The non-auth MDS should be revoking/exporting
+	 * these caps, but the message is delayed.
+	 */
+	if (ci->i_auth_cap) {
+		cap = ci->i_auth_cap;
+		have &= ~cap->implemented | cap->issued;
+	}
 	return have;
 }
@@ -765,7 +778,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 			if (touch) {
 				struct rb_node *q;
-				/* touch this + preceeding caps */
+				/* touch this + preceding caps */
 				__touch_cap(cap);
 				for (q = rb_first(&ci->i_caps); q != p;
 				     q = rb_next(q)) {
@@ -786,23 +799,29 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 /*
  * Return true if mask caps are currently being revoked by an MDS.
  */
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+			       struct ceph_cap *ocap, int mask)
 {
-	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
 	struct rb_node *p;
-	int ret = 0;
-	spin_lock(&inode->i_lock);
 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 		cap = rb_entry(p, struct ceph_cap, ci_node);
-		if (__cap_is_valid(cap) &&
-		    (cap->implemented & ~cap->issued & mask)) {
-			ret = 1;
-			break;
-		}
+		if (cap != ocap &&
+		    (cap->implemented & ~cap->issued & mask))
+			return 1;
 	}
-	spin_unlock(&inode->i_lock);
+	return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int ret;
+
+	spin_lock(&ci->i_ceph_lock);
+	ret = __ceph_caps_revoking_other(ci, NULL, mask);
+	spin_unlock(&ci->i_ceph_lock);
 	dout("ceph_caps_revoking %p %s = %d\n", inode,
 	     ceph_cap_string(mask), ret);
 	return ret;
@@ -819,7 +838,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
 	
	used |= CEPH_CAP_FILE_CACHE;  	if (ci->i_wr_ref)  		used |= CEPH_CAP_FILE_WR; -	if (ci->i_wrbuffer_ref) +	if (ci->i_wb_ref || ci->i_wrbuffer_ref)  		used |= CEPH_CAP_FILE_BUFFER;  	return used;  } @@ -850,26 +869,41 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)  		cap = rb_entry(p, struct ceph_cap, ci_node);  		if (!__cap_is_valid(cap))  			continue; -		mds_wanted |= cap->mds_wanted; +		if (cap == ci->i_auth_cap) +			mds_wanted |= cap->mds_wanted; +		else +			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);  	}  	return mds_wanted;  }  /* - * called under i_lock + * called under i_ceph_lock   */  static int __ceph_is_any_caps(struct ceph_inode_info *ci)  { -	return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; +	return !RB_EMPTY_ROOT(&ci->i_caps); +} + +int ceph_is_any_caps(struct inode *inode) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	int ret; + +	spin_lock(&ci->i_ceph_lock); +	ret = __ceph_is_any_caps(ci); +	spin_unlock(&ci->i_ceph_lock); + +	return ret;  }  /*   * Remove a cap.  Take steps to deal with a racing iterate_session_caps.   * - * caller should hold i_lock. + * caller should hold i_ceph_lock.   * caller will not hold session s_mutex if called from destroy_inode.   */ -void __ceph_remove_cap(struct ceph_cap *cap) +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)  {  	struct ceph_mds_session *session = cap->session;  	struct ceph_inode_info *ci = cap->ci; @@ -881,6 +915,16 @@ void __ceph_remove_cap(struct ceph_cap *cap)  	/* remove from session list */  	spin_lock(&session->s_cap_lock); +	/* +	 * s_cap_reconnect is protected by s_cap_lock. no one changes +	 * s_cap_gen while session is in the reconnect state. 
+	 */ +	if (queue_release && +	    (!session->s_cap_reconnect || +	     cap->cap_gen == session->s_cap_gen)) +		__queue_cap_release(session, ci->i_vino.ino, cap->cap_id, +				    cap->mseq, cap->issue_seq); +  	if (session->s_cap_iterator == cap) {  		/* not yet, we are iterating over this very cap */  		dout("__ceph_remove_cap  delaying %p removal from session %p\n", @@ -928,7 +972,7 @@ static int send_cap_msg(struct ceph_mds_session *session,  			u64 size, u64 max_size,  			struct timespec *mtime, struct timespec *atime,  			u64 time_warp_seq, -			uid_t uid, gid_t gid, mode_t mode, +			kuid_t uid, kgid_t gid, umode_t mode,  			u64 xattr_version,  			struct ceph_buffer *xattrs_buf,  			u64 follows) @@ -944,7 +988,7 @@ static int send_cap_msg(struct ceph_mds_session *session,  	     seq, issue_seq, mseq, follows, size, max_size,  	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); -	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); +	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);  	if (!msg)  		return -ENOMEM; @@ -972,8 +1016,8 @@ static int send_cap_msg(struct ceph_mds_session *session,  		ceph_encode_timespec(&fc->atime, atime);  	fc->time_warp_seq = cpu_to_le32(time_warp_seq); -	fc->uid = cpu_to_le32(uid); -	fc->gid = cpu_to_le32(gid); +	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); +	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));  	fc->mode = cpu_to_le32(mode);  	fc->xattr_version = cpu_to_le64(xattr_version); @@ -987,15 +1031,14 @@ static int send_cap_msg(struct ceph_mds_session *session,  	return 0;  } -static void __queue_cap_release(struct ceph_mds_session *session, -				u64 ino, u64 cap_id, u32 migrate_seq, -				u32 issue_seq) +void __queue_cap_release(struct ceph_mds_session *session, +			 u64 ino, u64 cap_id, u32 migrate_seq, +			 u32 issue_seq)  {  	struct ceph_msg *msg;  	struct ceph_mds_cap_release *head;  	struct ceph_mds_cap_item *item; -	spin_lock(&session->s_cap_lock);  	
BUG_ON(!session->s_num_cap_releases);  	msg = list_first_entry(&session->s_cap_releases,  			       struct ceph_msg, list_head); @@ -1005,7 +1048,7 @@ static void __queue_cap_release(struct ceph_mds_session *session,  	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);  	head = msg->front.iov_base; -	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); +	le32_add_cpu(&head->num, 1);  	item = msg->front.iov_base + msg->front.iov_len;  	item->ino = cpu_to_le64(ino);  	item->cap_id = cpu_to_le64(cap_id); @@ -1024,12 +1067,11 @@ static void __queue_cap_release(struct ceph_mds_session *session,  		     (int)CEPH_CAPS_PER_RELEASE,  		     (int)msg->front.iov_len);  	} -	spin_unlock(&session->s_cap_lock);  }  /*   * Queue cap releases when an inode is dropped from our cache.  Since - * inode is about to be destroyed, there is no need for i_lock. + * inode is about to be destroyed, there is no need for i_ceph_lock.   */  void ceph_queue_caps_release(struct inode *inode)  { @@ -1039,18 +1081,14 @@ void ceph_queue_caps_release(struct inode *inode)  	p = rb_first(&ci->i_caps);  	while (p) {  		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); -		struct ceph_mds_session *session = cap->session; - -		__queue_cap_release(session, ceph_ino(inode), cap->cap_id, -				    cap->mseq, cap->issue_seq);  		p = rb_next(p); -		__ceph_remove_cap(cap); +		__ceph_remove_cap(cap, true);  	}  }  /*   * Send a cap msg on the given inode.  Update our caps state, then - * drop i_lock and send the message. + * drop i_ceph_lock and send the message.   *   * Make note of max_size reported/requested from mds, revoked caps   * that have now been implemented. @@ -1062,13 +1100,13 @@ void ceph_queue_caps_release(struct inode *inode)   * Return non-zero if delayed release, or we experienced an error   * such that the caller should requeue + retry later.   * - * called with i_lock, then drops it. + * called with i_ceph_lock, then drops it.   
* caller should hold snap_rwsem (read), s_mutex.   */  static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,  		      int op, int used, int want, int retain, int flushing,  		      unsigned *pflush_tid) -	__releases(cap->ci->vfs_inode->i_lock) +	__releases(cap->ci->i_ceph_lock)  {  	struct ceph_inode_info *ci = cap->ci;  	struct inode *inode = &ci->vfs_inode; @@ -1078,9 +1116,9 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,  	u64 size, max_size;  	struct timespec mtime, atime;  	int wake = 0; -	mode_t mode; -	uid_t uid; -	gid_t gid; +	umode_t mode; +	kuid_t uid; +	kgid_t gid;  	struct ceph_mds_session *session;  	u64 xattr_version = 0;  	struct ceph_buffer *xattr_blob = NULL; @@ -1171,7 +1209,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,  		xattr_version = ci->i_xattrs.version;  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,  		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, @@ -1199,13 +1237,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,   * Unless @again is true, skip cap_snaps that were already sent to   * the MDS (i.e., during this session).   * - * Called under i_lock.  Takes s_mutex as needed. + * Called under i_ceph_lock.  Takes s_mutex as needed.   */  void __ceph_flush_snaps(struct ceph_inode_info *ci,  			struct ceph_mds_session **psession,  			int again) -		__releases(ci->vfs_inode->i_lock) -		__acquires(ci->vfs_inode->i_lock) +		__releases(ci->i_ceph_lock) +		__acquires(ci->i_ceph_lock)  {  	struct inode *inode = &ci->vfs_inode;  	int mds; @@ -1262,7 +1300,7 @@ retry:  			session = NULL;  		}  		if (!session) { -			spin_unlock(&inode->i_lock); +			spin_unlock(&ci->i_ceph_lock);  			mutex_lock(&mdsc->mutex);  			session = __ceph_lookup_mds_session(mdsc, mds);  			mutex_unlock(&mdsc->mutex); @@ -1276,7 +1314,7 @@ retry:  			 * deletion or migration.  
retry, and we'll  			 * get a better @mds value next time.  			 */ -			spin_lock(&inode->i_lock); +			spin_lock(&ci->i_ceph_lock);  			goto retry;  		} @@ -1286,7 +1324,7 @@ retry:  			list_del_init(&capsnap->flushing_item);  		list_add_tail(&capsnap->flushing_item,  			      &session->s_cap_snaps_flushing); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",  		     inode, capsnap, capsnap->follows, capsnap->flush_tid); @@ -1303,7 +1341,7 @@ retry:  		next_follows = capsnap->follows + 1;  		ceph_put_cap_snap(capsnap); -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		goto retry;  	} @@ -1323,18 +1361,17 @@ out:  static void ceph_flush_snaps(struct ceph_inode_info *ci)  { -	struct inode *inode = &ci->vfs_inode; - -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	__ceph_flush_snaps(ci, NULL, 0); -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  }  /* - * Mark caps dirty.  If inode is newly dirty, add to the global dirty - * list. + * Mark caps dirty.  If inode is newly dirty, return the dirty flags. + * Caller is then responsible for calling __mark_inode_dirty with the + * returned flags value.   
*/ -void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)  {  	struct ceph_mds_client *mdsc =  		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; @@ -1350,14 +1387,15 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)  		if (!ci->i_head_snapc)  			ci->i_head_snapc = ceph_get_snap_context(  				ci->i_snap_realm->cached_context); -		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, -			ci->i_head_snapc); +		dout(" inode %p now dirty snapc %p auth cap %p\n", +		     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); +		WARN_ON(!ci->i_auth_cap);  		BUG_ON(!list_empty(&ci->i_dirty_item));  		spin_lock(&mdsc->cap_dirty_lock);  		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);  		spin_unlock(&mdsc->cap_dirty_lock);  		if (ci->i_flushing_caps == 0) { -			igrab(inode); +			ihold(inode);  			dirty |= I_DIRTY_SYNC;  		}  	} @@ -1365,16 +1403,15 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)  	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&  	    (mask & CEPH_CAP_FILE_BUFFER))  		dirty |= I_DIRTY_DATASYNC; -	if (dirty) -		__mark_inode_dirty(inode, dirty);  	__cap_delay_requeue(mdsc, ci); +	return dirty;  }  /*   * Add dirty inode to the flushing list.  Assigned a seq number so we   * can wait for caps to flush without starving.   * - * Called under i_lock. + * Called under i_ceph_lock.   */  static int __mark_caps_flushing(struct inode *inode,  				 struct ceph_mds_session *session) @@ -1422,16 +1459,16 @@ static int try_nonblocking_invalidate(struct inode *inode)  	struct ceph_inode_info *ci = ceph_inode(inode);  	u32 invalidating_gen = ci->i_rdcache_gen; -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	invalidate_mapping_pages(&inode->i_data, 0, -1); -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	if (inode->i_data.nrpages == 0 &&  	    invalidating_gen == ci->i_rdcache_gen) {  		/* success. 
*/  		dout("try_nonblocking_invalidate %p success\n", inode); -		ci->i_rdcache_gen = 0; -		ci->i_rdcache_revoking = 0; +		/* save any racing async invalidate some trouble */ +		ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;  		return 0;  	}  	dout("try_nonblocking_invalidate %p failed\n", inode); @@ -1456,7 +1493,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,  	struct ceph_mds_client *mdsc = fsc->mdsc;  	struct inode *inode = &ci->vfs_inode;  	struct ceph_cap *cap; -	int file_wanted, used; +	int file_wanted, used, cap_used;  	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */  	int issued, implemented, want, retain, revoking, flushing = 0;  	int mds = -1;   /* keep track of how far we've gone through i_caps list @@ -1471,7 +1508,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,  	if (mdsc->stopping)  		is_delayed = 1; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	if (ci->i_ceph_flags & CEPH_I_FLUSH)  		flags |= CHECK_CAPS_FLUSH; @@ -1481,7 +1518,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,  		__ceph_flush_snaps(ci, &session, 0);  	goto retry_locked;  retry: -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  retry_locked:  	file_wanted = __ceph_caps_file_wanted(ci);  	used = __ceph_caps_used(ci); @@ -1559,10 +1596,16 @@ retry_locked:  		/* NOTE: no side-effects allowed, until we take s_mutex */ +		cap_used = used; +		if (ci->i_auth_cap && cap != ci->i_auth_cap) +			cap_used &= ~ci->i_auth_cap->issued; +  		revoking = cap->implemented & ~cap->issued; -		if (revoking) -			dout(" mds%d revoking %s\n", cap->mds, -			     ceph_cap_string(revoking)); +		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", +		     cap->mds, cap, ceph_cap_string(cap->issued), +		     ceph_cap_string(cap_used), +		     ceph_cap_string(cap->implemented), +		     ceph_cap_string(revoking));  		if (cap == ci->i_auth_cap &&  		    (cap->issued & CEPH_CAP_FILE_WR)) { @@ -1588,7 
+1631,7 @@ retry_locked:  		}  		/* completed revocation? going down and there are no caps? */ -		if (revoking && (revoking & used) == 0) { +		if (revoking && (revoking & cap_used) == 0) {  			dout("completed revocation of %s\n",  			     ceph_cap_string(cap->implemented & ~cap->issued));  			goto ack; @@ -1634,7 +1677,7 @@ ack:  			if (mutex_trylock(&session->s_mutex) == 0) {  				dout("inverting session/ino locks on %p\n",  				     session); -				spin_unlock(&inode->i_lock); +				spin_unlock(&ci->i_ceph_lock);  				if (took_snap_rwsem) {  					up_read(&mdsc->snap_rwsem);  					took_snap_rwsem = 0; @@ -1648,7 +1691,7 @@ ack:  			if (down_read_trylock(&mdsc->snap_rwsem) == 0) {  				dout("inverting snap/in locks on %p\n",  				     inode); -				spin_unlock(&inode->i_lock); +				spin_unlock(&ci->i_ceph_lock);  				down_read(&mdsc->snap_rwsem);  				took_snap_rwsem = 1;  				goto retry; @@ -1658,14 +1701,16 @@ ack:  		if (cap == ci->i_auth_cap && ci->i_dirty_caps)  			flushing = __mark_caps_flushing(inode, session); +		else +			flushing = 0;  		mds = cap->mds;  /* remember mds, so we don't repeat */  		sent++; -		/* __send_cap drops i_lock */ -		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, -				      retain, flushing, NULL); -		goto retry; /* retake i_lock and restart our cap scan. */ +		/* __send_cap drops i_ceph_lock */ +		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, +				      want, retain, flushing, NULL); +		goto retry; /* retake i_ceph_lock and restart our cap scan. */  	}  	/* @@ -1679,7 +1724,7 @@ ack:  	else if (!is_delayed || force_requeue)  		__cap_delay_requeue(mdsc, ci); -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	if (queue_invalidate)  		ceph_queue_invalidate(inode); @@ -1693,16 +1738,15 @@ ack:  /*   * Try to flush dirty caps back to the auth mds.   
*/ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, -			  unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, unsigned *flush_tid)  {  	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;  	struct ceph_inode_info *ci = ceph_inode(inode); -	int unlock_session = session ? 0 : 1;  	int flushing = 0; +	struct ceph_mds_session *session = NULL;  retry: -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {  		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);  		goto out; @@ -1713,32 +1757,33 @@ retry:  		int want = __ceph_caps_wanted(ci);  		int delayed; -		if (!session) { -			spin_unlock(&inode->i_lock); +		if (!session || session != cap->session) { +			spin_unlock(&ci->i_ceph_lock); +			if (session) +				mutex_unlock(&session->s_mutex);  			session = cap->session;  			mutex_lock(&session->s_mutex);  			goto retry;  		} -		BUG_ON(session != cap->session);  		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)  			goto out;  		flushing = __mark_caps_flushing(inode, session); -		/* __send_cap drops i_lock */ +		/* __send_cap drops i_ceph_lock */  		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,  				     cap->issued | cap->implemented, flushing,  				     flush_tid);  		if (!delayed)  			goto out_unlocked; -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		__cap_delay_requeue(mdsc, ci);  	}  out: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  out_unlocked: -	if (session && unlock_session) +	if (session)  		mutex_unlock(&session->s_mutex);  	return flushing;  } @@ -1751,7 +1796,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid)  	struct ceph_inode_info *ci = ceph_inode(inode);  	int i, ret = 1; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	for (i = 0; i < CEPH_CAP_BITS; i++)  		if ((ci->i_flushing_caps & (1 << i)) &&  		    ci->i_cap_flush_tid[i] <= tid) { @@ -1759,7 
+1804,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid)  			ret = 0;  			break;  		} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return ret;  } @@ -1807,7 +1852,7 @@ out:  	spin_unlock(&ci->i_unsafe_lock);  } -int ceph_fsync(struct file *file, int datasync) +int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)  {  	struct inode *inode = file->f_mapping->host;  	struct ceph_inode_info *ci = ceph_inode(inode); @@ -1818,11 +1863,12 @@ int ceph_fsync(struct file *file, int datasync)  	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");  	sync_write_wait(inode); -	ret = filemap_write_and_wait(inode->i_mapping); +	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);  	if (ret < 0)  		return ret; +	mutex_lock(&inode->i_mutex); -	dirty = try_flush_caps(inode, NULL, &flush_tid); +	dirty = try_flush_caps(inode, &flush_tid);  	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));  	/* @@ -1837,6 +1883,7 @@ int ceph_fsync(struct file *file, int datasync)  	}  	dout("fsync %p%s done\n", inode, datasync ? 
" datasync" : ""); +	mutex_unlock(&inode->i_mutex);  	return ret;  } @@ -1856,7 +1903,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)  	dout("write_inode %p wait=%d\n", inode, wait);  	if (wait) { -		dirty = try_flush_caps(inode, NULL, &flush_tid); +		dirty = try_flush_caps(inode, &flush_tid);  		if (dirty)  			err = wait_event_interruptible(ci->i_cap_wq,  				       caps_are_flushed(inode, flush_tid)); @@ -1864,10 +1911,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)  		struct ceph_mds_client *mdsc =  			ceph_sb_to_client(inode->i_sb)->mdsc; -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		if (__ceph_caps_dirty(ci))  			__cap_delay_requeue_front(mdsc, ci); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  	}  	return err;  } @@ -1890,7 +1937,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,  		struct inode *inode = &ci->vfs_inode;  		struct ceph_cap *cap; -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		cap = ci->i_auth_cap;  		if (cap && cap->session == session) {  			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, @@ -1900,7 +1947,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,  			pr_err("%p auth cap %p not mds%d ???\n", inode,  			       cap, session->s_mds);  		} -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  	}  } @@ -1917,7 +1964,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,  		struct ceph_cap *cap;  		int delayed = 0; -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		cap = ci->i_auth_cap;  		if (cap && cap->session == session) {  			dout("kick_flushing_caps %p cap %p %s\n", inode, @@ -1928,15 +1975,51 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,  					     cap->issued | cap->implemented,  					     ci->i_flushing_caps, NULL);  			if (delayed) { -				spin_lock(&inode->i_lock); +				spin_lock(&ci->i_ceph_lock);  				
__cap_delay_requeue(mdsc, ci); -				spin_unlock(&inode->i_lock); +				spin_unlock(&ci->i_ceph_lock);  			}  		} else {  			pr_err("%p auth cap %p not mds%d ???\n", inode,  			       cap, session->s_mds); -			spin_unlock(&inode->i_lock); +			spin_unlock(&ci->i_ceph_lock); +		} +	} +} + +static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, +				     struct ceph_mds_session *session, +				     struct inode *inode) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_cap *cap; +	int delayed = 0; + +	spin_lock(&ci->i_ceph_lock); +	cap = ci->i_auth_cap; +	dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, +	     ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + +	__ceph_flush_snaps(ci, &session, 1); + +	if (ci->i_flushing_caps) { +		spin_lock(&mdsc->cap_dirty_lock); +		list_move_tail(&ci->i_flushing_item, +			       &cap->session->s_cap_flushing); +		spin_unlock(&mdsc->cap_dirty_lock); + +		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, +				     __ceph_caps_used(ci), +				     __ceph_caps_wanted(ci), +				     cap->issued | cap->implemented, +				     ci->i_flushing_caps, NULL); +		if (delayed) { +			spin_lock(&ci->i_ceph_lock); +			__cap_delay_requeue(mdsc, ci); +			spin_unlock(&ci->i_ceph_lock);  		} +	} else { +		spin_unlock(&ci->i_ceph_lock);  	}  } @@ -1945,7 +2028,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,   * Take references to capabilities we hold, so that we don't release   * them to the MDS prematurely.   * - * Protected by i_lock. + * Protected by i_ceph_lock.   
*/  static void __take_cap_refs(struct ceph_inode_info *ci, int got)  { @@ -1958,11 +2041,11 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)  	if (got & CEPH_CAP_FILE_WR)  		ci->i_wr_ref++;  	if (got & CEPH_CAP_FILE_BUFFER) { -		if (ci->i_wrbuffer_ref == 0) -			igrab(&ci->vfs_inode); -		ci->i_wrbuffer_ref++; -		dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n", -		     &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref); +		if (ci->i_wb_ref == 0) +			ihold(&ci->vfs_inode); +		ci->i_wb_ref++; +		dout("__take_cap_refs %p wb %d -> %d (?)\n", +		     &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);  	}  } @@ -1983,7 +2066,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,  	dout("get_cap_refs %p need %s want %s\n", inode,  	     ceph_cap_string(need), ceph_cap_string(want)); -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	/* make sure file is actually open */  	file_wanted = __ceph_caps_file_wanted(ci); @@ -1995,11 +2078,20 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,  		goto out;  	} -	if (need & CEPH_CAP_FILE_WR) { +	/* finish pending truncate */ +	while (ci->i_truncate_pending) { +		spin_unlock(&ci->i_ceph_lock); +		__ceph_do_pending_vmtruncate(inode); +		spin_lock(&ci->i_ceph_lock); +	} + +	have = __ceph_caps_issued(ci, &implemented); + +	if (have & need & CEPH_CAP_FILE_WR) {  		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {  			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",  			     inode, endoff, ci->i_max_size); -			if (endoff > ci->i_wanted_max_size) { +			if (endoff > ci->i_requested_max_size) {  				*check_max = 1;  				ret = 1;  			} @@ -2014,13 +2106,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,  			goto out;  		}  	} -	have = __ceph_caps_issued(ci, &implemented); - -	/* -	 * disallow writes while a truncate is pending -	 */ -	if (ci->i_truncate_pending) -		have &= ~CEPH_CAP_FILE_WR;  	if ((have & need) == 
need) {  		/* @@ -2044,7 +2129,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,  		     ceph_cap_string(have), ceph_cap_string(need));  	}  out: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	dout("get_cap_refs %p ret %d got %s\n", inode,  	     ret, ceph_cap_string(*got));  	return ret; @@ -2061,16 +2146,19 @@ static void check_max_size(struct inode *inode, loff_t endoff)  	int check = 0;  	/* do we need to explicitly request a larger max_size? */ -	spin_lock(&inode->i_lock); -	if ((endoff >= ci->i_max_size || -	     endoff > (inode->i_size << 1)) && -	    endoff > ci->i_wanted_max_size) { +	spin_lock(&ci->i_ceph_lock); +	if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {  		dout("write %p at large endoff %llu, req max_size\n",  		     inode, endoff);  		ci->i_wanted_max_size = endoff; -		check = 1;  	} -	spin_unlock(&inode->i_lock); +	/* duplicate ceph_check_caps()'s logic */ +	if (ci->i_auth_cap && +	    (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && +	    ci->i_wanted_max_size > ci->i_max_size && +	    ci->i_wanted_max_size > ci->i_requested_max_size) +		check = 1; +	spin_unlock(&ci->i_ceph_lock);  	if (check)  		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);  } @@ -2107,9 +2195,9 @@ retry:   */  void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)  { -	spin_lock(&ci->vfs_inode.i_lock); +	spin_lock(&ci->i_ceph_lock);  	__take_cap_refs(ci, caps); -	spin_unlock(&ci->vfs_inode.i_lock); +	spin_unlock(&ci->i_ceph_lock);  }  /* @@ -2127,7 +2215,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)  	int last = 0, put = 0, flushsnaps = 0, wake = 0;  	struct ceph_cap_snap *capsnap; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	if (had & CEPH_CAP_PIN)  		--ci->i_pin_ref;  	if (had & CEPH_CAP_FILE_RD) @@ -2137,12 +2225,12 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)  		if (--ci->i_rdcache_ref == 0)  			last++;  	if (had & CEPH_CAP_FILE_BUFFER) { -		if 
(--ci->i_wrbuffer_ref == 0) { +		if (--ci->i_wb_ref == 0) {  			last++;  			put++;  		} -		dout("put_cap_refs %p wrbuffer %d -> %d (?)\n", -		     inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref); +		dout("put_cap_refs %p wb %d -> %d (?)\n", +		     inode, ci->i_wb_ref+1, ci->i_wb_ref);  	}  	if (had & CEPH_CAP_FILE_WR)  		if (--ci->i_wr_ref == 0) { @@ -2160,7 +2248,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)  				}  			}  		} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),  	     last ? " last" : "", put ? " put" : ""); @@ -2192,7 +2280,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,  	int found = 0;  	struct ceph_cap_snap *capsnap = NULL; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	ci->i_wrbuffer_ref -= nr;  	last = !ci->i_wrbuffer_ref; @@ -2241,7 +2329,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,  		}  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	if (last) {  		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -2255,42 +2343,88 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,  }  /* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. + */ +static void invalidate_aliases(struct inode *inode) +{ +	struct dentry *dn, *prev = NULL; + +	dout("invalidate_aliases inode %p\n", inode); +	d_prune_aliases(inode); +	/* +	 * For non-directory inode, d_find_alias() only returns +	 * hashed dentry. After calling d_invalidate(), the +	 * dentry becomes unhashed. +	 * +	 * For directory inode, d_find_alias() can return +	 * unhashed dentry. But directory inode should have +	 * one alias at most. +	 */ +	while ((dn = d_find_alias(inode))) { +		if (dn == prev) { +			dput(dn); +			break; +		} +		d_invalidate(dn); +		if (prev) +			dput(prev); +		prev = dn; +	} +	if (prev) +		dput(prev); +} + +/*   * Handle a cap GRANT message from the MDS.  
(Note that a GRANT may   * actually be a revocation if it specifies a smaller cap set.)   * - * caller holds s_mutex and i_lock, we drop both. - * - * return value: - *  0 - ok - *  1 - check_caps on auth cap only (writeback) - *  2 - check_caps (ack revoke) + * caller holds s_mutex and i_ceph_lock, we drop both.   */ -static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, +static void handle_cap_grant(struct ceph_mds_client *mdsc, +			     struct inode *inode, struct ceph_mds_caps *grant, +			     void *snaptrace, int snaptrace_len, +			     struct ceph_buffer *xattr_buf,  			     struct ceph_mds_session *session, -			     struct ceph_cap *cap, -			     struct ceph_buffer *xattr_buf) -		__releases(inode->i_lock) +			     struct ceph_cap *cap, int issued) +	__releases(ci->i_ceph_lock)  {  	struct ceph_inode_info *ci = ceph_inode(inode);  	int mds = session->s_mds; -	unsigned seq = le32_to_cpu(grant->seq); -	unsigned issue_seq = le32_to_cpu(grant->issue_seq); +	int seq = le32_to_cpu(grant->seq);  	int newcaps = le32_to_cpu(grant->caps); -	int issued, implemented, used, wanted, dirty; +	int used, wanted, dirty;  	u64 size = le64_to_cpu(grant->size);  	u64 max_size = le64_to_cpu(grant->max_size);  	struct timespec mtime, atime, ctime;  	int check_caps = 0; -	int wake = 0; -	int writeback = 0; -	int revoked_rdcache = 0; -	int queue_invalidate = 0; - -	dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", -	     inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); +	bool wake = 0; +	bool writeback = 0; +	bool queue_trunc = 0; +	bool queue_invalidate = 0; +	bool queue_revalidate = 0; +	bool deleted_inode = 0; + +	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", +	     inode, cap, mds, seq, ceph_cap_string(newcaps));  	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,  		inode->i_size); + +	/* +	 * auth mds of the inode changed. 
we received the cap export message, +	 * but still haven't received the cap import message. handle_cap_export +	 * updated the new auth MDS' cap. +	 * +	 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message +	 * that was sent before the cap import message. So don't remove caps. +	 */ +	if (ceph_seq_cmp(seq, cap->seq) <= 0) { +		WARN_ON(cap != ci->i_auth_cap); +		WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); +		seq = cap->seq; +		newcaps |= cap->issued; +	} +  	/*  	 * If CACHE is being revoked, and we have no dirty buffers,  	 * try to invalidate (once).  (If there are dirty buffers, we @@ -2299,9 +2433,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&  	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&  	    !ci->i_wrbuffer_ref) { -		if (try_nonblocking_invalidate(inode) == 0) { -			revoked_rdcache = 1; -		} else { +		if (try_nonblocking_invalidate(inode)) {  			/* there were locked pages.. invalidate later  			   in a separate thread. 
*/  			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { @@ -2309,27 +2441,33 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  				ci->i_rdcache_revoking = ci->i_rdcache_gen;  			}  		} + +		ceph_fscache_invalidate(inode);  	}  	/* side effects now are allowed */ - -	issued = __ceph_caps_issued(ci, &implemented); -	issued |= implemented | __ceph_caps_dirty(ci); -  	cap->cap_gen = session->s_cap_gen; +	cap->seq = seq;  	__check_cap_issue(ci, cap, newcaps); -	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { +	if ((newcaps & CEPH_CAP_AUTH_SHARED) && +	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {  		inode->i_mode = le32_to_cpu(grant->mode); -		inode->i_uid = le32_to_cpu(grant->uid); -		inode->i_gid = le32_to_cpu(grant->gid); +		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); +		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));  		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, -		     inode->i_uid, inode->i_gid); +		     from_kuid(&init_user_ns, inode->i_uid), +		     from_kgid(&init_user_ns, inode->i_gid));  	} -	if ((issued & CEPH_CAP_LINK_EXCL) == 0) -		inode->i_nlink = le32_to_cpu(grant->nlink); +	if ((newcaps & CEPH_CAP_AUTH_SHARED) && +	    (issued & CEPH_CAP_LINK_EXCL) == 0) { +		set_nlink(inode, le32_to_cpu(grant->nlink)); +		if (inode->i_nlink == 0 && +		    (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) +			deleted_inode = 1; +	}  	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {  		int len = le32_to_cpu(grant->xattr_len); @@ -2342,29 +2480,44 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  				ceph_buffer_put(ci->i_xattrs.blob);  			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);  			ci->i_xattrs.version = version; +			ceph_forget_all_cached_acls(inode);  		}  	} -	/* size/ctime/mtime/atime? 
*/ -	ceph_fill_file_size(inode, issued, -			    le32_to_cpu(grant->truncate_seq), -			    le64_to_cpu(grant->truncate_size), size); -	ceph_decode_timespec(&mtime, &grant->mtime); -	ceph_decode_timespec(&atime, &grant->atime); -	ceph_decode_timespec(&ctime, &grant->ctime); -	ceph_fill_file_time(inode, issued, -			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, -			    &atime); - -	/* max size increase? */ -	if (max_size != ci->i_max_size) { -		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); -		ci->i_max_size = max_size; -		if (max_size >= ci->i_wanted_max_size) { -			ci->i_wanted_max_size = 0;  /* reset */ -			ci->i_requested_max_size = 0; +	/* Do we need to revalidate our fscache cookie. Don't bother on the +	 * first cache cap as we already validate at cookie creation time. */ +	if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) +		queue_revalidate = 1; + +	if (newcaps & CEPH_CAP_ANY_RD) { +		/* ctime/mtime/atime? */ +		ceph_decode_timespec(&mtime, &grant->mtime); +		ceph_decode_timespec(&atime, &grant->atime); +		ceph_decode_timespec(&ctime, &grant->ctime); +		ceph_fill_file_time(inode, issued, +				    le32_to_cpu(grant->time_warp_seq), +				    &ctime, &mtime, &atime); +	} + +	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { +		/* file layout may have changed */ +		ci->i_layout = grant->layout; +		/* size/truncate_seq? */ +		queue_trunc = ceph_fill_file_size(inode, issued, +					le32_to_cpu(grant->truncate_seq), +					le64_to_cpu(grant->truncate_size), +					size); +		/* max size increase? 
*/ +		if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { +			dout("max_size %lld -> %llu\n", +			     ci->i_max_size, max_size); +			ci->i_max_size = max_size; +			if (max_size >= ci->i_wanted_max_size) { +				ci->i_wanted_max_size = 0;  /* reset */ +				ci->i_requested_max_size = 0; +			} +			wake = 1;  		} -		wake = 1;  	}  	/* check cap bits */ @@ -2379,15 +2532,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  		dout("mds wanted %s -> %s\n",  		     ceph_cap_string(le32_to_cpu(grant->wanted)),  		     ceph_cap_string(wanted)); -		grant->wanted = cpu_to_le32(wanted); +		/* imported cap may not have correct mds_wanted */ +		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) +			check_caps = 1;  	} -	cap->seq = seq; -	cap->issue_seq = issue_seq; - -	/* file layout may have changed */ -	ci->i_layout = grant->layout; -  	/* revocation, grant, or no-op? */  	if (cap->issued & ~newcaps) {  		int revoking = cap->issued & ~newcaps; @@ -2414,6 +2563,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  	} else {  		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),  		     ceph_cap_string(newcaps)); +		/* non-auth MDS is revoking the newly grant caps ? 
*/ +		if (cap == ci->i_auth_cap && +		    __ceph_caps_revoking_other(ci, cap, newcaps)) +		    check_caps = 2; +  		cap->issued = newcaps;  		cap->implemented |= newcaps; /* add bits only, to  					      * avoid stepping on a @@ -2422,7 +2576,25 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  	}  	BUG_ON(cap->issued & ~cap->implemented); -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); + +	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { +		down_write(&mdsc->snap_rwsem); +		ceph_update_snap_trace(mdsc, snaptrace, +				       snaptrace + snaptrace_len, false); +		downgrade_write(&mdsc->snap_rwsem); +		kick_flushing_inode_caps(mdsc, session, inode); +		up_read(&mdsc->snap_rwsem); +		if (newcaps & ~issued) +			wake = 1; +	} + +	if (queue_trunc) { +		ceph_queue_vmtruncate(inode); +		ceph_queue_revalidate(inode); +	} else if (queue_revalidate) +		ceph_queue_revalidate(inode); +  	if (writeback)  		/*  		 * queue inode for writeback: we can't actually call @@ -2432,6 +2604,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,  		ceph_queue_writeback(inode);  	if (queue_invalidate)  		ceph_queue_invalidate(inode); +	if (deleted_inode) +		invalidate_aliases(inode);  	if (wake)  		wake_up_all(&ci->i_cap_wq); @@ -2452,7 +2626,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,  				 struct ceph_mds_caps *m,  				 struct ceph_mds_session *session,  				 struct ceph_cap *cap) -	__releases(inode->i_lock) +	__releases(ci->i_ceph_lock)  {  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -2508,7 +2682,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,  	wake_up_all(&ci->i_cap_wq);  out: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	if (drop)  		iput(inode);  } @@ -2531,7 +2705,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,  	
dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",  	     inode, ci, session->s_mds, follows); -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {  		if (capsnap->follows == follows) {  			if (capsnap->flush_tid != flush_tid) { @@ -2554,7 +2728,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,  			     capsnap, capsnap->follows);  		}  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	if (drop)  		iput(inode);  } @@ -2567,7 +2741,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,  static void handle_cap_trunc(struct inode *inode,  			     struct ceph_mds_caps *trunc,  			     struct ceph_mds_session *session) -	__releases(inode->i_lock) +	__releases(ci->i_ceph_lock)  {  	struct ceph_inode_info *ci = ceph_inode(inode);  	int mds = session->s_mds; @@ -2586,10 +2760,12 @@ static void handle_cap_trunc(struct inode *inode,  	     inode, mds, seq, truncate_size, truncate_seq);  	queue_trunc = ceph_fill_file_size(inode, issued,  					  truncate_seq, truncate_size, size); -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); -	if (queue_trunc) +	if (queue_trunc) {  		ceph_queue_vmtruncate(inode); +		ceph_fscache_invalidate(inode); +	}  }  /* @@ -2601,96 +2777,200 @@ static void handle_cap_trunc(struct inode *inode,   * caller holds s_mutex   */  static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, -			      struct ceph_mds_session *session, -			      int *open_target_sessions) +			      struct ceph_mds_cap_peer *ph, +			      struct ceph_mds_session *session)  { +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; +	struct ceph_mds_session *tsession = NULL; +	struct ceph_cap *cap, *tcap, *new_cap = NULL;  	struct ceph_inode_info *ci = ceph_inode(inode); -	int mds = session->s_mds; +	u64 t_cap_id;  	unsigned mseq = le32_to_cpu(ex->migrate_seq); -	struct ceph_cap *cap = 
NULL, *t; -	struct rb_node *p; -	int remember = 1; +	unsigned t_seq, t_mseq; +	int target, issued; +	int mds = session->s_mds; -	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", -	     inode, ci, mds, mseq); +	if (ph) { +		t_cap_id = le64_to_cpu(ph->cap_id); +		t_seq = le32_to_cpu(ph->seq); +		t_mseq = le32_to_cpu(ph->mseq); +		target = le32_to_cpu(ph->mds); +	} else { +		t_cap_id = t_seq = t_mseq = 0; +		target = -1; +	} -	spin_lock(&inode->i_lock); +	dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", +	     inode, ci, mds, mseq, target); +retry: +	spin_lock(&ci->i_ceph_lock); +	cap = __get_cap_for_mds(ci, mds); +	if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) +		goto out_unlock; -	/* make sure we haven't seen a higher mseq */ -	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { -		t = rb_entry(p, struct ceph_cap, ci_node); -		if (ceph_seq_cmp(t->mseq, mseq) > 0) { -			dout(" higher mseq on cap from mds%d\n", -			     t->session->s_mds); -			remember = 0; +	if (target < 0) { +		__ceph_remove_cap(cap, false); +		goto out_unlock; +	} + +	/* +	 * now we know we haven't received the cap import message yet +	 * because the exported cap still exist. 
+	 */ + +	issued = cap->issued; +	WARN_ON(issued != cap->implemented); + +	tcap = __get_cap_for_mds(ci, target); +	if (tcap) { +		/* already have caps from the target */ +		if (tcap->cap_id != t_cap_id || +		    ceph_seq_cmp(tcap->seq, t_seq) < 0) { +			dout(" updating import cap %p mds%d\n", tcap, target); +			tcap->cap_id = t_cap_id; +			tcap->seq = t_seq - 1; +			tcap->issue_seq = t_seq - 1; +			tcap->mseq = t_mseq; +			tcap->issued |= issued; +			tcap->implemented |= issued; +			if (cap == ci->i_auth_cap) +				ci->i_auth_cap = tcap; +			if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { +				spin_lock(&mdsc->cap_dirty_lock); +				list_move_tail(&ci->i_flushing_item, +					       &tcap->session->s_cap_flushing); +				spin_unlock(&mdsc->cap_dirty_lock); +			}  		} -		if (t->session->s_mds == mds) -			cap = t; +		__ceph_remove_cap(cap, false); +		goto out_unlock; +	} else if (tsession) { +		/* add placeholder for the export tagert */ +		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; +		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, +			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + +		__ceph_remove_cap(cap, false); +		goto out_unlock;  	} -	if (cap) { -		if (remember) { -			/* make note */ -			ci->i_cap_exporting_mds = mds; -			ci->i_cap_exporting_mseq = mseq; -			ci->i_cap_exporting_issued = cap->issued; +	spin_unlock(&ci->i_ceph_lock); +	mutex_unlock(&session->s_mutex); -			/* -			 * make sure we have open sessions with all possible -			 * export targets, so that we get the matching IMPORT -			 */ -			*open_target_sessions = 1; +	/* open target session */ +	tsession = ceph_mdsc_open_export_target_session(mdsc, target); +	if (!IS_ERR(tsession)) { +		if (mds > target) { +			mutex_lock(&session->s_mutex); +			mutex_lock_nested(&tsession->s_mutex, +					  SINGLE_DEPTH_NESTING); +		} else { +			mutex_lock(&tsession->s_mutex); +			mutex_lock_nested(&session->s_mutex, +					  SINGLE_DEPTH_NESTING);  		} -		__ceph_remove_cap(cap); +		
ceph_add_cap_releases(mdsc, tsession); +		new_cap = ceph_get_cap(mdsc, NULL); +	} else { +		WARN_ON(1); +		tsession = NULL; +		target = -1;  	} -	/* else, we already released it */ +	goto retry; -	spin_unlock(&inode->i_lock); +out_unlock: +	spin_unlock(&ci->i_ceph_lock); +	mutex_unlock(&session->s_mutex); +	if (tsession) { +		mutex_unlock(&tsession->s_mutex); +		ceph_put_mds_session(tsession); +	} +	if (new_cap) +		ceph_put_cap(mdsc, new_cap);  }  /* - * Handle cap IMPORT.  If there are temp bits from an older EXPORT, - * clean them up. + * Handle cap IMPORT.   * - * caller holds s_mutex. + * caller holds s_mutex. acquires i_ceph_lock   */  static void handle_cap_import(struct ceph_mds_client *mdsc,  			      struct inode *inode, struct ceph_mds_caps *im, +			      struct ceph_mds_cap_peer *ph,  			      struct ceph_mds_session *session, -			      void *snaptrace, int snaptrace_len) +			      struct ceph_cap **target_cap, int *old_issued) +	__acquires(ci->i_ceph_lock)  {  	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_cap *cap, *ocap, *new_cap = NULL;  	int mds = session->s_mds; -	unsigned issued = le32_to_cpu(im->caps); +	int issued; +	unsigned caps = le32_to_cpu(im->caps);  	unsigned wanted = le32_to_cpu(im->wanted);  	unsigned seq = le32_to_cpu(im->seq);  	unsigned mseq = le32_to_cpu(im->migrate_seq);  	u64 realmino = le64_to_cpu(im->realm);  	u64 cap_id = le64_to_cpu(im->cap_id); +	u64 p_cap_id; +	int peer; + +	if (ph) { +		p_cap_id = le64_to_cpu(ph->cap_id); +		peer = le32_to_cpu(ph->mds); +	} else { +		p_cap_id = 0; +		peer = -1; +	} + +	dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", +	     inode, ci, mds, mseq, peer); -	if (ci->i_cap_exporting_mds >= 0 && -	    ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { -		dout("handle_cap_import inode %p ci %p mds%d mseq %d" -		     " - cleared exporting from mds%d\n", -		     inode, ci, mds, mseq, -		     ci->i_cap_exporting_mds); -		ci->i_cap_exporting_issued = 0; -		
ci->i_cap_exporting_mseq = 0; -		ci->i_cap_exporting_mds = -1; +retry: +	spin_lock(&ci->i_ceph_lock); +	cap = __get_cap_for_mds(ci, mds); +	if (!cap) { +		if (!new_cap) { +			spin_unlock(&ci->i_ceph_lock); +			new_cap = ceph_get_cap(mdsc, NULL); +			goto retry; +		} +		cap = new_cap;  	} else { -		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", -		     inode, ci, mds, mseq); +		if (new_cap) { +			ceph_put_cap(mdsc, new_cap); +			new_cap = NULL; +		} +	} + +	__ceph_caps_issued(ci, &issued); +	issued |= __ceph_caps_dirty(ci); + +	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, +		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + +	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; +	if (ocap && ocap->cap_id == p_cap_id) { +		dout(" remove export cap %p mds%d flags %d\n", +		     ocap, peer, ph->flags); +		if ((ph->flags & CEPH_CAP_FLAG_AUTH) && +		    (ocap->seq != le32_to_cpu(ph->seq) || +		     ocap->mseq != le32_to_cpu(ph->mseq))) { +			pr_err("handle_cap_import: mismatched seq/mseq: " +			       "ino (%llx.%llx) mds%d seq %d mseq %d " +			       "importer mds%d has peer seq %d mseq %d\n", +			       ceph_vinop(inode), peer, ocap->seq, +			       ocap->mseq, mds, le32_to_cpu(ph->seq), +			       le32_to_cpu(ph->mseq)); +		} +		__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));  	} -	down_write(&mdsc->snap_rwsem); -	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, -			       false); -	downgrade_write(&mdsc->snap_rwsem); -	ceph_add_cap(inode, session, cap_id, -1, -		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, -		     NULL /* no caps context */); -	try_flush_caps(inode, session, NULL); -	up_read(&mdsc->snap_rwsem); +	/* make sure we re-request max_size, if necessary */ +	ci->i_wanted_max_size = 0; +	ci->i_requested_max_size = 0; + +	*old_issued = issued; +	*target_cap = cap;  }  /* @@ -2705,10 +2985,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,  	struct ceph_mds_client *mdsc = 
session->s_mdsc;  	struct super_block *sb = mdsc->fsc->sb;  	struct inode *inode; +	struct ceph_inode_info *ci;  	struct ceph_cap *cap;  	struct ceph_mds_caps *h; +	struct ceph_mds_cap_peer *peer = NULL;  	int mds = session->s_mds; -	int op; +	int op, issued;  	u32 seq, mseq;  	struct ceph_vino vino;  	u64 cap_id; @@ -2717,12 +2999,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,  	void *snaptrace;  	size_t snaptrace_len;  	void *flock; +	void *end;  	u32 flock_len; -	int open_target_sessions = 0;  	dout("handle_caps from mds%d\n", mds);  	/* decode */ +	end = msg->front.iov_base + msg->front.iov_len;  	tid = le64_to_cpu(msg->hdr.tid);  	if (msg->front.iov_len < sizeof(*h))  		goto bad; @@ -2740,32 +3023,50 @@ void ceph_handle_caps(struct ceph_mds_session *session,  	snaptrace_len = le32_to_cpu(h->snap_trace_len);  	if (le16_to_cpu(msg->hdr.version) >= 2) { -		void *p, *end; - -		p = snaptrace + snaptrace_len; -		end = msg->front.iov_base + msg->front.iov_len; +		void *p = snaptrace + snaptrace_len;  		ceph_decode_32_safe(&p, end, flock_len, bad); +		if (p + flock_len > end) +			goto bad;  		flock = p;  	} else {  		flock = NULL;  		flock_len = 0;  	} +	if (le16_to_cpu(msg->hdr.version) >= 3) { +		if (op == CEPH_CAP_OP_IMPORT) { +			void *p = flock + flock_len; +			if (p + sizeof(*peer) > end) +				goto bad; +			peer = p; +		} else if (op == CEPH_CAP_OP_EXPORT) { +			/* recorded in unused fields */ +			peer = (void *)&h->size; +		} +	} +  	mutex_lock(&session->s_mutex);  	session->s_seq++;  	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,  	     (unsigned)seq); +	if (op == CEPH_CAP_OP_IMPORT) +		ceph_add_cap_releases(mdsc, session); +  	/* lookup ino */  	inode = ceph_find_inode(sb, vino); +	ci = ceph_inode(inode);  	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,  	     vino.snap, inode);  	if (!inode) {  		dout(" i don't have ino %llx\n", vino.ino); -		if (op == CEPH_CAP_OP_IMPORT) +		if (op == 
CEPH_CAP_OP_IMPORT) { +			spin_lock(&session->s_cap_lock);  			__queue_cap_release(session, vino.ino, cap_id,  					    mseq, seq); +			spin_unlock(&session->s_cap_lock); +		}  		goto flush_cap_releases;  	} @@ -2776,32 +3077,35 @@ void ceph_handle_caps(struct ceph_mds_session *session,  		goto done;  	case CEPH_CAP_OP_EXPORT: -		handle_cap_export(inode, h, session, &open_target_sessions); -		goto done; +		handle_cap_export(inode, h, peer, session); +		goto done_unlocked;  	case CEPH_CAP_OP_IMPORT: -		handle_cap_import(mdsc, inode, h, session, -				  snaptrace, snaptrace_len); -		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, -				session); +		handle_cap_import(mdsc, inode, h, peer, session, +				  &cap, &issued); +		handle_cap_grant(mdsc, inode, h,  snaptrace, snaptrace_len, +				 msg->middle, session, cap, issued);  		goto done_unlocked;  	}  	/* the rest require a cap */ -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	cap = __get_cap_for_mds(ceph_inode(inode), mds);  	if (!cap) {  		dout(" no cap on %p ino %llx.%llx from mds%d\n",  		     inode, ceph_ino(inode), ceph_snap(inode), mds); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		goto flush_cap_releases;  	} -	/* note that each of these drops i_lock for us */ +	/* note that each of these drops i_ceph_lock for us */  	switch (op) {  	case CEPH_CAP_OP_REVOKE:  	case CEPH_CAP_OP_GRANT: -		handle_cap_grant(inode, h, session, cap, msg->middle); +		__ceph_caps_issued(ci, &issued); +		issued |= __ceph_caps_dirty(ci); +		handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, +				 session, cap, issued);  		goto done_unlocked;  	case CEPH_CAP_OP_FLUSH_ACK: @@ -2813,7 +3117,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,  		break;  	default: -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,  		       ceph_cap_op_name(op));  	} @@ -2834,8 +3138,6 @@ done:  done_unlocked:  	if (inode)  		
iput(inode); -	if (open_target_sessions) -		ceph_mdsc_open_export_target_sessions(mdsc, session);  	return;  bad: @@ -2876,47 +3178,24 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)   */  void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)  { -	struct ceph_inode_info *ci, *nci = NULL; -	struct inode *inode, *ninode = NULL; -	struct list_head *p, *n; +	struct ceph_inode_info *ci; +	struct inode *inode;  	dout("flush_dirty_caps\n");  	spin_lock(&mdsc->cap_dirty_lock); -	list_for_each_safe(p, n, &mdsc->cap_dirty) { -		if (nci) { -			ci = nci; -			inode = ninode; -			ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; -			dout("flush_dirty_caps inode %p (was next inode)\n", -			     inode); -		} else { -			ci = list_entry(p, struct ceph_inode_info, -					i_dirty_item); -			inode = igrab(&ci->vfs_inode); -			BUG_ON(!inode); -			dout("flush_dirty_caps inode %p\n", inode); -		} -		if (n != &mdsc->cap_dirty) { -			nci = list_entry(n, struct ceph_inode_info, -					 i_dirty_item); -			ninode = igrab(&nci->vfs_inode); -			BUG_ON(!ninode); -			nci->i_ceph_flags |= CEPH_I_NOFLUSH; -			dout("flush_dirty_caps next inode %p, noflush\n", -			     ninode); -		} else { -			nci = NULL; -			ninode = NULL; -		} +	while (!list_empty(&mdsc->cap_dirty)) { +		ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, +				      i_dirty_item); +		inode = &ci->vfs_inode; +		ihold(inode); +		dout("flush_dirty_caps %p\n", inode);  		spin_unlock(&mdsc->cap_dirty_lock); -		if (inode) { -			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, -					NULL); -			iput(inode); -		} +		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); +		iput(inode);  		spin_lock(&mdsc->cap_dirty_lock);  	}  	spin_unlock(&mdsc->cap_dirty_lock); +	dout("flush_dirty_caps done\n");  }  /* @@ -2929,13 +3208,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)  	struct inode *inode = &ci->vfs_inode;  	int last = 0; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	
dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,  	     ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);  	BUG_ON(ci->i_nr_by_mode[fmode] == 0);  	if (--ci->i_nr_by_mode[fmode] == 0)  		last++; -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	if (last && ci->i_vino.snap == CEPH_NOSNAP)  		ceph_check_caps(ci, 0, NULL); @@ -2958,7 +3237,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,  	int used, dirty;  	int ret = 0; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	used = __ceph_caps_used(ci);  	dirty = __ceph_caps_dirty(ci); @@ -2976,21 +3255,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,  		     (cap->issued & unless) == 0)) {  			if ((cap->issued & drop) &&  			    (cap->issued & unless) == 0) { -				dout("encode_inode_release %p cap %p %s -> " -				     "%s\n", inode, cap, +				int wanted = __ceph_caps_wanted(ci); +				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) +					wanted |= cap->mds_wanted; +				dout("encode_inode_release %p cap %p " +				     "%s -> %s, wanted %s -> %s\n", inode, cap,  				     ceph_cap_string(cap->issued), -				     ceph_cap_string(cap->issued & ~drop)); +				     ceph_cap_string(cap->issued & ~drop), +				     ceph_cap_string(cap->mds_wanted), +				     ceph_cap_string(wanted)); +  				cap->issued &= ~drop;  				cap->implemented &= ~drop; -				if (ci->i_ceph_flags & CEPH_I_NODELAY) { -					int wanted = __ceph_caps_wanted(ci); -					dout("  wanted %s -> %s (act %s)\n", -					     ceph_cap_string(cap->mds_wanted), -					     ceph_cap_string(cap->mds_wanted & -							     ~wanted), -					     ceph_cap_string(wanted)); -					cap->mds_wanted &= wanted; -				} +				cap->mds_wanted = wanted;  			} else {  				dout("encode_inode_release %p cap %p %s"  				     " (force)\n", inode, cap, @@ -3002,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,  			rel->seq = cpu_to_le32(cap->seq);  			rel->issue_seq = cpu_to_le32(cap->issue_seq),  			rel->mseq = 
cpu_to_le32(cap->mseq); -			rel->caps = cpu_to_le32(cap->issued); +			rel->caps = cpu_to_le32(cap->implemented);  			rel->wanted = cpu_to_le32(cap->mds_wanted);  			rel->dname_len = 0;  			rel->dname_seq = 0; @@ -3013,7 +3290,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,  			     inode, cap, ceph_cap_string(cap->issued));  		}  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return ret;  } @@ -3028,7 +3305,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,  	/*  	 * force an record for the directory caps if we have a dentry lease. -	 * this is racy (can't take i_lock and d_lock together), but it +	 * this is racy (can't take i_ceph_lock and d_lock together), but it  	 * doesn't have to be perfect; the mds will revoke anything we don't  	 * release.  	 */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 7ae1b3d55b5..5a743ac141a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -60,17 +60,20 @@ static int mdsc_show(struct seq_file *s, void *p)  	for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {  		req = rb_entry(rp, struct ceph_mds_request, r_node); -		if (req->r_request) -			seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); -		else +		if (req->r_request && req->r_session) +			seq_printf(s, "%lld\tmds%d\t", req->r_tid, +				   req->r_session->s_mds); +		else if (!req->r_request)  			seq_printf(s, "%lld\t(no request)\t", req->r_tid); +		else +			seq_printf(s, "%lld\t(no session)\t", req->r_tid);  		seq_printf(s, "%s", ceph_mds_op_name(req->r_op));  		if (req->r_got_unsafe) -			seq_printf(s, "\t(unsafe)"); +			seq_puts(s, "\t(unsafe)");  		else -			seq_printf(s, "\t"); +			seq_puts(s, "\t");  		if (req->r_inode) {  			seq_printf(s, " #%llx", ceph_ino(req->r_inode)); @@ -90,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)  		} else if (req->r_path1) {  			seq_printf(s, " #%llx/%s", req->r_ino1.ino,  				   req->r_path1); +		} else { +			seq_printf(s, " #%llx", 
req->r_ino1.ino);  		}  		if (req->r_old_dentry) { @@ -99,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)  				path = NULL;  			spin_lock(&req->r_old_dentry->d_lock);  			seq_printf(s, " #%llx/%.*s (%s)", -			   ceph_ino(req->r_old_dentry->d_parent->d_inode), +				   req->r_old_dentry_dir ? +				   ceph_ino(req->r_old_dentry_dir) : 0,  				   req->r_old_dentry->d_name.len,  				   req->r_old_dentry->d_name.name,  				   path ? path : ""); @@ -113,7 +119,7 @@ static int mdsc_show(struct seq_file *s, void *p)  				seq_printf(s, " %s", req->r_path2);  		} -		seq_printf(s, "\n"); +		seq_puts(s, "\n");  	}  	mutex_unlock(&mdsc->mutex); @@ -198,6 +204,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)  	int err = -ENOMEM;  	dout("ceph_fs_debugfs_init\n"); +	BUG_ON(!fsc->client->debugfs_dir);  	fsc->debugfs_congestion_kb =  		debugfs_create_file("writeback_congestion_kb",  				    0600, @@ -207,8 +214,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)  	if (!fsc->debugfs_congestion_kb)  		goto out; -	dout("a\n"); -  	snprintf(name, sizeof(name), "../../bdi/%s",  		 dev_name(fsc->backing_dev_info.dev));  	fsc->debugfs_bdi = @@ -218,7 +223,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)  	if (!fsc->debugfs_bdi)  		goto out; -	dout("b\n");  	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",  					0600,  					fsc->client->debugfs_dir, @@ -227,7 +231,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)  	if (!fsc->debugfs_mdsmap)  		goto out; -	dout("ca\n");  	fsc->debugfs_mdsc = debugfs_create_file("mdsc",  						0600,  						fsc->client->debugfs_dir, @@ -236,7 +239,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)  	if (!fsc->debugfs_mdsc)  		goto out; -	dout("da\n");  	fsc->debugfs_caps = debugfs_create_file("caps",  						   0400,  						   fsc->client->debugfs_dir, @@ -245,7 +247,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)  	if (!fsc->debugfs_caps)  		goto out; -	dout("ea\n");  	fsc->debugfs_dentry_lru = 
debugfs_create_file("dentry_lru",  					0600,  					fsc->client->debugfs_dir, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e0a2dc6fcaf..c29d6ae6887 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -40,13 +40,6 @@ int ceph_init_dentry(struct dentry *dentry)  	if (dentry->d_fsdata)  		return 0; -	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) -		dentry->d_op = &ceph_dentry_ops; -	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) -		dentry->d_op = &ceph_snapdir_dentry_ops; -	else -		dentry->d_op = &ceph_snap_dentry_ops; -  	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);  	if (!di)  		return -ENOMEM;          /* oh well */ @@ -57,16 +50,41 @@ int ceph_init_dentry(struct dentry *dentry)  		kmem_cache_free(ceph_dentry_cachep, di);  		goto out_unlock;  	} + +	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) +		d_set_d_op(dentry, &ceph_dentry_ops); +	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) +		d_set_d_op(dentry, &ceph_snapdir_dentry_ops); +	else +		d_set_d_op(dentry, &ceph_snap_dentry_ops); +  	di->dentry = dentry;  	di->lease_session = NULL; -	dentry->d_fsdata = di;  	dentry->d_time = jiffies; +	/* avoid reordering d_fsdata setup so that the check above is safe */ +	smp_mb(); +	dentry->d_fsdata = di;  	ceph_dentry_lru_add(dentry);  out_unlock:  	spin_unlock(&dentry->d_lock);  	return 0;  } +struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +{ +	struct inode *inode = NULL; + +	if (!dentry) +		return NULL; + +	spin_lock(&dentry->d_lock); +	if (!IS_ROOT(dentry)) { +		inode = dentry->d_parent->d_inode; +		ihold(inode); +	} +	spin_unlock(&dentry->d_lock); +	return inode; +}  /* @@ -82,6 +100,14 @@ static unsigned fpos_off(loff_t p)  	return p & 0xffffffff;  } +static int fpos_cmp(loff_t l, loff_t r) +{ +	int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r)); +	if (v) +		return v; +	return (int)(fpos_off(l) - fpos_off(r)); +} +  /*   * When possible, we try to satisfy a readdir by 
peeking at the   * dcache.  We make this work by carefully ordering dentries on @@ -89,15 +115,15 @@ static unsigned fpos_off(loff_t p)   * falling back to a "normal" sync readdir if any dentries in the dir   * are dropped.   * - * I_COMPLETE tells indicates we have all dentries in the dir.  It is + * Complete dir indicates that we have all dentries in the dir.  It is   * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by   * the MDS if/when the directory is modified).   */ -static int __dcache_readdir(struct file *filp, -			    void *dirent, filldir_t filldir) +static int __dcache_readdir(struct file *file,  struct dir_context *ctx, +			    u32 shared_gen)  { -	struct ceph_file_info *fi = filp->private_data; -	struct dentry *parent = filp->f_dentry; +	struct ceph_file_info *fi = file->private_data; +	struct dentry *parent = file->f_dentry;  	struct inode *dir = parent->d_inode;  	struct list_head *p;  	struct dentry *dentry, *last; @@ -108,14 +134,14 @@ static int __dcache_readdir(struct file *filp,  	last = fi->dentry;  	fi->dentry = NULL; -	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, -	     last); +	dout("__dcache_readdir %p v%u at %llu (last %p)\n", +	     dir, shared_gen, ctx->pos, last); -	spin_lock(&dcache_lock); +	spin_lock(&parent->d_lock);  	/* start at beginning? */ -	if (filp->f_pos == 2 || (last && -				 filp->f_pos < ceph_dentry(last)->offset)) { +	if (ctx->pos == 2 || last == NULL || +	    fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {  		if (list_empty(&parent->d_subdirs))  			goto out_unlock;  		p = parent->d_subdirs.prev; @@ -132,63 +158,65 @@ more:  		     d_unhashed(dentry) ? 
"!hashed" : "hashed",  		     parent->d_subdirs.prev, parent->d_subdirs.next);  		if (p == &parent->d_subdirs) { -			fi->at_end = 1; +			fi->flags |= CEPH_F_ATEND;  			goto out_unlock;  		} -		if (!d_unhashed(dentry) && dentry->d_inode && +		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); +		if (di->lease_shared_gen == shared_gen && +		    !d_unhashed(dentry) && dentry->d_inode &&  		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&  		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && -		    filp->f_pos <= di->offset) +		    fpos_cmp(ctx->pos, di->offset) <= 0)  			break;  		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,  		     dentry->d_name.len, dentry->d_name.name, di->offset, -		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", +		     ctx->pos, d_unhashed(dentry) ? " unhashed" : "",  		     !dentry->d_inode ? " null" : ""); +		spin_unlock(&dentry->d_lock);  		p = p->prev;  		dentry = list_entry(p, struct dentry, d_u.d_child);  		di = ceph_dentry(dentry);  	} -	atomic_inc(&dentry->d_count); -	spin_unlock(&dcache_lock); +	dget_dlock(dentry); +	spin_unlock(&dentry->d_lock); +	spin_unlock(&parent->d_lock); -	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, -	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); -	filp->f_pos = di->offset; -	err = filldir(dirent, dentry->d_name.name, -		      dentry->d_name.len, di->offset, -		      dentry->d_inode->i_ino, -		      dentry->d_inode->i_mode >> 12); +	/* make sure a dentry wasn't dropped while we didn't have parent lock */ +	if (!ceph_dir_is_complete(dir)) { +		dout(" lost dir complete on %p; falling back to mds\n", dir); +		dput(dentry); +		err = -EAGAIN; +		goto out; +	} -	if (last) { -		if (err < 0) { +	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, +	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); +	if (!dir_emit(ctx, dentry->d_name.name, +		      dentry->d_name.len, +		      ceph_translate_ino(dentry->d_sb, 
dentry->d_inode->i_ino), +		      dentry->d_inode->i_mode >> 12)) { +		if (last) {  			/* remember our position */  			fi->dentry = last; -			fi->next_offset = di->offset; -		} else { -			dput(last); +			fi->next_offset = fpos_off(di->offset);  		} +		dput(dentry); +		return 0;  	} -	last = dentry; -	if (err < 0) -		goto out; - -	filp->f_pos++; +	ctx->pos = di->offset + 1; -	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */ -	if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { -		dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); -		err = -EAGAIN; -		goto out; -	} +	if (last) +		dput(last); +	last = dentry; -	spin_lock(&dcache_lock); +	spin_lock(&parent->d_lock);  	p = p->prev;	/* advance to next dentry */  	goto more;  out_unlock: -	spin_unlock(&dcache_lock); +	spin_unlock(&parent->d_lock);  out:  	if (last)  		dput(last); @@ -214,61 +242,64 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,  	return 0;  } -static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ceph_readdir(struct file *file, struct dir_context *ctx)  { -	struct ceph_file_info *fi = filp->private_data; -	struct inode *inode = filp->f_dentry->d_inode; +	struct ceph_file_info *fi = file->private_data; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	struct ceph_mds_client *mdsc = fsc->mdsc; -	unsigned frag = fpos_frag(filp->f_pos); -	int off = fpos_off(filp->f_pos); +	unsigned frag = fpos_frag(ctx->pos); +	int off = fpos_off(ctx->pos);  	int err;  	u32 ftype;  	struct ceph_mds_reply_info_parsed *rinfo; -	const int max_entries = fsc->mount_options->max_readdir; -	const int max_bytes = fsc->mount_options->max_readdir_bytes; -	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); -	if (fi->at_end) +	dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); +	if (fi->flags & CEPH_F_ATEND)  		
return 0;  	/* always start with . and .. */ -	if (filp->f_pos == 0) { +	if (ctx->pos == 0) {  		/* note dir version at start of readdir so we can tell  		 * if any dentries get dropped */ -		fi->dir_release_count = ci->i_release_count; +		fi->dir_release_count = atomic_read(&ci->i_release_count);  		dout("readdir off 0 -> '.'\n"); -		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), -			    inode->i_ino, inode->i_mode >> 12) < 0) +		if (!dir_emit(ctx, ".", 1,  +			    ceph_translate_ino(inode->i_sb, inode->i_ino), +			    inode->i_mode >> 12))  			return 0; -		filp->f_pos = 1; +		ctx->pos = 1;  		off = 1;  	} -	if (filp->f_pos == 1) { +	if (ctx->pos == 1) { +		ino_t ino = parent_ino(file->f_dentry);  		dout("readdir off 1 -> '..'\n"); -		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), -			    filp->f_dentry->d_parent->d_inode->i_ino, -			    inode->i_mode >> 12) < 0) +		if (!dir_emit(ctx, "..", 2, +			    ceph_translate_ino(inode->i_sb, ino), +			    inode->i_mode >> 12))  			return 0; -		filp->f_pos = 2; +		ctx->pos = 2;  		off = 2;  	}  	/* can we use the dcache? 
*/ -	spin_lock(&inode->i_lock); -	if ((filp->f_pos == 2 || fi->dentry) && +	spin_lock(&ci->i_ceph_lock); +	if ((ctx->pos == 2 || fi->dentry) &&  	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&  	    ceph_snap(inode) != CEPH_SNAPDIR && -	    (ci->i_ceph_flags & CEPH_I_COMPLETE) && +	    __ceph_dir_is_complete(ci) &&  	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { -		spin_unlock(&inode->i_lock); -		err = __dcache_readdir(filp, dirent, filldir); +		u32 shared_gen = ci->i_shared_gen; +		spin_unlock(&ci->i_ceph_lock); +		err = __dcache_readdir(file, ctx, shared_gen);  		if (err != -EAGAIN)  			return err; +		frag = fpos_frag(ctx->pos); +		off = fpos_off(ctx->pos);  	} else { -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  	}  	if (fi->dentry) {  		err = note_last_dentry(fi, fi->dentry->d_name.name, @@ -294,16 +325,19 @@ more:  			fi->last_readdir = NULL;  		} -		/* requery frag tree, as the frag topology may have changed */ -		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); -  		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",  		     ceph_vinop(inode), frag, fi->last_name);  		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);  		if (IS_ERR(req))  			return PTR_ERR(req); -		req->r_inode = igrab(inode); -		req->r_dentry = dget(filp->f_dentry); +		err = ceph_alloc_readdir_reply_buffer(req, inode); +		if (err) { +			ceph_mdsc_put_request(req); +			return err; +		} +		req->r_inode = inode; +		ihold(inode); +		req->r_dentry = dget(file->f_dentry);  		/* hints to request -> mds selection code */  		req->r_direct_mode = USE_AUTH_MDS;  		req->r_direct_hash = ceph_frag_value(frag); @@ -311,9 +345,6 @@ more:  		req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);  		req->r_readdir_offset = fi->next_offset;  		req->r_args.readdir.frag = cpu_to_le32(frag); -		req->r_args.readdir.max_entries = cpu_to_le32(max_entries); -		req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); -		req->r_num_caps = max_entries + 1;  		err = 
ceph_mdsc_do_request(mdsc, NULL, req);  		if (err < 0) {  			ceph_mdsc_put_request(req); @@ -326,19 +357,32 @@ more:  		if (!req->r_did_prepopulate) {  			dout("readdir !did_prepopulate"); -			fi->dir_release_count--;    /* preclude I_COMPLETE */ +			/* preclude from marking dir complete */ +			fi->dir_release_count--;  		}  		/* note next offset and last dentry name */ +		rinfo = &req->r_reply_info; +		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { +			frag = le32_to_cpu(rinfo->dir_dir->frag); +			if (ceph_frag_is_leftmost(frag)) +				fi->next_offset = 2; +			else +				fi->next_offset = 0; +			off = fi->next_offset; +		} +		fi->frag = frag;  		fi->offset = fi->next_offset;  		fi->last_readdir = req;  		if (req->r_reply_info.dir_end) {  			kfree(fi->last_name);  			fi->last_name = NULL; -			fi->next_offset = 2; +			if (ceph_frag_is_rightmost(frag)) +				fi->next_offset = 2; +			else +				fi->next_offset = 0;  		} else { -			rinfo = &req->r_reply_info;  			err = note_last_dentry(fi,  				       rinfo->dir_dname[rinfo->dir_nr-1],  				       rinfo->dir_dname_len[rinfo->dir_nr-1]); @@ -351,27 +395,32 @@ more:  	rinfo = &fi->last_readdir->r_reply_info;  	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,  	     rinfo->dir_nr, off, fi->offset); -	while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) { -		u64 pos = ceph_make_fpos(frag, off); + +	ctx->pos = ceph_make_fpos(frag, off); +	while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {  		struct ceph_mds_reply_inode *in =  			rinfo->dir_in[off - fi->offset].in; +		struct ceph_vino vino; +		ino_t ino; +  		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", -		     off, off - fi->offset, rinfo->dir_nr, pos, +		     off, off - fi->offset, rinfo->dir_nr, ctx->pos,  		     rinfo->dir_dname_len[off - fi->offset],  		     rinfo->dir_dname[off - fi->offset], in);  		BUG_ON(!in);  		ftype = le32_to_cpu(in->mode) >> 12; -		if (filldir(dirent, +		vino.ino = le64_to_cpu(in->ino); +		vino.snap = 
le64_to_cpu(in->snapid); +		ino = ceph_vino_to_ino(vino); +		if (!dir_emit(ctx,  			    rinfo->dir_dname[off - fi->offset],  			    rinfo->dir_dname_len[off - fi->offset], -			    pos, -			    le64_to_cpu(in->ino), -			    ftype) < 0) { +			    ceph_translate_ino(inode->i_sb, ino), ftype)) {  			dout("filldir stopping us...\n");  			return 0;  		}  		off++; -		filp->f_pos = pos + 1; +		ctx->pos++;  	}  	if (fi->last_name) { @@ -384,65 +433,73 @@ more:  	if (!ceph_frag_is_rightmost(frag)) {  		frag = ceph_frag_next(frag);  		off = 0; -		filp->f_pos = ceph_make_fpos(frag, off); +		ctx->pos = ceph_make_fpos(frag, off);  		dout("readdir next frag is %x\n", frag);  		goto more;  	} -	fi->at_end = 1; +	fi->flags |= CEPH_F_ATEND;  	/*  	 * if dir_release_count still matches the dir, no dentries  	 * were released during the whole readdir, and we should have  	 * the complete dir contents in our cache.  	 */ -	spin_lock(&inode->i_lock); -	if (ci->i_release_count == fi->dir_release_count) { +	spin_lock(&ci->i_ceph_lock); +	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {  		dout(" marking %p complete\n", inode); -		ci->i_ceph_flags |= CEPH_I_COMPLETE; -		ci->i_max_offset = filp->f_pos; +		__ceph_dir_set_complete(ci, fi->dir_release_count);  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); -	dout("readdir %p filp %p done.\n", inode, filp); +	dout("readdir %p file %p done.\n", inode, file);  	return 0;  } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_file_info *fi, unsigned frag)  {  	if (fi->last_readdir) {  		ceph_mdsc_put_request(fi->last_readdir);  		fi->last_readdir = NULL;  	}  	kfree(fi->last_name); -	fi->next_offset = 2;  /* compensate for . and .. */ +	fi->last_name = NULL; +	if (ceph_frag_is_leftmost(frag)) +		fi->next_offset = 2;  /* compensate for . and .. 
*/ +	else +		fi->next_offset = 0;  	if (fi->dentry) {  		dput(fi->dentry);  		fi->dentry = NULL;  	} -	fi->at_end = 0; +	fi->flags &= ~CEPH_F_ATEND;  } -static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)  {  	struct ceph_file_info *fi = file->private_data;  	struct inode *inode = file->f_mapping->host; -	loff_t old_offset = offset; +	loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);  	loff_t retval;  	mutex_lock(&inode->i_mutex); -	switch (origin) { +	retval = -EINVAL; +	switch (whence) {  	case SEEK_END:  		offset += inode->i_size + 2;   /* FIXME */  		break;  	case SEEK_CUR:  		offset += file->f_pos; +	case SEEK_SET: +		break; +	default: +		goto out;  	} -	retval = -EINVAL; -	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + +	if (offset >= 0) {  		if (offset != file->f_pos) {  			file->f_pos = offset;  			file->f_version = 0; -			fi->at_end = 0; +			fi->flags &= ~CEPH_F_ATEND;  		}  		retval = offset; @@ -451,39 +508,33 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)  		 * seek to new frag, or seek prior to current chunk.  		 */  		if (offset == 0 || -		    fpos_frag(offset) != fpos_frag(old_offset) || +		    fpos_frag(offset) != fi->frag ||  		    fpos_off(offset) < fi->offset) {  			dout("dir_llseek dropping %p content\n", file); -			reset_readdir(fi); +			reset_readdir(fi, fpos_frag(offset));  		}  		/* bump dir_release_count if we did a forward seek */ -		if (offset > old_offset) +		if (fpos_cmp(offset, old_offset) > 0)  			fi->dir_release_count--;  	} +out:  	mutex_unlock(&inode->i_mutex);  	return retval;  }  /* - * Process result of a lookup/open request. - * - * Mainly, make sure we return the final req->r_dentry (if it already - * existed) in place of the original VFS-provided dentry when they - * differ. 
- * - * Gracefully handle the case where the MDS replies with -ENOENT and - * no trace (which it may do, at its discretion, e.g., if it doesn't - * care to issue a lease on the negative dentry). + * Handle lookups for the hidden .snap directory.   */ -struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, -				  struct dentry *dentry, int err) +int ceph_handle_snapdir(struct ceph_mds_request *req, +			struct dentry *dentry, int err)  {  	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); -	struct inode *parent = dentry->d_parent->d_inode; +	struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */  	/* .snap dir? */  	if (err == -ENOENT && +	    ceph_snap(parent) == CEPH_NOSNAP &&  	    strcmp(dentry->d_name.name,  		   fsc->mount_options->snapdir_name) == 0) {  		struct inode *inode = ceph_get_snapdir(parent); @@ -493,7 +544,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,  		d_add(dentry, inode);  		err = 0;  	} +	return err; +} +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, +				  struct dentry *dentry, int err) +{  	if (err == -ENOENT) {  		/* no trace? */  		err = 0; @@ -528,7 +595,7 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)   * the MDS so that it gets our 'caps wanted' value in a single op.   
*/  static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, -				  struct nameidata *nd) +				  unsigned int flags)  {  	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);  	struct ceph_mds_client *mdsc = fsc->mdsc; @@ -546,35 +613,26 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,  	if (err < 0)  		return ERR_PTR(err); -	/* open (but not create!) intent? */ -	if (nd && -	    (nd->flags & LOOKUP_OPEN) && -	    (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */ -	    !(nd->intent.open.flags & O_CREAT)) { -		int mode = nd->intent.open.create_mode & ~current->fs->umask; -		return ceph_lookup_open(dir, dentry, nd, mode, 1); -	} -  	/* can we conclude ENOENT locally? */  	if (dentry->d_inode == NULL) {  		struct ceph_inode_info *ci = ceph_inode(dir);  		struct ceph_dentry_info *di = ceph_dentry(dentry); -		spin_lock(&dir->i_lock); +		spin_lock(&ci->i_ceph_lock);  		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);  		if (strncmp(dentry->d_name.name,  			    fsc->mount_options->snapdir_name,  			    dentry->d_name.len) &&  		    !is_root_ceph_dentry(dir, dentry) && -		    (ci->i_ceph_flags & CEPH_I_COMPLETE) && +		    __ceph_dir_is_complete(ci) &&  		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { -			spin_unlock(&dir->i_lock); +			spin_unlock(&ci->i_ceph_lock);  			dout(" dir %p complete, -ENOENT\n", dir);  			d_add(dentry, NULL);  			di->lease_shared_gen = ci->i_shared_gen;  			return NULL;  		} -		spin_unlock(&dir->i_lock); +		spin_unlock(&ci->i_ceph_lock);  	}  	op = ceph_snap(dir) == CEPH_SNAPDIR ? 
@@ -588,6 +646,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,  	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);  	req->r_locked_dir = dir;  	err = ceph_mdsc_do_request(mdsc, NULL, req); +	err = ceph_handle_snapdir(req, dentry, err);  	dentry = ceph_finish_lookup(req, dentry, err);  	ceph_mdsc_put_request(req);  /* will dput(dentry) */  	dout("lookup result=%p\n", dentry); @@ -600,7 +659,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,   */  int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)  { -	struct dentry *result = ceph_lookup(dir, dentry, NULL); +	struct dentry *result = ceph_lookup(dir, dentry, 0);  	if (result && !IS_ERR(result)) {  		/* @@ -618,7 +677,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)  }  static int ceph_mknod(struct inode *dir, struct dentry *dentry, -		      int mode, dev_t rdev) +		      umode_t mode, dev_t rdev)  {  	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);  	struct ceph_mds_client *mdsc = fsc->mdsc; @@ -628,7 +687,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,  	if (ceph_snap(dir) != CEPH_NOSNAP)  		return -EROFS; -	dout("mknod in dir %p dentry %p mode 0%o rdev %d\n", +	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",  	     dir, dentry, mode, rdev);  	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);  	if (IS_ERR(req)) { @@ -646,31 +705,18 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,  	if (!err && !req->r_reply_info.head->is_dentry)  		err = ceph_handle_notrace_create(dir, dentry);  	ceph_mdsc_put_request(req); -	if (err) + +	if (!err) +		ceph_init_acl(dentry, dentry->d_inode, dir); +	else  		d_drop(dentry);  	return err;  } -static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, -		       struct nameidata *nd) +static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, +		       
bool excl)  { -	dout("create in dir %p dentry %p name '%.*s'\n", -	     dir, dentry, dentry->d_name.len, dentry->d_name.name); - -	if (ceph_snap(dir) != CEPH_NOSNAP) -		return -EROFS; - -	if (nd) { -		BUG_ON((nd->flags & LOOKUP_OPEN) == 0); -		dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); -		/* hrm, what should i do here if we get aliased? */ -		if (IS_ERR(dentry)) -			return PTR_ERR(dentry); -		return 0; -	} - -	/* fall back to mknod */ -	return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); +	return ceph_mknod(dir, dentry, mode, 0);  }  static int ceph_symlink(struct inode *dir, struct dentry *dentry, @@ -700,12 +746,14 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,  	if (!err && !req->r_reply_info.head->is_dentry)  		err = ceph_handle_notrace_create(dir, dentry);  	ceph_mdsc_put_request(req); -	if (err) +	if (!err) +		ceph_init_acl(dentry, dentry->d_inode, dir); +	else  		d_drop(dentry);  	return err;  } -static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  {  	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);  	struct ceph_mds_client *mdsc = fsc->mdsc; @@ -719,7 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)  		dout("mksnap dir %p snap '%.*s' dn %p\n", dir,  		     dentry->d_name.len, dentry->d_name.name, dentry);  	} else if (ceph_snap(dir) == CEPH_NOSNAP) { -		dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode); +		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);  		op = CEPH_MDS_OP_MKDIR;  	} else {  		goto out; @@ -741,7 +789,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)  		err = ceph_handle_notrace_create(dir, dentry);  	ceph_mdsc_put_request(req);  out: -	if (err < 0) +	if (!err) +		ceph_init_acl(dentry, dentry->d_inode, dir); +	else  		d_drop(dentry);  	return err;  } @@ -766,15 +816,19 @@ static int ceph_link(struct dentry 
*old_dentry, struct inode *dir,  	}  	req->r_dentry = dget(dentry);  	req->r_num_caps = 2; -	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ +	req->r_old_dentry = dget(old_dentry);  	req->r_locked_dir = dir;  	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;  	req->r_dentry_unless = CEPH_CAP_FILE_EXCL; +	/* release LINK_SHARED on source inode (mds will lock it) */ +	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;  	err = ceph_mdsc_do_request(mdsc, dir, req); -	if (err) +	if (err) {  		d_drop(dentry); -	else if (!req->r_reply_info.head->is_dentry) -		d_instantiate(dentry, igrab(old_dentry->d_inode)); +	} else if (!req->r_reply_info.head->is_dentry) { +		ihold(old_dentry->d_inode); +		d_instantiate(dentry, old_dentry->d_inode); +	}  	ceph_mdsc_put_request(req);  	return err;  } @@ -790,12 +844,12 @@ static int drop_caps_for_unlink(struct inode *inode)  	struct ceph_inode_info *ci = ceph_inode(inode);  	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	if (inode->i_nlink == 1) {  		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);  		ci->i_ceph_flags |= CEPH_I_NODELAY;  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return drop;  } @@ -819,7 +873,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)  	} else if (ceph_snap(dir) == CEPH_NOSNAP) {  		dout("unlink/rmdir dir %p dn %p inode %p\n",  		     dir, dentry, inode); -		op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ? +		op = S_ISDIR(dentry->d_inode->i_mode) ?  			
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;  	} else  		goto out; @@ -860,9 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,  	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);  	if (IS_ERR(req))  		return PTR_ERR(req); +	ihold(old_dir);  	req->r_dentry = dget(new_dentry);  	req->r_num_caps = 2;  	req->r_old_dentry = dget(old_dentry); +	req->r_old_dentry_dir = old_dir;  	req->r_locked_dir = new_dir;  	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;  	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -880,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,  		 * to do it here.  		 */ -		/* d_move screws up d_subdirs order */ -		ceph_i_clear(new_dir, CEPH_I_COMPLETE); -  		d_move(old_dentry, new_dentry);  		/* ensure target dentry is invalidated, despite  		   rehashing bug in vfs_rename_dir */  		ceph_invalidate_dentry_lease(new_dentry); + +		/* d_move screws up sibling dentries' offsets */ +		ceph_dir_clear_complete(old_dir); +		ceph_dir_clear_complete(new_dir); +  	}  	ceph_mdsc_put_request(req);  	return err; @@ -921,12 +979,12 @@ static int dentry_lease_is_valid(struct dentry *dentry)  	spin_lock(&dentry->d_lock);  	di = ceph_dentry(dentry); -	if (di && di->lease_session) { +	if (di->lease_session) {  		s = di->lease_session; -		spin_lock(&s->s_cap_lock); +		spin_lock(&s->s_gen_ttl_lock);  		gen = s->s_cap_gen;  		ttl = s->s_cap_ttl; -		spin_unlock(&s->s_cap_lock); +		spin_unlock(&s->s_gen_ttl_lock);  		if (di->lease_gen == gen &&  		    time_before(jiffies, dentry->d_time) && @@ -963,10 +1021,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)  	struct ceph_dentry_info *di = ceph_dentry(dentry);  	int valid = 0; -	spin_lock(&dir->i_lock); +	spin_lock(&ci->i_ceph_lock);  	if (ci->i_shared_gen == di->lease_shared_gen)  		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); -	spin_unlock(&dir->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	
dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",  	     dir, (unsigned)ci->i_shared_gen, dentry,  	     (unsigned)di->lease_shared_gen, valid); @@ -976,75 +1034,64 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)  /*   * Check if cached dentry can be trusted.   */ -static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)  { -	struct inode *dir = dentry->d_parent->d_inode; +	int valid = 0; +	struct inode *dir; + +	if (flags & LOOKUP_RCU) +		return -ECHILD;  	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,  	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,  	     ceph_dentry(dentry)->offset); +	dir = ceph_get_dentry_parent_inode(dentry); +  	/* always trust cached snapped dentries, snapdir dentry */  	if (ceph_snap(dir) != CEPH_NOSNAP) {  		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,  		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode); -		goto out_touch; +		valid = 1; +	} else if (dentry->d_inode && +		   ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { +		valid = 1; +	} else if (dentry_lease_is_valid(dentry) || +		   dir_lease_is_valid(dir, dentry)) { +		if (dentry->d_inode) +			valid = ceph_is_any_caps(dentry->d_inode); +		else +			valid = 1;  	} -	if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) -		goto out_touch; - -	if (dentry_lease_is_valid(dentry) || -	    dir_lease_is_valid(dir, dentry)) -		goto out_touch; -	dout("d_revalidate %p invalid\n", dentry); -	d_drop(dentry); -	return 0; -out_touch: -	ceph_dentry_lru_touch(dentry); -	return 1; +	dout("d_revalidate %p %s\n", dentry, valid ? 
"valid" : "invalid"); +	if (valid) { +		ceph_dentry_lru_touch(dentry); +	} else { +		ceph_dir_clear_complete(dir); +		d_drop(dentry); +	} +	iput(dir); +	return valid;  }  /* - * When a dentry is released, clear the dir I_COMPLETE if it was part - * of the current dir gen or if this is in the snapshot namespace. + * Release our ceph_dentry_info.   */ -static void ceph_dentry_release(struct dentry *dentry) +static void ceph_d_release(struct dentry *dentry)  {  	struct ceph_dentry_info *di = ceph_dentry(dentry); -	struct inode *parent_inode = NULL; -	u64 snapid = CEPH_NOSNAP; -	if (!IS_ROOT(dentry)) { -		parent_inode = dentry->d_parent->d_inode; -		if (parent_inode) -			snapid = ceph_snap(parent_inode); -	} -	dout("dentry_release %p parent %p\n", dentry, parent_inode); -	if (parent_inode && snapid != CEPH_SNAPDIR) { -		struct ceph_inode_info *ci = ceph_inode(parent_inode); - -		spin_lock(&parent_inode->i_lock); -		if (ci->i_shared_gen == di->lease_shared_gen || -		    snapid <= CEPH_MAXSNAP) { -			dout(" clearing %p complete (d_release)\n", -			     parent_inode); -			ci->i_ceph_flags &= ~CEPH_I_COMPLETE; -			ci->i_release_count++; -		} -		spin_unlock(&parent_inode->i_lock); -	} -	if (di) { -		ceph_dentry_lru_del(dentry); -		if (di->lease_session) -			ceph_put_mds_session(di->lease_session); -		kmem_cache_free(ceph_dentry_cachep, di); -		dentry->d_fsdata = NULL; -	} +	dout("d_release %p\n", dentry); +	ceph_dentry_lru_del(dentry); +	if (di->lease_session) +		ceph_put_mds_session(di->lease_session); +	kmem_cache_free(ceph_dentry_cachep, di); +	dentry->d_fsdata = NULL;  }  static int ceph_snapdir_d_revalidate(struct dentry *dentry, -					  struct nameidata *nd) +					  unsigned int flags)  {  	/*  	 * Eventually, we'll want to revalidate snapped metadata @@ -1053,7 +1100,30 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,  	return 1;  } +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. 
+ * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ +	dout("ceph_d_prune %p\n", dentry); + +	/* do we have a valid parent? */ +	if (IS_ROOT(dentry)) +		return; +	/* if we are not hashed, we don't affect dir's completeness */ +	if (d_unhashed(dentry)) +		return; + +	/* +	 * we hold d_lock, so d_parent is stable, and d_fsdata is never +	 * cleared until d_release +	 */ +	ceph_dir_clear_complete(dentry->d_parent->d_inode); +}  /*   * read() on a dir.  This weird interface hack only works if mounted @@ -1063,19 +1133,20 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,  			     loff_t *ppos)  {  	struct ceph_file_info *cf = file->private_data; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	int left; +	const int bufsize = 1024;  	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))  		return -EISDIR;  	if (!cf->dir_info) { -		cf->dir_info = kmalloc(1024, GFP_NOFS); +		cf->dir_info = kmalloc(bufsize, GFP_NOFS);  		if (!cf->dir_info)  			return -ENOMEM;  		cf->dir_info_len = -			sprintf(cf->dir_info, +			snprintf(cf->dir_info, bufsize,  				"entries:   %20lld\n"  				" files:    %20lld\n"  				" subdirs:  %20lld\n" @@ -1109,9 +1180,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,   * an fsync() on a dir will wait for any uncommitted directory   * operations to commit.   
*/ -static int ceph_dir_fsync(struct file *file, int datasync) +static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, +			  int datasync)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct list_head *head = &ci->i_unsafe_dirops;  	struct ceph_mds_request *req; @@ -1119,6 +1191,11 @@ static int ceph_dir_fsync(struct file *file, int datasync)  	int ret = 0;  	dout("dir_fsync %p\n", inode); +	ret = filemap_write_and_wait_range(inode->i_mapping, start, end); +	if (ret) +		return ret; +	mutex_lock(&inode->i_mutex); +  	spin_lock(&ci->i_unsafe_lock);  	if (list_empty(head))  		goto out; @@ -1130,6 +1207,7 @@ static int ceph_dir_fsync(struct file *file, int datasync)  	do {  		ceph_mdsc_get_request(req);  		spin_unlock(&ci->i_unsafe_lock); +  		dout("dir_fsync %p wait on tid %llu (until %llu)\n",  		     inode, req->r_tid, last_tid);  		if (req->r_timeout) { @@ -1142,9 +1220,9 @@ static int ceph_dir_fsync(struct file *file, int datasync)  		} else {  			wait_for_completion(&req->r_safe_completion);  		} -		spin_lock(&ci->i_unsafe_lock);  		ceph_mdsc_put_request(req); +		spin_lock(&ci->i_unsafe_lock);  		if (ret || list_empty(head))  			break;  		req = list_entry(head->next, @@ -1152,6 +1230,8 @@ static int ceph_dir_fsync(struct file *file, int datasync)  	} while (req->r_tid < last_tid);  out:  	spin_unlock(&ci->i_unsafe_lock); +	mutex_unlock(&inode->i_mutex); +  	return ret;  } @@ -1167,13 +1247,11 @@ void ceph_dentry_lru_add(struct dentry *dn)  	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,  	     dn->d_name.len, dn->d_name.name); -	if (di) { -		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; -		spin_lock(&mdsc->dentry_lru_lock); -		list_add_tail(&di->lru, &mdsc->dentry_lru); -		mdsc->num_dentry++; -		spin_unlock(&mdsc->dentry_lru_lock); -	} +	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; +	spin_lock(&mdsc->dentry_lru_lock); +	list_add_tail(&di->lru, 
&mdsc->dentry_lru); +	mdsc->num_dentry++; +	spin_unlock(&mdsc->dentry_lru_lock);  }  void ceph_dentry_lru_touch(struct dentry *dn) @@ -1183,12 +1261,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)  	dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,  	     dn->d_name.len, dn->d_name.name, di->offset); -	if (di) { -		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; -		spin_lock(&mdsc->dentry_lru_lock); -		list_move_tail(&di->lru, &mdsc->dentry_lru); -		spin_unlock(&mdsc->dentry_lru_lock); -	} +	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; +	spin_lock(&mdsc->dentry_lru_lock); +	list_move_tail(&di->lru, &mdsc->dentry_lru); +	spin_unlock(&mdsc->dentry_lru_lock);  }  void ceph_dentry_lru_del(struct dentry *dn) @@ -1198,18 +1274,35 @@ void ceph_dentry_lru_del(struct dentry *dn)  	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,  	     dn->d_name.len, dn->d_name.name); -	if (di) { -		mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; -		spin_lock(&mdsc->dentry_lru_lock); -		list_del_init(&di->lru); -		mdsc->num_dentry--; -		spin_unlock(&mdsc->dentry_lru_lock); +	mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; +	spin_lock(&mdsc->dentry_lru_lock); +	list_del_init(&di->lru); +	mdsc->num_dentry--; +	spin_unlock(&mdsc->dentry_lru_lock); +} + +/* + * Return name hash for a given dentry.  This is dependent on + * the parent directory's hash function. 
+ */ +unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) +{ +	struct ceph_inode_info *dci = ceph_inode(dir); + +	switch (dci->i_dir_layout.dl_dir_hash) { +	case 0:	/* for backward compat */ +	case CEPH_STR_HASH_LINUX: +		return dn->d_name.hash; + +	default: +		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, +				     dn->d_name.name, dn->d_name.len);  	}  }  const struct file_operations ceph_dir_fops = {  	.read = ceph_read_dir, -	.readdir = ceph_readdir, +	.iterate = ceph_readdir,  	.llseek = ceph_dir_llseek,  	.open = ceph_open,  	.release = ceph_release, @@ -1226,6 +1319,8 @@ const struct inode_operations ceph_dir_iops = {  	.getxattr = ceph_getxattr,  	.listxattr = ceph_listxattr,  	.removexattr = ceph_removexattr, +	.get_acl = ceph_get_acl, +	.set_acl = ceph_set_acl,  	.mknod = ceph_mknod,  	.symlink = ceph_symlink,  	.mkdir = ceph_mkdir, @@ -1234,18 +1329,21 @@ const struct inode_operations ceph_dir_iops = {  	.rmdir = ceph_unlink,  	.rename = ceph_rename,  	.create = ceph_create, +	.atomic_open = ceph_atomic_open,  };  const struct dentry_operations ceph_dentry_ops = {  	.d_revalidate = ceph_d_revalidate, -	.d_release = ceph_dentry_release, +	.d_release = ceph_d_release, +	.d_prune = ceph_d_prune,  };  const struct dentry_operations ceph_snapdir_dentry_ops = {  	.d_revalidate = ceph_snapdir_d_revalidate, -	.d_release = ceph_dentry_release, +	.d_release = ceph_d_release,  };  const struct dentry_operations ceph_snap_dentry_ops = { -	.d_release = ceph_dentry_release, +	.d_release = ceph_d_release, +	.d_prune = ceph_d_prune,  }; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 2297d942699..8d7d782f438 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -8,23 +8,6 @@  #include "mds_client.h"  /* - * NFS export support - * - * NFS re-export of a ceph mount is, at present, only semireliable. - * The basic issue is that the Ceph architectures doesn't lend itself - * well to generating filehandles that will remain valid forever. 
- * - * So, we do our best.  If you're lucky, your inode will be in the - * client's cache.  If it's not, and you have a connectable fh, then - * the MDS server may be able to find it for you.  Otherwise, you get - * ESTALE. - * - * There are ways to this more reliable, but in the non-connectable fh - * case, we won't every work perfectly, and in the connectable case, - * some changes are needed on the MDS side to work better. - */ - -/*   * Basic fh   */  struct ceph_nfs_fh { @@ -32,22 +15,18 @@ struct ceph_nfs_fh {  } __attribute__ ((packed));  /* - * Larger 'connectable' fh that includes parent ino and name hash. - * Use this whenever possible, as it works more reliably. + * Larger fh that includes parent ino.   */  struct ceph_nfs_confh {  	u64 ino, parent_ino; -	u32 parent_name_hash;  } __attribute__ ((packed)); -static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, -			  int connectable) +static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, +			  struct inode *parent_inode)  {  	int type;  	struct ceph_nfs_fh *fh = (void *)rawfh;  	struct ceph_nfs_confh *cfh = (void *)rawfh; -	struct dentry *parent = dentry->d_parent; -	struct inode *inode = dentry->d_inode;  	int connected_handle_length = sizeof(*cfh)/4;  	int handle_length = sizeof(*fh)/4; @@ -55,176 +34,217 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,  	if (ceph_snap(inode) != CEPH_NOSNAP)  		return -EINVAL; -	if (*max_len >= connected_handle_length) { -		dout("encode_fh %p connectable\n", dentry); -		cfh->ino = ceph_ino(dentry->d_inode); -		cfh->parent_ino = ceph_ino(parent->d_inode); -		cfh->parent_name_hash = parent->d_name.hash; +	if (parent_inode && (*max_len < connected_handle_length)) {  		*max_len = connected_handle_length; -		type = 2; -	} else if (*max_len >= handle_length) { -		if (connectable) { -			*max_len = connected_handle_length; -			return 255; -		} -		dout("encode_fh %p\n", dentry); -		fh->ino = 
ceph_ino(dentry->d_inode); +		return FILEID_INVALID; +	} else if (*max_len < handle_length) {  		*max_len = handle_length; -		type = 1; +		return FILEID_INVALID; +	} + +	if (parent_inode) { +		dout("encode_fh %llx with parent %llx\n", +		     ceph_ino(inode), ceph_ino(parent_inode)); +		cfh->ino = ceph_ino(inode); +		cfh->parent_ino = ceph_ino(parent_inode); +		*max_len = connected_handle_length; +		type = FILEID_INO32_GEN_PARENT;  	} else { +		dout("encode_fh %llx\n", ceph_ino(inode)); +		fh->ino = ceph_ino(inode);  		*max_len = handle_length; -		return 255; +		type = FILEID_INO32_GEN;  	}  	return type;  } -/* - * convert regular fh to dentry - * - * FIXME: we should try harder by querying the mds for the ino. - */ -static struct dentry *__fh_to_dentry(struct super_block *sb, -				     struct ceph_nfs_fh *fh) +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)  { +	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;  	struct inode *inode;  	struct dentry *dentry;  	struct ceph_vino vino;  	int err; -	dout("__fh_to_dentry %llx\n", fh->ino); -	vino.ino = fh->ino; +	vino.ino = ino;  	vino.snap = CEPH_NOSNAP;  	inode = ceph_find_inode(sb, vino); -	if (!inode) -		return ERR_PTR(-ESTALE); +	if (!inode) { +		struct ceph_mds_request *req; + +		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, +					       USE_ANY_MDS); +		if (IS_ERR(req)) +			return ERR_CAST(req); + +		req->r_ino1 = vino; +		req->r_num_caps = 1; +		err = ceph_mdsc_do_request(mdsc, NULL, req); +		inode = req->r_target_inode; +		if (inode) +			ihold(inode); +		ceph_mdsc_put_request(req); +		if (!inode) +			return ERR_PTR(-ESTALE); +	}  	dentry = d_obtain_alias(inode);  	if (IS_ERR(dentry)) { -		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", -		       fh->ino, inode);  		iput(inode);  		return dentry;  	}  	err = ceph_init_dentry(dentry); -  	if (err < 0) { -		iput(inode); +		dput(dentry);  		return ERR_PTR(err);  	} -	dout("__fh_to_dentry %llx %p dentry %p\n", 
fh->ino, inode, dentry); +	dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);  	return dentry;  }  /* - * convert connectable fh to dentry + * convert regular fh to dentry   */ -static struct dentry *__cfh_to_dentry(struct super_block *sb, -				      struct ceph_nfs_confh *cfh) +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, +					struct fid *fid, +					int fh_len, int fh_type) +{ +	struct ceph_nfs_fh *fh = (void *)fid->raw; + +	if (fh_type != FILEID_INO32_GEN  && +	    fh_type != FILEID_INO32_GEN_PARENT) +		return NULL; +	if (fh_len < sizeof(*fh) / 4) +		return NULL; + +	dout("fh_to_dentry %llx\n", fh->ino); +	return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, +				   struct dentry *child, u64 ino)  {  	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; +	struct ceph_mds_request *req;  	struct inode *inode;  	struct dentry *dentry; -	struct ceph_vino vino;  	int err; -	dout("__cfh_to_dentry %llx (%llx/%x)\n", -	     cfh->ino, cfh->parent_ino, cfh->parent_name_hash); - -	vino.ino = cfh->ino; -	vino.snap = CEPH_NOSNAP; -	inode = ceph_find_inode(sb, vino); -	if (!inode) { -		struct ceph_mds_request *req; - -		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, -					       USE_ANY_MDS); -		if (IS_ERR(req)) -			return ERR_CAST(req); +	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, +				       USE_ANY_MDS); +	if (IS_ERR(req)) +		return ERR_CAST(req); -		req->r_ino1 = vino; -		req->r_ino2.ino = cfh->parent_ino; -		req->r_ino2.snap = CEPH_NOSNAP; -		req->r_path2 = kmalloc(16, GFP_NOFS); -		snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); -		req->r_num_caps = 1; -		err = ceph_mdsc_do_request(mdsc, NULL, req); -		ceph_mdsc_put_request(req); -		inode = ceph_find_inode(sb, vino); -		if (!inode) -			return ERR_PTR(err ? 
err : -ESTALE); +	if (child) { +		req->r_inode = child->d_inode; +		ihold(child->d_inode); +	} else { +		req->r_ino1 = (struct ceph_vino) { +			.ino = ino, +			.snap = CEPH_NOSNAP, +		};  	} +	req->r_num_caps = 1; +	err = ceph_mdsc_do_request(mdsc, NULL, req); +	inode = req->r_target_inode; +	if (inode) +		ihold(inode); +	ceph_mdsc_put_request(req); +	if (!inode) +		return ERR_PTR(-ENOENT);  	dentry = d_obtain_alias(inode);  	if (IS_ERR(dentry)) { -		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", -		       cfh->ino, inode);  		iput(inode);  		return dentry;  	}  	err = ceph_init_dentry(dentry);  	if (err < 0) { -		iput(inode); +		dput(dentry);  		return ERR_PTR(err);  	} -	dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); +	dout("__get_parent ino %llx parent %p ino %llx.%llx\n", +	     child ? ceph_ino(child->d_inode) : ino, +	     dentry, ceph_vinop(inode));  	return dentry;  } -static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, -					int fh_len, int fh_type) +static struct dentry *ceph_get_parent(struct dentry *child)  { -	if (fh_type == 1) -		return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); -	else -		return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); +	/* don't re-export snaps */ +	if (ceph_snap(child->d_inode) != CEPH_NOSNAP) +		return ERR_PTR(-EINVAL); + +	dout("get_parent %p ino %llx.%llx\n", +	     child, ceph_vinop(child->d_inode)); +	return __get_parent(child->d_sb, child, 0);  }  /* - * get parent, if possible. - * - * FIXME: we could do better by querying the mds to discover the - * parent. 
+ * convert regular fh to parent   */  static struct dentry *ceph_fh_to_parent(struct super_block *sb, -					 struct fid *fid, +					struct fid *fid,  					int fh_len, int fh_type)  {  	struct ceph_nfs_confh *cfh = (void *)fid->raw; -	struct ceph_vino vino; -	struct inode *inode;  	struct dentry *dentry; -	int err; -	if (fh_type == 1) -		return ERR_PTR(-ESTALE); +	if (fh_type != FILEID_INO32_GEN_PARENT) +		return NULL; +	if (fh_len < sizeof(*cfh) / 4) +		return NULL; -	pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, -		 cfh->parent_name_hash); +	dout("fh_to_parent %llx\n", cfh->parent_ino); +	dentry = __get_parent(sb, NULL, cfh->ino); +	if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) +		dentry = __fh_to_dentry(sb, cfh->parent_ino); +	return dentry; +} -	vino.ino = cfh->ino; -	vino.snap = CEPH_NOSNAP; -	inode = ceph_find_inode(sb, vino); -	if (!inode) -		return ERR_PTR(-ESTALE); +static int ceph_get_name(struct dentry *parent, char *name, +			 struct dentry *child) +{ +	struct ceph_mds_client *mdsc; +	struct ceph_mds_request *req; +	int err; -	dentry = d_obtain_alias(inode); -	if (IS_ERR(dentry)) { -		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", -		       cfh->ino, inode); -		iput(inode); -		return dentry; -	} -	err = ceph_init_dentry(dentry); -	if (err < 0) { -		iput(inode); -		return ERR_PTR(err); +	mdsc = ceph_inode_to_client(child->d_inode)->mdsc; +	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, +				       USE_ANY_MDS); +	if (IS_ERR(req)) +		return PTR_ERR(req); + +	mutex_lock(&parent->d_inode->i_mutex); + +	req->r_inode = child->d_inode; +	ihold(child->d_inode); +	req->r_ino2 = ceph_vino(parent->d_inode); +	req->r_locked_dir = parent->d_inode; +	req->r_num_caps = 2; +	err = ceph_mdsc_do_request(mdsc, NULL, req); + +	mutex_unlock(&parent->d_inode->i_mutex); + +	if (!err) { +		struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; +		memcpy(name, rinfo->dname, rinfo->dname_len); +		name[rinfo->dname_len] = 0; +		dout("get_name 
%p ino %llx.%llx name %s\n", +		     child, ceph_vinop(child->d_inode), name); +	} else { +		dout("get_name %p ino %llx.%llx err %d\n", +		     child, ceph_vinop(child->d_inode), err);  	} -	dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); -	return dentry; + +	ceph_mdsc_put_request(req); +	return err;  }  const struct export_operations ceph_export_ops = {  	.encode_fh = ceph_encode_fh,  	.fh_to_dentry = ceph_fh_to_dentry,  	.fh_to_parent = ceph_fh_to_parent, +	.get_parent = ceph_get_parent, +	.get_name = ceph_get_name,  }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e77c28cf369..302085100c2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -4,11 +4,15 @@  #include <linux/sched.h>  #include <linux/slab.h>  #include <linux/file.h> +#include <linux/mount.h>  #include <linux/namei.h>  #include <linux/writeback.h> +#include <linux/aio.h> +#include <linux/falloc.h>  #include "super.h"  #include "mds_client.h" +#include "cache.h"  /*   * Ceph file operations @@ -54,7 +58,6 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)  	req->r_fmode = ceph_flags_to_mode(flags);  	req->r_args.open.flags = cpu_to_le32(flags);  	req->r_args.open.mode = cpu_to_le32(create_mode); -	req->r_args.open.preferred = cpu_to_le32(-1);  out:  	return req;  } @@ -67,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)  {  	struct ceph_file_info *cf;  	int ret = 0; +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); +	struct ceph_mds_client *mdsc = fsc->mdsc;  	switch (inode->i_mode & S_IFMT) {  	case S_IFREG: +		/* First file open request creates the cookie, we want to keep +		 * this cookie around for the filetime of the inode as not to +		 * have to worry about fscache register / revoke / operation +		 * races. +		 * +		 * Also, if we know the operation is going to invalidate data +		 * (non readonly) just nuke the cache right away. 
+		 */ +		ceph_fscache_register_inode_cookie(mdsc->fsc, ci); +		if ((fmode & CEPH_FILE_MODE_WR)) +			ceph_fscache_invalidate(inode);  	case S_IFDIR:  		dout("init_file %p %p 0%o (regular)\n", inode, file,  		     inode->i_mode); @@ -107,9 +124,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)  }  /* - * If the filp already has private_data, that means the file was - * already opened by intent during lookup, and we do nothing. - *   * If we already have the requisite capabilities, we can satisfy   * the open request locally (no need to request new caps from the   * MDS).  We do, however, need to inform the MDS (asynchronously) @@ -122,7 +136,7 @@ int ceph_open(struct inode *inode, struct file *file)  	struct ceph_mds_client *mdsc = fsc->mdsc;  	struct ceph_mds_request *req;  	struct ceph_file_info *cf = file->private_data; -	struct inode *parent_inode = file->f_dentry->d_parent->d_inode; +	struct inode *parent_inode = NULL;  	int err;  	int flags, fmode, wanted; @@ -147,18 +161,20 @@ int ceph_open(struct inode *inode, struct file *file)  	/* trivially open snapdir */  	if (ceph_snap(inode) == CEPH_SNAPDIR) { -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		__ceph_get_fmode(ci, fmode); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		return ceph_init_file(inode, file, fmode);  	}  	/* -	 * No need to block if we have any caps.  Update wanted set +	 * No need to block if we have caps on the auth MDS (for +	 * write) or any MDS (for read).  Update wanted set  	 * asynchronously.  	 
*/ -	spin_lock(&inode->i_lock); -	if (__ceph_is_any_real_caps(ci)) { +	spin_lock(&ci->i_ceph_lock); +	if (__ceph_is_any_real_caps(ci) && +	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {  		int mds_wanted = __ceph_caps_mds_wanted(ci);  		int issued = __ceph_caps_issued(ci, NULL); @@ -166,7 +182,7 @@ int ceph_open(struct inode *inode, struct file *file)  		     inode, fmode, ceph_cap_string(wanted),  		     ceph_cap_string(issued));  		__ceph_get_fmode(ci, fmode); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		/* adjust wanted? */  		if ((issued & wanted) != wanted && @@ -178,10 +194,11 @@ int ceph_open(struct inode *inode, struct file *file)  	} else if (ceph_snap(inode) != CEPH_NOSNAP &&  		   (ci->i_snap_caps & wanted) == wanted) {  		__ceph_get_fmode(ci, fmode); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		return ceph_init_file(inode, file, fmode);  	} -	spin_unlock(&inode->i_lock); + +	spin_unlock(&ci->i_ceph_lock);  	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));  	req = prepare_open_request(inode->i_sb, flags, 0); @@ -189,9 +206,14 @@ int ceph_open(struct inode *inode, struct file *file)  		err = PTR_ERR(req);  		goto out;  	} -	req->r_inode = igrab(inode); +	req->r_inode = inode; +	ihold(inode); +  	req->r_num_caps = 1; +	if (flags & O_CREAT) +		parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);  	err = ceph_mdsc_do_request(mdsc, parent_inode, req); +	iput(parent_inode);  	if (!err)  		err = ceph_init_file(inode, file, req->r_fmode);  	ceph_mdsc_put_request(req); @@ -202,36 +224,34 @@ out:  /* - * Do a lookup + open with a single request. - * - * If this succeeds, but some subsequent check in the vfs - * may_open() fails, the struct *file gets cleaned up (i.e. - * ceph_release gets called).  So fear not! - */ -/* - * flags - *  path_lookup_open   -> LOOKUP_OPEN - *  path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE + * Do a lookup + open with a single request.  
If we get a non-existent + * file or symlink, return 1 so the VFS can retry.   */ -struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, -				struct nameidata *nd, int mode, -				int locked_dir) +int ceph_atomic_open(struct inode *dir, struct dentry *dentry, +		     struct file *file, unsigned flags, umode_t mode, +		     int *opened)  {  	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);  	struct ceph_mds_client *mdsc = fsc->mdsc; -	struct file *file = nd->intent.open.file; -	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);  	struct ceph_mds_request *req; +	struct dentry *dn;  	int err; -	int flags = nd->intent.open.flags - 1;  /* silly vfs! */ -	dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", -	     dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); +	dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", +	     dir, dentry, dentry->d_name.len, dentry->d_name.name, +	     d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); + +	if (dentry->d_name.len > NAME_MAX) +		return -ENAMETOOLONG; + +	err = ceph_init_dentry(dentry); +	if (err < 0) +		return err;  	/* do the open */  	req = prepare_open_request(dir->i_sb, flags, mode);  	if (IS_ERR(req)) -		return ERR_CAST(req); +		return PTR_ERR(req);  	req->r_dentry = dget(dentry);  	req->r_num_caps = 2;  	if (flags & O_CREAT) { @@ -239,16 +259,44 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,  		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;  	}  	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */ -	err = ceph_mdsc_do_request(mdsc, parent_inode, req); -	dentry = ceph_finish_lookup(req, dentry, err); -	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) +	err = ceph_mdsc_do_request(mdsc, +				   (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, +				   req); +	if (err) +		goto out_err; + +	err = ceph_handle_snapdir(req, dentry, err); +	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)  		err = ceph_handle_notrace_create(dir, dentry); -	if (!err) -		err = ceph_init_file(req->r_dentry->d_inode, file, -				     req->r_fmode); + +	if (d_unhashed(dentry)) { +		dn = ceph_finish_lookup(req, dentry, err); +		if (IS_ERR(dn)) +			err = PTR_ERR(dn); +	} else { +		/* we were given a hashed negative dentry */ +		dn = NULL; +	} +	if (err) +		goto out_err; +	if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { +		/* make vfs retry on splice, ENOENT, or symlink */ +		dout("atomic_open finish_no_open on dn %p\n", dn); +		err = finish_no_open(file, dn); +	} else { +		dout("atomic_open finish_open on dn %p\n", dn); +		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { +			ceph_init_acl(dentry, dentry->d_inode, dir); +			*opened |= FILE_CREATED; +		} +		err = finish_open(file, dentry, ceph_open, opened); +	} +out_err: +	if (!req->r_err && req->r_target_inode) +		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);  	ceph_mdsc_put_request(req); -	dout("ceph_lookup_open result=%p\n", dentry); -	return dentry; +	dout("atomic_open result=%d\n", err); +	return err;  }  int ceph_release(struct inode *inode, struct file *file) @@ -280,13 +328,14 @@ int ceph_release(struct inode *inode, struct file *file)  static int striped_read(struct inode *inode,  			u64 off, u64 len,  			struct page **pages, int num_pages, -			int *checkeof) +			int *checkeof, bool o_direct, +			unsigned long buf_align)  {  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);  	struct ceph_inode_info *ci = ceph_inode(inode); -	u64 pos, this_len; -	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ -	int left, pages_left; +	u64 pos, this_len, left; +	int io_align, page_align; +	int pages_left;  	int read;  	struct page **page_pos;  	int ret; @@ 
-300,58 +349,57 @@ static int striped_read(struct inode *inode,  	page_pos = pages;  	pages_left = num_pages;  	read = 0; +	io_align = off & ~PAGE_MASK;  more: +	if (o_direct) +		page_align = (pos - io_align + buf_align) & ~PAGE_MASK; +	else +		page_align = pos & ~PAGE_MASK;  	this_len = left;  	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),  				  &ci->i_layout, pos, &this_len,  				  ci->i_truncate_seq,  				  ci->i_truncate_size, -				  page_pos, pages_left); -	hit_stripe = this_len < left; -	was_short = ret >= 0 && ret < this_len; +				  page_pos, pages_left, page_align);  	if (ret == -ENOENT)  		ret = 0; -	dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, +	hit_stripe = this_len < left; +	was_short = ret >= 0 && ret < this_len; +	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,  	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); -	if (ret > 0) { -		int didpages = -			((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; - -		if (read < pos - off) { -			dout(" zero gap %llu to %llu\n", off + read, pos); -			ceph_zero_page_vector_range(page_off + read, -						    pos - off - read, pages); +	if (ret >= 0) { +		int didpages; +		if (was_short && (pos + ret < inode->i_size)) { +			u64 tmp = min(this_len - ret, +					inode->i_size - pos - ret); +			dout(" zero gap %llu to %llu\n", +				pos + ret, pos + ret + tmp); +			ceph_zero_page_vector_range(page_align + read + ret, +							tmp, pages); +			ret += tmp;  		} + +		didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;  		pos += ret;  		read = pos - off;  		left -= ret;  		page_pos += didpages;  		pages_left -= didpages; -		/* hit stripe? */ -		if (left && hit_stripe) +		/* hit stripe and need continue*/ +		if (left && hit_stripe && pos < inode->i_size)  			goto more;  	} -	if (was_short) { -		/* was original extent fully inside i_size? 
*/ -		if (pos + left <= inode->i_size) { -			dout("zero tail\n"); -			ceph_zero_page_vector_range(page_off + read, len - read, -						    pages); -			read = len; -			goto out; -		} - -		/* check i_size */ -		*checkeof = 1; +	if (read > 0) { +		ret = read; +		/* did we bounce off eof? */ +		if (pos + left > inode->i_size) +			*checkeof = 1;  	} -out: -	if (ret >= 0) -		ret = read;  	dout("striped_read returns %d\n", ret);  	return ret;  } @@ -362,213 +410,361 @@ out:   *   * If the read spans object boundary, just do multiple reads.   */ -static ssize_t ceph_sync_read(struct file *file, char __user *data, -			      unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, +				int *checkeof)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct file *file = iocb->ki_filp; +	struct inode *inode = file_inode(file);  	struct page **pages; -	u64 off = *poff; -	int num_pages = calc_pages_for(off, len); -	int ret; +	u64 off = iocb->ki_pos; +	int num_pages, ret; +	size_t len = iov_iter_count(i); -	dout("sync_read on file %p %llu~%u %s\n", file, off, len, +	dout("sync_read on file %p %llu~%u %s\n", file, off, +	     (unsigned)len,  	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); +	/* +	 * flush any page cache pages in this range.  this +	 * will make concurrent normal and sync io slow, +	 * but it will at least behave sensibly when they are +	 * in sequence. +	 */ +	ret = filemap_write_and_wait_range(inode->i_mapping, off, +						off + len); +	if (ret < 0) +		return ret;  	if (file->f_flags & O_DIRECT) { -		pages = ceph_get_direct_page_vector(data, num_pages, off, len); +		while (iov_iter_count(i)) { +			size_t start; +			ssize_t n; -		/* -		 * flush any page cache pages in this range.  this -		 * will make concurrent normal and O_DIRECT io slow, -		 * but it will at least behave sensibly when they are -		 * in sequence. 
-		 */ -	} else { -		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); -	} -	if (IS_ERR(pages)) -		return PTR_ERR(pages); +			n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start); +			if (n < 0) +				return n; -	ret = filemap_write_and_wait(inode->i_mapping); -	if (ret < 0) -		goto done; +			num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; -	ret = striped_read(inode, off, len, pages, num_pages, checkeof); +			ret = striped_read(inode, off, n, +					   pages, num_pages, checkeof, +					   1, start); -	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) -		ret = ceph_copy_page_vector_to_user(pages, data, off, ret); -	if (ret >= 0) -		*poff = off + ret; +			ceph_put_page_vector(pages, num_pages, true); -done: -	if (file->f_flags & O_DIRECT) -		ceph_put_page_vector(pages, num_pages); -	else +			if (ret <= 0) +				break; +			off += ret; +			iov_iter_advance(i, ret); +			if (ret < n) +				break; +		} +	} else { +		num_pages = calc_pages_for(off, len); +		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); +		if (IS_ERR(pages)) +			return PTR_ERR(pages); +		ret = striped_read(inode, off, len, pages, +					num_pages, checkeof, 0, 0); +		if (ret > 0) { +			int l, k = 0; +			size_t left = ret; + +			while (left) { +				int copy = min_t(size_t, PAGE_SIZE, left); +				l = copy_page_to_iter(pages[k++], 0, copy, i); +				off += l; +				left -= l; +				if (l < copy) +					break; +			} +		}  		ceph_release_page_vector(pages, num_pages); +	} + +	if (off > iocb->ki_pos) { +		ret = off - iocb->ki_pos; +		iocb->ki_pos = off; +	} +  	dout("sync_read result %d\n", ret);  	return ret;  }  /* - * Write commit callback, called if we requested both an ACK and - * ONDISK commit reply from the OSD. + * Write commit request unsafe callback, called to tell us when a + * request is unsafe (that is, in flight--has been handed to the + * messenger to send to its target osd).  
It is called again when + * we've received a response message indicating the request is + * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request + * is completed early (and unsuccessfully) due to a timeout or + * interrupt. + * + * This is used if we requested both an ACK and ONDISK commit reply + * from the OSD.   */ -static void sync_write_commit(struct ceph_osd_request *req, -			      struct ceph_msg *msg) +static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)  {  	struct ceph_inode_info *ci = ceph_inode(req->r_inode); -	dout("sync_write_commit %p tid %llu\n", req, req->r_tid); -	spin_lock(&ci->i_unsafe_lock); -	list_del_init(&req->r_unsafe_item); -	spin_unlock(&ci->i_unsafe_lock); -	ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); +	dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, +		unsafe ? "un" : ""); +	if (unsafe) { +		ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); +		spin_lock(&ci->i_unsafe_lock); +		list_add_tail(&req->r_unsafe_item, +			      &ci->i_unsafe_writes); +		spin_unlock(&ci->i_unsafe_lock); +	} else { +		spin_lock(&ci->i_unsafe_lock); +		list_del_init(&req->r_unsafe_item); +		spin_unlock(&ci->i_unsafe_lock); +		ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); +	}  } +  /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages.   *   * If write spans object boundary, just do multiple writes.  (For a   * correct atomic write, we should e.g. take write locks on all   * objects, rollback on failure, etc.)   
*/ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, -			       size_t left, loff_t *offset) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct file *file = iocb->ki_filp; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_fs_client *fsc = ceph_inode_to_client(inode); +	struct ceph_snap_context *snapc; +	struct ceph_vino vino;  	struct ceph_osd_request *req;  	struct page **pages;  	int num_pages; -	long long unsigned pos; -	u64 len;  	int written = 0;  	int flags; -	int do_sync = 0;  	int check_caps = 0;  	int ret;  	struct timespec mtime = CURRENT_TIME; +	loff_t pos = iocb->ki_pos; +	size_t count = iov_iter_count(from); -	if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) +	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)  		return -EROFS; -	dout("sync_write on file %p %lld~%u %s\n", file, *offset, -	     (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - -	if (file->f_flags & O_APPEND) -		pos = i_size_read(inode); -	else -		pos = *offset; +	dout("sync_direct_write on file %p %lld~%u\n", file, pos, +	     (unsigned)count); -	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); +	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);  	if (ret < 0)  		return ret;  	ret = invalidate_inode_pages2_range(inode->i_mapping,  					    pos >> PAGE_CACHE_SHIFT, -					    (pos + left) >> PAGE_CACHE_SHIFT); +					    (pos + count) >> PAGE_CACHE_SHIFT);  	if (ret < 0)  		dout("invalidate_inode_pages2_range returned %d\n", ret);  	flags = CEPH_OSD_FLAG_ORDERSNAP |  		CEPH_OSD_FLAG_ONDISK |  		CEPH_OSD_FLAG_WRITE; -	if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) -		flags |= CEPH_OSD_FLAG_ACK; -	else -		do_sync = 1; -	/* -	 * we may need to do multiple writes here if we span an object -	 * boundary.  this isn't atomic, unfortunately.  
:( -	 */ -more: -	len = left; -	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, -				    ceph_vino(inode), pos, &len, -				    CEPH_OSD_OP_WRITE, flags, -				    ci->i_snap_realm->cached_context, -				    do_sync, -				    ci->i_truncate_seq, ci->i_truncate_size, -				    &mtime, false, 2); -	if (!req) -		return -ENOMEM; - -	num_pages = calc_pages_for(pos, len); +	while (iov_iter_count(from) > 0) { +		u64 len = iov_iter_single_seg_count(from); +		size_t start; +		ssize_t n; + +		snapc = ci->i_snap_realm->cached_context; +		vino = ceph_vino(inode); +		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, +					    vino, pos, &len, +					    2,/*include a 'startsync' command*/ +					    CEPH_OSD_OP_WRITE, flags, snapc, +					    ci->i_truncate_seq, +					    ci->i_truncate_size, +					    false); +		if (IS_ERR(req)) { +			ret = PTR_ERR(req); +			break; +		} -	if (file->f_flags & O_DIRECT) { -		pages = ceph_get_direct_page_vector(data, num_pages, pos, len); -		if (IS_ERR(pages)) { -			ret = PTR_ERR(pages); -			goto out; +		n = iov_iter_get_pages_alloc(from, &pages, len, &start); +		if (unlikely(n < 0)) { +			ret = n; +			ceph_osdc_put_request(req); +			break;  		} +		num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;  		/*  		 * throw out any page cache pages in this range. this  		 * may block.  		 
*/  		truncate_inode_pages_range(inode->i_mapping, pos, -					   (pos+len) | (PAGE_CACHE_SIZE-1)); -	} else { +				   (pos+n) | (PAGE_CACHE_SIZE-1)); +		osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, +						false, false); + +		/* BUG_ON(vino.snap != CEPH_NOSNAP); */ +		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + +		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); +		if (!ret) +			ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + +		ceph_put_page_vector(pages, num_pages, false); + +		ceph_osdc_put_request(req); +		if (ret) +			break; +		pos += n; +		written += n; +		iov_iter_advance(from, n); + +		if (pos > i_size_read(inode)) { +			check_caps = ceph_inode_set_size(inode, pos); +			if (check_caps) +				ceph_check_caps(ceph_inode(inode), +						CHECK_CAPS_AUTHONLY, +						NULL); +		} +	} + +	if (ret != -EOLDSNAPC && written > 0) { +		iocb->ki_pos = pos; +		ret = written; +	} +	return ret; +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes.  (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) 
+ */ +static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) +{ +	struct file *file = iocb->ki_filp; +	struct inode *inode = file_inode(file); +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_fs_client *fsc = ceph_inode_to_client(inode); +	struct ceph_snap_context *snapc; +	struct ceph_vino vino; +	struct ceph_osd_request *req; +	struct page **pages; +	u64 len; +	int num_pages; +	int written = 0; +	int flags; +	int check_caps = 0; +	int ret; +	struct timespec mtime = CURRENT_TIME; +	loff_t pos = iocb->ki_pos; +	size_t count = iov_iter_count(from); + +	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) +		return -EROFS; + +	dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + +	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); +	if (ret < 0) +		return ret; + +	ret = invalidate_inode_pages2_range(inode->i_mapping, +					    pos >> PAGE_CACHE_SHIFT, +					    (pos + count) >> PAGE_CACHE_SHIFT); +	if (ret < 0) +		dout("invalidate_inode_pages2_range returned %d\n", ret); + +	flags = CEPH_OSD_FLAG_ORDERSNAP | +		CEPH_OSD_FLAG_ONDISK | +		CEPH_OSD_FLAG_WRITE | +		CEPH_OSD_FLAG_ACK; + +	while ((len = iov_iter_count(from)) > 0) { +		size_t left; +		int n; + +		snapc = ci->i_snap_realm->cached_context; +		vino = ceph_vino(inode); +		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, +					    vino, pos, &len, 1, +					    CEPH_OSD_OP_WRITE, flags, snapc, +					    ci->i_truncate_seq, +					    ci->i_truncate_size, +					    false); +		if (IS_ERR(req)) { +			ret = PTR_ERR(req); +			break; +		} + +		/* +		 * write from beginning of first page, +		 * regardless of io alignment +		 */ +		num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +  		pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);  		if (IS_ERR(pages)) {  			ret = PTR_ERR(pages);  			goto out;  		} -		ret = ceph_copy_user_to_page_vector(pages, data, pos, len); + +		left = len; +		for (n = 0; n < num_pages; n++) { +			size_t 
plen = min_t(size_t, left, PAGE_SIZE); +			ret = copy_page_from_iter(pages[n], 0, plen, from); +			if (ret != plen) { +				ret = -EFAULT; +				break; +			} +			left -= ret; +		} +  		if (ret < 0) {  			ceph_release_page_vector(pages, num_pages);  			goto out;  		} -		if ((file->f_flags & O_SYNC) == 0) { -			/* get a second commit callback */ -			req->r_safe_callback = sync_write_commit; -			req->r_own_pages = 1; -		} -	} -	req->r_pages = pages; -	req->r_num_pages = num_pages; -	req->r_inode = inode; +		/* get a second commit callback */ +		req->r_unsafe_callback = ceph_sync_write_unsafe; +		req->r_inode = inode; -	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); -	if (!ret) { -		if (req->r_safe_callback) { -			/* -			 * Add to inode unsafe list only after we -			 * start_request so that a tid has been assigned. -			 */ -			spin_lock(&ci->i_unsafe_lock); -			list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); -			spin_unlock(&ci->i_unsafe_lock); -			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); -		} -		ret = ceph_osdc_wait_request(&fsc->client->osdc, req); -	} +		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, +						false, true); -	if (file->f_flags & O_DIRECT) -		ceph_put_page_vector(pages, num_pages); -	else if (file->f_flags & O_SYNC) -		ceph_release_page_vector(pages, num_pages); +		/* BUG_ON(vino.snap != CEPH_NOSNAP); */ +		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + +		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); +		if (!ret) +			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);  out: -	ceph_osdc_put_request(req); -	if (ret == 0) { -		pos += len; -		written += len; -		left -= len; -		if (left) -			goto more; +		ceph_osdc_put_request(req); +		if (ret == 0) { +			pos += len; +			written += len; + +			if (pos > i_size_read(inode)) { +				check_caps = ceph_inode_set_size(inode, pos); +				if (check_caps) +					ceph_check_caps(ceph_inode(inode), +							CHECK_CAPS_AUTHONLY, +							NULL); +			} +		} else 
+			break; +	} +	if (ret != -EOLDSNAPC && written > 0) {  		ret = written; -		*offset = pos; -		if (pos > i_size_read(inode)) -			check_caps = ceph_inode_set_size(inode, pos); -		if (check_caps) -			ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, -					NULL); +		iocb->ki_pos = pos;  	}  	return ret;  } @@ -580,61 +776,69 @@ out:   *   * Hmm, the sync read case isn't actually async... should it be?   */ -static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, -			     unsigned long nr_segs, loff_t pos) +static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)  {  	struct file *filp = iocb->ki_filp;  	struct ceph_file_info *fi = filp->private_data; -	loff_t *ppos = &iocb->ki_pos; -	size_t len = iov->iov_len; -	struct inode *inode = filp->f_dentry->d_inode; +	size_t len = iocb->ki_nbytes; +	struct inode *inode = file_inode(filp);  	struct ceph_inode_info *ci = ceph_inode(inode); -	void __user *base = iov->iov_base;  	ssize_t ret;  	int want, got = 0;  	int checkeof = 0, read = 0; -	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", -	     inode, ceph_vinop(inode), pos, (unsigned)len, inode);  again: -	__ceph_do_pending_vmtruncate(inode); +	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", +	     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); +  	if (fi->fmode & CEPH_FILE_MODE_LAZY)  		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;  	else  		want = CEPH_CAP_FILE_CACHE;  	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);  	if (ret < 0) -		goto out; -	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", -	     inode, ceph_vinop(inode), pos, (unsigned)len, -	     ceph_cap_string(got)); +		return ret;  	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||  	    (iocb->ki_filp->f_flags & O_DIRECT) || -	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) +	    (fi->flags & CEPH_F_SYNC)) { + +		dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", +		     
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, +		     ceph_cap_string(got)); +  		/* hmm, this isn't really async... */ -		ret = ceph_sync_read(filp, base, len, ppos, &checkeof); -	else -		ret = generic_file_aio_read(iocb, iov, nr_segs, pos); +		ret = ceph_sync_read(iocb, to, &checkeof); +	} else { +		dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", +		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, +		     ceph_cap_string(got)); -out: +		ret = generic_file_read_iter(iocb, to); +	}  	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",  	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);  	ceph_put_cap_refs(ci, got);  	if (checkeof && ret >= 0) { -		int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); +		int statret = ceph_do_getattr(inode, +					      CEPH_STAT_CAP_SIZE);  		/* hit EOF or hole? */ -		if (statret == 0 && *ppos < inode->i_size) { -			dout("aio_read sync_read hit hole, reading more\n"); +		if (statret == 0 && iocb->ki_pos < inode->i_size && +			ret < len) { +			dout("sync_read hit hole, ppos %lld < size %lld" +			     ", reading more\n", iocb->ki_pos, +			     inode->i_size); + +			iov_iter_advance(to, ret);  			read += ret; -			base += ret;  			len -= ret;  			checkeof = 0;  			goto again;  		}  	} +  	if (ret >= 0)  		ret += read; @@ -651,95 +855,150 @@ out:   *   * If we are near ENOSPC, write synchronously.   
*/ -static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, -		       unsigned long nr_segs, loff_t pos) +static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)  {  	struct file *file = iocb->ki_filp;  	struct ceph_file_info *fi = file->private_data; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_osd_client *osdc =  		&ceph_sb_to_client(inode->i_sb)->client->osdc; -	loff_t endoff = pos + iov->iov_len; -	int want, got = 0; -	int ret, err; +	ssize_t count = iov_iter_count(from), written = 0; +	int err, want, got; +	loff_t pos = iocb->ki_pos;  	if (ceph_snap(inode) != CEPH_NOSNAP)  		return -EROFS; +	mutex_lock(&inode->i_mutex); + +	/* We can write back this queue in page reclaim */ +	current->backing_dev_info = file->f_mapping->backing_dev_info; + +	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); +	if (err) +		goto out; + +	if (count == 0) +		goto out; +	iov_iter_truncate(from, count); + +	err = file_remove_suid(file); +	if (err) +		goto out; + +	err = file_update_time(file); +	if (err) +		goto out; +  retry_snap: -	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) -		return -ENOSPC; -	__ceph_do_pending_vmtruncate(inode); -	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", -	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, -	     inode->i_size); +	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { +		err = -ENOSPC; +		goto out; +	} + +	dout("aio_write %p %llx.%llx %llu~%zd getting caps. 
i_size %llu\n", +	     inode, ceph_vinop(inode), pos, count, inode->i_size);  	if (fi->fmode & CEPH_FILE_MODE_LAZY)  		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;  	else  		want = CEPH_CAP_FILE_BUFFER; -	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); -	if (ret < 0) +	got = 0; +	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); +	if (err < 0)  		goto out; -	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n", -	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, -	     ceph_cap_string(got)); +	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", +	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));  	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || -	    (iocb->ki_filp->f_flags & O_DIRECT) || -	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { -		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, -			&iocb->ki_pos); -	} else { -		ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - -		if ((ret >= 0 || ret == -EIOCBQUEUED) && -		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) -		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { -			err = vfs_fsync_range(file, pos, pos + ret - 1, 1); -			if (err < 0) -				ret = err; +	    (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { +		struct iov_iter data; +		mutex_unlock(&inode->i_mutex); +		/* we might need to revert back to that point */ +		data = *from; +		if (file->f_flags & O_DIRECT) +			written = ceph_sync_direct_write(iocb, &data); +		else +			written = ceph_sync_write(iocb, &data); +		if (written == -EOLDSNAPC) { +			dout("aio_write %p %llx.%llx %llu~%u" +				"got EOLDSNAPC, retrying\n", +				inode, ceph_vinop(inode), +				pos, (unsigned)count); +			mutex_lock(&inode->i_mutex); +			goto retry_snap;  		} +		if (written > 0) +			iov_iter_advance(from, written); +	} else { +		loff_t old_size = inode->i_size; +		/* +		 * No need to acquire the i_truncate_mutex. 
Because +		 * the MDS revokes Fwb caps before sending truncate +		 * message to us. We can't get Fwb cap while there +		 * are pending vmtruncate. So write and vmtruncate +		 * can not run at the same time +		 */ +		written = generic_perform_write(file, from, pos); +		if (likely(written >= 0)) +			iocb->ki_pos = pos + written; +		if (inode->i_size > old_size) +			ceph_fscache_update_objectsize(inode); +		mutex_unlock(&inode->i_mutex);  	} -	if (ret >= 0) { -		spin_lock(&inode->i_lock); -		__ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); -		spin_unlock(&inode->i_lock); + +	if (written >= 0) { +		int dirty; +		spin_lock(&ci->i_ceph_lock); +		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); +		spin_unlock(&ci->i_ceph_lock); +		if (dirty) +			__mark_inode_dirty(inode, dirty);  	} -out:  	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n", -	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, +	     inode, ceph_vinop(inode), pos, (unsigned)count,  	     ceph_cap_string(got));  	ceph_put_cap_refs(ci, got); -	if (ret == -EOLDSNAPC) { -		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", -		     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); -		goto retry_snap; +	if (written >= 0 && +	    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || +	     ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { +		err = vfs_fsync_range(file, pos, pos + written - 1, 1); +		if (err < 0) +			written = err;  	} -	return ret; +	goto out_unlocked; + +out: +	mutex_unlock(&inode->i_mutex); +out_unlocked: +	current->backing_dev_info = NULL; +	return written ? written : err;  }  /*   * llseek.  be sure to verify file size on SEEK_END.   
*/ -static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) +static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)  {  	struct inode *inode = file->f_mapping->host;  	int ret;  	mutex_lock(&inode->i_mutex); -	__ceph_do_pending_vmtruncate(inode); -	switch (origin) { -	case SEEK_END: + +	if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {  		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);  		if (ret < 0) {  			offset = ret;  			goto out;  		} +	} + +	switch (whence) { +	case SEEK_END:  		offset += inode->i_size;  		break;  	case SEEK_CUR: @@ -755,39 +1014,239 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)  		}  		offset += file->f_pos;  		break; +	case SEEK_DATA: +		if (offset >= inode->i_size) { +			ret = -ENXIO; +			goto out; +		} +		break; +	case SEEK_HOLE: +		if (offset >= inode->i_size) { +			ret = -ENXIO; +			goto out; +		} +		offset = inode->i_size; +		break;  	} -	if (offset < 0 || offset > inode->i_sb->s_maxbytes) { -		offset = -EINVAL; +	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + +out: +	mutex_unlock(&inode->i_mutex); +	return offset; +} + +static inline void ceph_zero_partial_page( +	struct inode *inode, loff_t offset, unsigned size) +{ +	struct page *page; +	pgoff_t index = offset >> PAGE_CACHE_SHIFT; + +	page = find_lock_page(inode->i_mapping, index); +	if (page) { +		wait_on_page_writeback(page); +		zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); +		unlock_page(page); +		page_cache_release(page); +	} +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, +				      loff_t length) +{ +	loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); +	if (offset < nearly) { +		loff_t size = nearly - offset; +		if (length < size) +			size = length; +		ceph_zero_partial_page(inode, offset, size); +		offset += size; +		length -= size; +	} +	if (length >= PAGE_CACHE_SIZE) { +		loff_t size = round_down(length, PAGE_CACHE_SIZE); +		
truncate_pagecache_range(inode, offset, offset + size - 1); +		offset += size; +		length -= size; +	} +	if (length) +		ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, +				    loff_t offset, loff_t *length) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_fs_client *fsc = ceph_inode_to_client(inode); +	struct ceph_osd_request *req; +	int ret = 0; +	loff_t zero = 0; +	int op; + +	if (!length) { +		op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; +		length = &zero; +	} else { +		op = CEPH_OSD_OP_ZERO; +	} + +	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, +					ceph_vino(inode), +					offset, length, +					1, op, +					CEPH_OSD_FLAG_WRITE | +					CEPH_OSD_FLAG_ONDISK, +					NULL, 0, 0, false); +	if (IS_ERR(req)) { +		ret = PTR_ERR(req);  		goto out;  	} -	/* Special lock needed here? */ -	if (offset != file->f_pos) { -		file->f_pos = offset; -		file->f_version = 0; +	ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, +				&inode->i_mtime); + +	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); +	if (!ret) { +		ret = ceph_osdc_wait_request(&fsc->client->osdc, req); +		if (ret == -ENOENT) +			ret = 0;  	} +	ceph_osdc_put_request(req);  out: +	return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ +	int ret = 0; +	struct ceph_inode_info *ci = ceph_inode(inode); +	s32 stripe_unit = ceph_file_layout_su(ci->i_layout); +	s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); +	s32 object_size = ceph_file_layout_object_size(ci->i_layout); +	u64 object_set_size = object_size * stripe_count; +	u64 nearly, t; + +	/* round offset up to next period boundary */ +	nearly = offset + object_set_size - 1; +	t = nearly; +	nearly -= do_div(t, object_set_size); + +	while (length && offset < nearly) { +		loff_t size = length; +		ret = ceph_zero_partial_object(inode, offset, &size); +		if (ret < 0) +			
return ret; +		offset += size; +		length -= size; +	} +	while (length >= object_set_size) { +		int i; +		loff_t pos = offset; +		for (i = 0; i < stripe_count; ++i) { +			ret = ceph_zero_partial_object(inode, pos, NULL); +			if (ret < 0) +				return ret; +			pos += stripe_unit; +		} +		offset += object_set_size; +		length -= object_set_size; +	} +	while (length) { +		loff_t size = length; +		ret = ceph_zero_partial_object(inode, offset, &size); +		if (ret < 0) +			return ret; +		offset += size; +		length -= size; +	} +	return ret; +} + +static long ceph_fallocate(struct file *file, int mode, +				loff_t offset, loff_t length) +{ +	struct ceph_file_info *fi = file->private_data; +	struct inode *inode = file_inode(file); +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_osd_client *osdc = +		&ceph_inode_to_client(inode)->client->osdc; +	int want, got = 0; +	int dirty; +	int ret = 0; +	loff_t endoff = 0; +	loff_t size; + +	if (!S_ISREG(inode->i_mode)) +		return -EOPNOTSUPP; + +	mutex_lock(&inode->i_mutex); + +	if (ceph_snap(inode) != CEPH_NOSNAP) { +		ret = -EROFS; +		goto unlock; +	} + +	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && +		!(mode & FALLOC_FL_PUNCH_HOLE)) { +		ret = -ENOSPC; +		goto unlock; +	} + +	size = i_size_read(inode); +	if (!(mode & FALLOC_FL_KEEP_SIZE)) +		endoff = offset + length; + +	if (fi->fmode & CEPH_FILE_MODE_LAZY) +		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; +	else +		want = CEPH_CAP_FILE_BUFFER; + +	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); +	if (ret < 0) +		goto unlock; + +	if (mode & FALLOC_FL_PUNCH_HOLE) { +		if (offset < size) +			ceph_zero_pagecache_range(inode, offset, length); +		ret = ceph_zero_objects(inode, offset, length); +	} else if (endoff > size) { +		truncate_pagecache_range(inode, size, -1); +		if (ceph_inode_set_size(inode, endoff)) +			ceph_check_caps(ceph_inode(inode), +				CHECK_CAPS_AUTHONLY, NULL); +	} + +	if (!ret) { +		spin_lock(&ci->i_ceph_lock); +		dirty = 
__ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); +		spin_unlock(&ci->i_ceph_lock); +		if (dirty) +			__mark_inode_dirty(inode, dirty); +	} + +	ceph_put_cap_refs(ci, got); +unlock:  	mutex_unlock(&inode->i_mutex); -	return offset; +	return ret;  }  const struct file_operations ceph_file_fops = {  	.open = ceph_open,  	.release = ceph_release,  	.llseek = ceph_llseek, -	.read = do_sync_read, -	.write = do_sync_write, -	.aio_read = ceph_aio_read, -	.aio_write = ceph_aio_write, +	.read = new_sync_read, +	.write = new_sync_write, +	.read_iter = ceph_read_iter, +	.write_iter = ceph_write_iter,  	.mmap = ceph_mmap,  	.fsync = ceph_fsync,  	.lock = ceph_lock,  	.flock = ceph_flock,  	.splice_read = generic_file_splice_read, -	.splice_write = generic_file_splice_write, +	.splice_write = iter_file_splice_write,  	.unlocked_ioctl = ceph_ioctl,  	.compat_ioctl	= ceph_ioctl, +	.fallocate	= ceph_fallocate,  }; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1d6a45b5a04..04c89c266ce 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2,7 +2,6 @@  #include <linux/module.h>  #include <linux/fs.h> -#include <linux/smp_lock.h>  #include <linux/slab.h>  #include <linux/string.h>  #include <linux/uaccess.h> @@ -10,10 +9,12 @@  #include <linux/namei.h>  #include <linux/writeback.h>  #include <linux/vmalloc.h> -#include <linux/pagevec.h> +#include <linux/posix_acl.h> +#include <linux/random.h>  #include "super.h"  #include "mds_client.h" +#include "cache.h"  #include <linux/ceph/decode.h>  /* @@ -37,6 +38,13 @@ static void ceph_vmtruncate_work(struct work_struct *work);  /*   * find or create an inode, given the ceph ino number   */ +static int ceph_set_ino_cb(struct inode *inode, void *data) +{ +	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; +	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); +	return 0; +} +  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)  {  	struct inode *inode; @@ -89,6 +97,8 @@ const struct inode_operations 
ceph_file_iops = {  	.getxattr = ceph_getxattr,  	.listxattr = ceph_listxattr,  	.removexattr = ceph_removexattr, +	.get_acl = ceph_get_acl, +	.set_acl = ceph_set_acl,  }; @@ -170,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)   * specified, copy the frag delegation info to the caller if   * it is present.   */ -u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, -		     struct ceph_inode_frag *pfrag, -		     int *found) +static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, +			      struct ceph_inode_frag *pfrag, int *found)  {  	u32 t = ceph_frag_make(0, 0);  	struct ceph_inode_frag *frag; @@ -182,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,  	if (found)  		*found = 0; -	mutex_lock(&ci->i_fragtree_mutex);  	while (1) {  		WARN_ON(!ceph_frag_contains_value(t, v));  		frag = __ceph_find_frag(ci, t); @@ -211,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,  	}  	dout("choose_frag(%x) = %x\n", v, t); -	mutex_unlock(&ci->i_fragtree_mutex);  	return t;  } +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, +		     struct ceph_inode_frag *pfrag, int *found) +{ +	u32 ret; +	mutex_lock(&ci->i_fragtree_mutex); +	ret = __ceph_choose_frag(ci, v, pfrag, found); +	mutex_unlock(&ci->i_fragtree_mutex); +	return ret; +} +  /*   * Process dirfrag (delegation) info from the mds.  Include leaf   * fragment in tree ONLY if ndist > 0.  Otherwise, only @@ -228,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode,  	u32 id = le32_to_cpu(dirinfo->frag);  	int mds = le32_to_cpu(dirinfo->auth);  	int ndist = le32_to_cpu(dirinfo->ndist); +	int diri_auth = -1;  	int i;  	int err = 0; +	spin_lock(&ci->i_ceph_lock); +	if (ci->i_auth_cap) +		diri_auth = ci->i_auth_cap->mds; +	spin_unlock(&ci->i_ceph_lock); +  	mutex_lock(&ci->i_fragtree_mutex); -	if (ndist == 0) { +	if (ndist == 0 && mds == diri_auth) {  		/* no delegation info needed. 
*/  		frag = __ceph_find_frag(ci, id);  		if (!frag) @@ -277,6 +300,75 @@ out:  	return err;  } +static int ceph_fill_fragtree(struct inode *inode, +			      struct ceph_frag_tree_head *fragtree, +			      struct ceph_mds_reply_dirfrag *dirinfo) +{ +	struct ceph_inode_info *ci = ceph_inode(inode); +	struct ceph_inode_frag *frag; +	struct rb_node *rb_node; +	int i; +	u32 id, nsplits; +	bool update = false; + +	mutex_lock(&ci->i_fragtree_mutex); +	nsplits = le32_to_cpu(fragtree->nsplits); +	if (nsplits) { +		i = prandom_u32() % nsplits; +		id = le32_to_cpu(fragtree->splits[i].frag); +		if (!__ceph_find_frag(ci, id)) +			update = true; +	} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { +		rb_node = rb_first(&ci->i_fragtree); +		frag = rb_entry(rb_node, struct ceph_inode_frag, node); +		if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) +			update = true; +	} +	if (!update && dirinfo) { +		id = le32_to_cpu(dirinfo->frag); +		if (id != __ceph_choose_frag(ci, id, NULL, NULL)) +			update = true; +	} +	if (!update) +		goto out_unlock; + +	dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); +	rb_node = rb_first(&ci->i_fragtree); +	for (i = 0; i < nsplits; i++) { +		id = le32_to_cpu(fragtree->splits[i].frag); +		frag = NULL; +		while (rb_node) { +			frag = rb_entry(rb_node, struct ceph_inode_frag, node); +			if (ceph_frag_compare(frag->frag, id) >= 0) { +				if (frag->frag != id) +					frag = NULL; +				else +					rb_node = rb_next(rb_node); +				break; +			} +			rb_node = rb_next(rb_node); +			rb_erase(&frag->node, &ci->i_fragtree); +			kfree(frag); +			frag = NULL; +		} +		if (!frag) { +			frag = __get_or_create_frag(ci, id); +			if (IS_ERR(frag)) +				continue; +		} +		frag->split_by = le32_to_cpu(fragtree->splits[i].by); +		dout(" frag %x split by %d\n", frag->frag, frag->split_by); +	} +	while (rb_node) { +		frag = rb_entry(rb_node, struct ceph_inode_frag, node); +		rb_node = rb_next(rb_node); +		rb_erase(&frag->node, &ci->i_fragtree); +		kfree(frag); +	} 
+out_unlock: +	mutex_unlock(&ci->i_fragtree_mutex); +	return 0; +}  /*   * initialize a newly allocated inode. @@ -292,12 +384,17 @@ struct inode *ceph_alloc_inode(struct super_block *sb)  	dout("alloc_inode %p\n", &ci->vfs_inode); +	spin_lock_init(&ci->i_ceph_lock); +  	ci->i_version = 0;  	ci->i_time_warp_seq = 0;  	ci->i_ceph_flags = 0; -	ci->i_release_count = 0; +	atomic_set(&ci->i_release_count, 1); +	atomic_set(&ci->i_complete_count, 0);  	ci->i_symlink = NULL; +	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); +  	ci->i_fragtree = RB_ROOT;  	mutex_init(&ci->i_fragtree_mutex); @@ -324,9 +421,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)  	ci->i_hold_caps_min = 0;  	ci->i_hold_caps_max = 0;  	INIT_LIST_HEAD(&ci->i_cap_delay_list); -	ci->i_cap_exporting_mds = 0; -	ci->i_cap_exporting_mseq = 0; -	ci->i_cap_exporting_issued = 0;  	INIT_LIST_HEAD(&ci->i_cap_snaps);  	ci->i_head_snapc = NULL;  	ci->i_snap_caps = 0; @@ -334,6 +428,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)  	for (i = 0; i < CEPH_FILE_MODE_NUM; i++)  		ci->i_nr_by_mode[i] = 0; +	mutex_init(&ci->i_truncate_mutex);  	ci->i_truncate_seq = 0;  	ci->i_truncate_size = 0;  	ci->i_truncate_pending = 0; @@ -347,6 +442,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)  	ci->i_rd_ref = 0;  	ci->i_rdcache_ref = 0;  	ci->i_wr_ref = 0; +	ci->i_wb_ref = 0;  	ci->i_wrbuffer_ref = 0;  	ci->i_wrbuffer_ref_head = 0;  	ci->i_shared_gen = 0; @@ -366,9 +462,19 @@ struct inode *ceph_alloc_inode(struct super_block *sb)  	INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); +	ceph_fscache_inode_init(ci); +  	return &ci->vfs_inode;  } +static void ceph_i_callback(struct rcu_head *head) +{ +	struct inode *inode = container_of(head, struct inode, i_rcu); +	struct ceph_inode_info *ci = ceph_inode(inode); + +	kmem_cache_free(ceph_inode_cachep, ci); +} +  void ceph_destroy_inode(struct inode *inode)  {  	struct ceph_inode_info *ci = ceph_inode(inode); @@ -377,11 +483,13 @@ void 
ceph_destroy_inode(struct inode *inode)  	dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); +	ceph_fscache_unregister_inode_cookie(ci); +  	ceph_queue_caps_release(inode);  	/*  	 * we may still have a snap_realm reference if there are stray -	 * caps in i_cap_exporting_issued or i_snap_caps. +	 * caps in i_snap_caps.  	 */  	if (ci->i_snap_realm) {  		struct ceph_mds_client *mdsc = @@ -408,9 +516,18 @@ void ceph_destroy_inode(struct inode *inode)  	if (ci->i_xattrs.prealloc_blob)  		ceph_buffer_put(ci->i_xattrs.prealloc_blob); -	kmem_cache_free(ceph_inode_cachep, ci); +	call_rcu(&inode->i_rcu, ceph_i_callback);  } +int ceph_drop_inode(struct inode *inode) +{ +	/* +	 * Positve dentry and corresponding inode are always accompanied +	 * in MDS reply. So no need to keep inode in the cache after +	 * dropping all its aliases. +	 */ +	return 1; +}  /*   * Helpers to fill in size, ctime, mtime, and atime.  We have to be @@ -436,16 +553,20 @@ int ceph_fill_file_size(struct inode *inode, int issued,  			dout("truncate_seq %u -> %u\n",  			     ci->i_truncate_seq, truncate_seq);  			ci->i_truncate_seq = truncate_seq; + +			/* the MDS should have revoked these caps */ +			WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | +					       CEPH_CAP_FILE_RD | +					       CEPH_CAP_FILE_WR | +					       CEPH_CAP_FILE_LAZYIO));  			/*  			 * If we hold relevant caps, or in the case where we're  			 * not the only client referencing this file and we  			 * don't hold those caps, then we need to check whether  			 * the file is either opened or mmaped  			 */ -			if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| -				       CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| -				       CEPH_CAP_FILE_EXCL| -				       CEPH_CAP_FILE_LAZYIO)) || +			if ((issued & (CEPH_CAP_FILE_CACHE| +				       CEPH_CAP_FILE_BUFFER)) ||  			    mapping_mapped(inode->i_mapping) ||  			    __ceph_caps_file_wanted(ci)) {  				ci->i_truncate_pending++; @@ -459,6 +580,10 @@ int ceph_fill_file_size(struct 
inode *inode, int issued,  		     truncate_size);  		ci->i_truncate_size = truncate_size;  	} + +	if (queue_trunc) +		ceph_fscache_invalidate(inode); +  	return queue_trunc;  } @@ -471,7 +596,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,  	if (issued & (CEPH_CAP_FILE_EXCL|  		      CEPH_CAP_FILE_WR| -		      CEPH_CAP_FILE_BUFFER)) { +		      CEPH_CAP_FILE_BUFFER| +		      CEPH_CAP_AUTH_EXCL| +		      CEPH_CAP_XATTR_EXCL)) {  		if (timespec_compare(ctime, &inode->i_ctime) > 0) {  			dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",  			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, @@ -511,7 +638,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,  			warn = 1;  		}  	} else { -		/* we have no write caps; whatever the MDS says is true */ +		/* we have no write|excl caps; whatever the MDS says is true */  		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {  			inode->i_ctime = *ctime;  			inode->i_mtime = *mtime; @@ -537,20 +664,26 @@ static int fill_inode(struct inode *inode,  		      unsigned long ttl_from, int cap_fmode,  		      struct ceph_cap_reservation *caps_reservation)  { +	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;  	struct ceph_mds_reply_inode *info = iinfo->in;  	struct ceph_inode_info *ci = ceph_inode(inode); -	int i; -	int issued, implemented; +	int issued = 0, implemented, new_issued;  	struct timespec mtime, atime, ctime; -	u32 nsplits;  	struct ceph_buffer *xattr_blob = NULL; +	struct ceph_cap *new_cap = NULL;  	int err = 0; -	int queue_trunc = 0; +	bool wake = false; +	bool queue_trunc = false; +	bool new_version = false;  	dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",  	     inode, ceph_vinop(inode), le64_to_cpu(info->version),  	     ci->i_version); +	/* prealloc new cap struct */ +	if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) +		new_cap = ceph_get_cap(mdsc, caps_reservation); +  	/*  	 * prealloc xattr data, if it looks like we'll need it.  
only  	 * if len > 4 (meaning there are actually xattrs; the first 4 @@ -563,52 +696,73 @@ static int fill_inode(struct inode *inode,  			       iinfo->xattr_len);  	} -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	/*  	 * provided version will be odd if inode value is projected, -	 * even if stable.  skip the update if we have a newer info -	 * (e.g., due to inode info racing form multiple MDSs), or if -	 * we are getting projected (unstable) inode info. +	 * even if stable.  skip the update if we have newer stable +	 * info (ours>=theirs, e.g. due to racing mds replies), unless +	 * we are getting projected (unstable) info (in which case the +	 * version is odd, and we want ours>theirs). +	 *   us   them +	 *   2    2     skip +	 *   3    2     skip +	 *   3    3     update  	 */ -	if (le64_to_cpu(info->version) > 0 && -	    (ci->i_version & ~1) > le64_to_cpu(info->version)) -		goto no_change; +	if (ci->i_version == 0 || +	    ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && +	     le64_to_cpu(info->version) > (ci->i_version & ~1))) +		new_version = true;  	issued = __ceph_caps_issued(ci, &implemented);  	issued |= implemented | __ceph_caps_dirty(ci); +	new_issued = ~issued & le32_to_cpu(info->cap.caps);  	/* update inode */  	ci->i_version = le64_to_cpu(info->version);  	inode->i_version++;  	inode->i_rdev = le32_to_cpu(info->rdev); +	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; -	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { +	if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && +	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {  		inode->i_mode = le32_to_cpu(info->mode); -		inode->i_uid = le32_to_cpu(info->uid); -		inode->i_gid = le32_to_cpu(info->gid); +		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); +		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));  		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, -		     inode->i_uid, inode->i_gid); +		     from_kuid(&init_user_ns, inode->i_uid), 
+		     from_kgid(&init_user_ns, inode->i_gid));  	} -	if ((issued & CEPH_CAP_LINK_EXCL) == 0) -		inode->i_nlink = le32_to_cpu(info->nlink); - -	/* be careful with mtime, atime, size */ -	ceph_decode_timespec(&atime, &info->atime); -	ceph_decode_timespec(&mtime, &info->mtime); -	ceph_decode_timespec(&ctime, &info->ctime); -	queue_trunc = ceph_fill_file_size(inode, issued, -					  le32_to_cpu(info->truncate_seq), -					  le64_to_cpu(info->truncate_size), -					  le64_to_cpu(info->size)); -	ceph_fill_file_time(inode, issued, -			    le32_to_cpu(info->time_warp_seq), -			    &ctime, &mtime, &atime); - -	ci->i_max_size = le64_to_cpu(info->max_size); -	ci->i_layout = info->layout; -	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; +	if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && +	    (issued & CEPH_CAP_LINK_EXCL) == 0) +		set_nlink(inode, le32_to_cpu(info->nlink)); + +	if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { +		/* be careful with mtime, atime, size */ +		ceph_decode_timespec(&atime, &info->atime); +		ceph_decode_timespec(&mtime, &info->mtime); +		ceph_decode_timespec(&ctime, &info->ctime); +		ceph_fill_file_time(inode, issued, +				le32_to_cpu(info->time_warp_seq), +				&ctime, &mtime, &atime); +	} + +	if (new_version || +	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { +		ci->i_layout = info->layout; +		queue_trunc = ceph_fill_file_size(inode, issued, +					le32_to_cpu(info->truncate_seq), +					le64_to_cpu(info->truncate_size), +					le64_to_cpu(info->size)); +		/* only update max_size on auth cap */ +		if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && +		    ci->i_max_size != le64_to_cpu(info->max_size)) { +			dout("max_size %lld -> %llu\n", ci->i_max_size, +					le64_to_cpu(info->max_size)); +			ci->i_max_size = le64_to_cpu(info->max_size); +		} +	}  	/* xattrs */  	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. 
*/ @@ -621,6 +775,7 @@ static int fill_inode(struct inode *inode,  			memcpy(ci->i_xattrs.blob->vec.iov_base,  			       iinfo->xattr_data, iinfo->xattr_len);  		ci->i_xattrs.version = le64_to_cpu(info->xattr_version); +		ceph_forget_all_cached_acls(inode);  		xattr_blob = NULL;  	} @@ -643,20 +798,21 @@ static int fill_inode(struct inode *inode,  	case S_IFLNK:  		inode->i_op = &ceph_symlink_iops;  		if (!ci->i_symlink) { -			int symlen = iinfo->symlink_len; +			u32 symlen = iinfo->symlink_len;  			char *sym; -			BUG_ON(symlen != inode->i_size); -			spin_unlock(&inode->i_lock); +			spin_unlock(&ci->i_ceph_lock); + +			err = -EINVAL; +			if (WARN_ON(symlen != inode->i_size)) +				goto out;  			err = -ENOMEM; -			sym = kmalloc(symlen+1, GFP_NOFS); +			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);  			if (!sym)  				goto out; -			memcpy(sym, iinfo->symlink, symlen); -			sym[symlen] = 0; -			spin_lock(&inode->i_lock); +			spin_lock(&ci->i_ceph_lock);  			if (!ci->i_symlink)  				ci->i_symlink = sym;  			else @@ -667,54 +823,30 @@ static int fill_inode(struct inode *inode,  		inode->i_op = &ceph_dir_iops;  		inode->i_fop = &ceph_dir_fops; +		ci->i_dir_layout = iinfo->dir_layout; +  		ci->i_files = le64_to_cpu(info->files);  		ci->i_subdirs = le64_to_cpu(info->subdirs);  		ci->i_rbytes = le64_to_cpu(info->rbytes);  		ci->i_rfiles = le64_to_cpu(info->rfiles);  		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);  		ceph_decode_timespec(&ci->i_rctime, &info->rctime); - -		/* set dir completion flag? */ -		if (ci->i_files == 0 && ci->i_subdirs == 0 && -		    ceph_snap(inode) == CEPH_NOSNAP && -		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && -		    (issued & CEPH_CAP_FILE_EXCL) == 0 && -		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { -			dout(" marking %p complete (empty)\n", inode); -			ci->i_ceph_flags |= CEPH_I_COMPLETE; -			ci->i_max_offset = 2; -		} - -		/* it may be better to set st_size in getattr instead? 
*/ -		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) -			inode->i_size = ci->i_rbytes;  		break;  	default:  		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",  		       ceph_vinop(inode), inode->i_mode);  	} -no_change: -	spin_unlock(&inode->i_lock); - -	/* queue truncate if we saw i_size decrease */ -	if (queue_trunc) -		ceph_queue_vmtruncate(inode); - -	/* populate frag tree */ -	/* FIXME: move me up, if/when version reflects fragtree changes */ -	nsplits = le32_to_cpu(info->fragtree.nsplits); -	mutex_lock(&ci->i_fragtree_mutex); -	for (i = 0; i < nsplits; i++) { -		u32 id = le32_to_cpu(info->fragtree.splits[i].frag); -		struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); - -		if (IS_ERR(frag)) -			continue; -		frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); -		dout(" frag %x split by %d\n", frag->frag, frag->split_by); +	/* set dir completion flag? */ +	if (S_ISDIR(inode->i_mode) && +	    ci->i_files == 0 && ci->i_subdirs == 0 && +	    ceph_snap(inode) == CEPH_NOSNAP && +	    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && +	    (issued & CEPH_CAP_FILE_EXCL) == 0 && +	    !__ceph_dir_is_complete(ci)) { +		dout(" marking %p complete (empty)\n", inode); +		__ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));  	} -	mutex_unlock(&ci->i_fragtree_mutex);  	/* were we issued a capability? 
*/  	if (info->cap.caps) { @@ -727,30 +859,41 @@ no_change:  				     le32_to_cpu(info->cap.seq),  				     le32_to_cpu(info->cap.mseq),  				     le64_to_cpu(info->cap.realm), -				     info->cap.flags, -				     caps_reservation); +				     info->cap.flags, &new_cap); +			wake = true;  		} else { -			spin_lock(&inode->i_lock);  			dout(" %p got snap_caps %s\n", inode,  			     ceph_cap_string(le32_to_cpu(info->cap.caps)));  			ci->i_snap_caps |= le32_to_cpu(info->cap.caps);  			if (cap_fmode >= 0)  				__ceph_get_fmode(ci, cap_fmode); -			spin_unlock(&inode->i_lock);  		}  	} else if (cap_fmode >= 0) { -		pr_warning("mds issued no caps on %llx.%llx\n", +		pr_warn("mds issued no caps on %llx.%llx\n",  			   ceph_vinop(inode));  		__ceph_get_fmode(ci, cap_fmode);  	} +	spin_unlock(&ci->i_ceph_lock); + +	if (wake) +		wake_up_all(&ci->i_cap_wq); + +	/* queue truncate if we saw i_size decrease */ +	if (queue_trunc) +		ceph_queue_vmtruncate(inode); + +	/* populate frag tree */ +	if (S_ISDIR(inode->i_mode)) +		ceph_fill_fragtree(inode, &info->fragtree, dirinfo);  	/* update delegation info? 
*/  	if (dirinfo)  		ceph_fill_dirfrag(inode, dirinfo);  	err = 0; -  out: +	if (new_cap) +		ceph_put_cap(mdsc, new_cap);  	if (xattr_blob)  		ceph_buffer_put(xattr_blob);  	return err; @@ -775,14 +918,14 @@ static void update_dentry_lease(struct dentry *dentry,  		return;  	spin_lock(&dentry->d_lock); -	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", -	     dentry, le16_to_cpu(lease->mask), duration, ttl); +	dout("update_dentry_lease %p duration %lu ms ttl %lu\n", +	     dentry, duration, ttl);  	/* make lease_rdcache_gen match directory */  	dir = dentry->d_parent->d_inode;  	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; -	if (lease->mask == 0) +	if (duration == 0)  		goto out_unlock;  	if (di->lease_gen == session->s_cap_gen && @@ -807,37 +950,6 @@ out_unlock:  }  /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ -	struct dentry *dir = dn->d_parent; -	struct inode *inode = dn->d_parent->d_inode; -	struct ceph_dentry_info *di; - -	BUG_ON(!inode); - -	di = ceph_dentry(dn); - -	spin_lock(&inode->i_lock); -	if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { -		spin_unlock(&inode->i_lock); -		return; -	} -	di->offset = ceph_inode(inode)->i_max_offset++; -	spin_unlock(&inode->i_lock); - -	spin_lock(&dcache_lock); -	spin_lock(&dn->d_lock); -	list_move(&dn->d_u.d_child, &dir->d_subdirs); -	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, -	     dn->d_u.d_child.prev, dn->d_u.d_child.next); -	spin_unlock(&dn->d_lock); -	spin_unlock(&dcache_lock); -} - -/*   * splice a dentry to an inode.   * caller must hold directory i_mutex for this to be safe.   * @@ -846,7 +958,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)   * the caller) if we fail.   
*/  static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, -				    bool *prehash, bool set_offset) +				    bool *prehash)  {  	struct dentry *realdn; @@ -866,8 +978,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,  	} else if (realdn) {  		dout("dn %p (%d) spliced with %p (%d) "  		     "inode %p ino %llx.%llx\n", -		     dn, atomic_read(&dn->d_count), -		     realdn, atomic_read(&realdn->d_count), +		     dn, d_count(dn), +		     realdn, d_count(realdn),  		     realdn->d_inode, ceph_vinop(realdn->d_inode));  		dput(dn);  		dn = realdn; @@ -878,8 +990,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,  	}  	if ((!prehash || *prehash) && d_unhashed(dn))  		d_rehash(dn); -	if (set_offset) -		ceph_set_dentry_offset(dn);  out:  	return dn;  } @@ -900,10 +1010,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  {  	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;  	struct inode *in = NULL; -	struct ceph_mds_reply_inode *ininfo;  	struct ceph_vino vino;  	struct ceph_fs_client *fsc = ceph_sb_to_client(sb); -	int i = 0;  	int err = 0;  	dout("fill_trace %p is_dentry %d is_target %d\n", req, @@ -953,11 +1061,87 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  	if (rinfo->head->is_dentry) {  		struct inode *dir = req->r_locked_dir; -		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, -				 session, req->r_request_started, -1, -				 &req->r_caps_reservation); -		if (err < 0) -			return err; +		if (dir) { +			err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, +					 session, req->r_request_started, -1, +					 &req->r_caps_reservation); +			if (err < 0) +				goto done; +		} else { +			WARN_ON_ONCE(1); +		} + +		if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { +			struct qstr dname; +			struct dentry *dn, *parent; + +			BUG_ON(!rinfo->head->is_target); +			BUG_ON(req->r_dentry); + +			parent = d_find_any_alias(dir); +			
BUG_ON(!parent); + +			dname.name = rinfo->dname; +			dname.len = rinfo->dname_len; +			dname.hash = full_name_hash(dname.name, dname.len); +			vino.ino = le64_to_cpu(rinfo->targeti.in->ino); +			vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: +			dn = d_lookup(parent, &dname); +			dout("d_lookup on parent=%p name=%.*s got %p\n", +			     parent, dname.len, dname.name, dn); + +			if (!dn) { +				dn = d_alloc(parent, &dname); +				dout("d_alloc %p '%.*s' = %p\n", parent, +				     dname.len, dname.name, dn); +				if (dn == NULL) { +					dput(parent); +					err = -ENOMEM; +					goto done; +				} +				err = ceph_init_dentry(dn); +				if (err < 0) { +					dput(dn); +					dput(parent); +					goto done; +				} +			} else if (dn->d_inode && +				   (ceph_ino(dn->d_inode) != vino.ino || +				    ceph_snap(dn->d_inode) != vino.snap)) { +				dout(" dn %p points to wrong inode %p\n", +				     dn, dn->d_inode); +				d_delete(dn); +				dput(dn); +				goto retry_lookup; +			} + +			req->r_dentry = dn; +			dput(parent); +		} +	} + +	if (rinfo->head->is_target) { +		vino.ino = le64_to_cpu(rinfo->targeti.in->ino); +		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + +		in = ceph_get_inode(sb, vino); +		if (IS_ERR(in)) { +			err = PTR_ERR(in); +			goto done; +		} +		req->r_target_inode = in; + +		err = fill_inode(in, &rinfo->targeti, NULL, +				session, req->r_request_started, +				(!req->r_aborted && rinfo->head->result == 0) ? 
+				req->r_fmode : -1, +				&req->r_caps_reservation); +		if (err < 0) { +			pr_err("fill_inode badness %p %llx.%llx\n", +				in, ceph_vinop(in)); +			goto done; +		}  	}  	/* @@ -965,6 +1149,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  	 * will have trouble splicing in the virtual snapdir later  	 */  	if (rinfo->head->is_dentry && !req->r_aborted && +	    req->r_locked_dir &&  	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,  					       fsc->mount_options->snapdir_name,  					       req->r_dentry->d_name.len))) { @@ -992,14 +1177,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  		/* do we have a dn lease? */  		have_lease = have_dir_cap || -			(le16_to_cpu(rinfo->dlease->mask) & -			 CEPH_LOCK_DN); - +			le32_to_cpu(rinfo->dlease->duration_ms);  		if (!have_lease)  			dout("fill_trace  no dentry lease or dir cap\n");  		/* rename? */  		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { +			struct inode *olddir = req->r_old_dentry_dir; +			BUG_ON(!olddir); +  			dout(" src %p '%.*s' dst %p '%.*s'\n",  			     req->r_old_dentry,  			     req->r_old_dentry->d_name.len, @@ -1008,9 +1194,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  			dout("fill_trace doing d_move %p -> %p\n",  			     req->r_old_dentry, dn); -			/* d_move screws up d_subdirs order */ -			ceph_i_clear(dir, CEPH_I_COMPLETE); -  			d_move(req->r_old_dentry, dn);  			dout(" src %p '%.*s' dst %p '%.*s'\n",  			     req->r_old_dentry, @@ -1022,15 +1205,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  			   rehashing bug in vfs_rename_dir */  			ceph_invalidate_dentry_lease(dn); -			/* take overwritten dentry's readdir offset */ -			dout("dn %p gets %p offset %lld (old offset %lld)\n", -			     req->r_old_dentry, dn, ceph_dentry(dn)->offset, +			/* d_move screws up sibling dentries' offsets */ +			ceph_dir_clear_complete(dir); +			
ceph_dir_clear_complete(olddir); + +			dout("dn %p gets new offset %lld\n", req->r_old_dentry,  			     ceph_dentry(req->r_old_dentry)->offset); -			ceph_dentry(req->r_old_dentry)->offset = -				ceph_dentry(dn)->offset;  			dn = req->r_old_dentry;  /* use old_dentry */ -			in = dn->d_inode;  		}  		/* null dentry? */ @@ -1052,106 +1234,87 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,  		}  		/* attach proper inode */ -		ininfo = rinfo->targeti.in; -		vino.ino = le64_to_cpu(ininfo->ino); -		vino.snap = le64_to_cpu(ininfo->snapid);  		if (!dn->d_inode) { -			in = ceph_get_inode(sb, vino); -			if (IS_ERR(in)) { -				pr_err("fill_trace bad get_inode " -				       "%llx.%llx\n", vino.ino, vino.snap); -				err = PTR_ERR(in); -				d_delete(dn); -				goto done; -			} -			dn = splice_dentry(dn, in, &have_lease, true); +			ceph_dir_clear_complete(dir); +			ihold(in); +			dn = splice_dentry(dn, in, &have_lease);  			if (IS_ERR(dn)) {  				err = PTR_ERR(dn);  				goto done;  			}  			req->r_dentry = dn;  /* may have spliced */ -			igrab(in); -		} else if (ceph_ino(in) == vino.ino && -			   ceph_snap(in) == vino.snap) { -			igrab(in); -		} else { +		} else if (dn->d_inode && dn->d_inode != in) {  			dout(" %p links to %p %llx.%llx, not %llx.%llx\n", -			     dn, in, ceph_ino(in), ceph_snap(in), -			     vino.ino, vino.snap); +			     dn, dn->d_inode, ceph_vinop(dn->d_inode), +			     ceph_vinop(in));  			have_lease = false; -			in = NULL;  		}  		if (have_lease)  			update_dentry_lease(dn, rinfo->dlease, session,  					    req->r_request_started);  		dout(" final dn %p\n", dn); -		i++; -	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || -		   req->r_op == CEPH_MDS_OP_MKSNAP) { +	} else if (!req->r_aborted && +		   (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || +		    req->r_op == CEPH_MDS_OP_MKSNAP)) {  		struct dentry *dn = req->r_dentry; +		struct inode *dir = req->r_locked_dir;  		/* fill out a snapdir LOOKUPSNAP dentry */  		BUG_ON(!dn); -		
BUG_ON(!req->r_locked_dir); -		BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); -		ininfo = rinfo->targeti.in; -		vino.ino = le64_to_cpu(ininfo->ino); -		vino.snap = le64_to_cpu(ininfo->snapid); -		in = ceph_get_inode(sb, vino); -		if (IS_ERR(in)) { -			pr_err("fill_inode get_inode badness %llx.%llx\n", -			       vino.ino, vino.snap); -			err = PTR_ERR(in); -			d_delete(dn); -			goto done; -		} +		BUG_ON(!dir); +		BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);  		dout(" linking snapped dir %p to dn %p\n", in, dn); -		dn = splice_dentry(dn, in, NULL, true); +		ceph_dir_clear_complete(dir); +		ihold(in); +		dn = splice_dentry(dn, in, NULL);  		if (IS_ERR(dn)) {  			err = PTR_ERR(dn);  			goto done;  		}  		req->r_dentry = dn;  /* may have spliced */ -		igrab(in); -		rinfo->head->is_dentry = 1;  /* fool notrace handlers */  	} +done: +	dout("fill_trace done err=%d\n", err); +	return err; +} -	if (rinfo->head->is_target) { -		vino.ino = le64_to_cpu(rinfo->targeti.in->ino); -		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +/* + * Prepopulate our cache with readdir results, leases, etc. + */ +static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, +					   struct ceph_mds_session *session) +{ +	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; +	int i, err = 0; -		if (in == NULL || ceph_ino(in) != vino.ino || -		    ceph_snap(in) != vino.snap) { -			in = ceph_get_inode(sb, vino); -			if (IS_ERR(in)) { -				err = PTR_ERR(in); -				goto done; -			} -		} -		req->r_target_inode = in; +	for (i = 0; i < rinfo->dir_nr; i++) { +		struct ceph_vino vino; +		struct inode *in; +		int rc; -		err = fill_inode(in, -				 &rinfo->targeti, NULL, -				 session, req->r_request_started, -				 (le32_to_cpu(rinfo->head->result) == 0) ? 
-				 req->r_fmode : -1, -				 &req->r_caps_reservation); -		if (err < 0) { -			pr_err("fill_inode badness %p %llx.%llx\n", -			       in, ceph_vinop(in)); -			goto done; +		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); +		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + +		in = ceph_get_inode(req->r_dentry->d_sb, vino); +		if (IS_ERR(in)) { +			err = PTR_ERR(in); +			dout("new_inode badness got %d\n", err); +			continue; +		} +		rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, +				req->r_request_started, -1, +				&req->r_caps_reservation); +		if (rc < 0) { +			pr_err("fill_inode badness on %p got %d\n", in, rc); +			err = rc; +			continue;  		}  	} -done: -	dout("fill_trace done err=%d\n", err);  	return err;  } -/* - * Prepopulate our cache with readdir results, leases, etc. - */  int ceph_readdir_prepopulate(struct ceph_mds_request *req,  			     struct ceph_mds_session *session)  { @@ -1160,11 +1323,26 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,  	struct qstr dname;  	struct dentry *dn;  	struct inode *in; -	int err = 0, i; +	int err = 0, ret, i;  	struct inode *snapdir = NULL;  	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; -	u64 frag = le32_to_cpu(rhead->args.readdir.frag);  	struct ceph_dentry_info *di; +	u64 r_readdir_offset = req->r_readdir_offset; +	u32 frag = le32_to_cpu(rhead->args.readdir.frag); + +	if (rinfo->dir_dir && +	    le32_to_cpu(rinfo->dir_dir->frag) != frag) { +		dout("readdir_prepopulate got new frag %x -> %x\n", +		     frag, le32_to_cpu(rinfo->dir_dir->frag)); +		frag = le32_to_cpu(rinfo->dir_dir->frag); +		if (ceph_frag_is_leftmost(frag)) +			r_readdir_offset = 2; +		else +			r_readdir_offset = 0; +	} + +	if (req->r_aborted) +		return readdir_prepopulate_inodes_only(req, session);  	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {  		snapdir = ceph_get_snapdir(parent->d_inode); @@ -1178,6 +1356,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,  			
ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);  	} +	/* FIXME: release caps/leases if error occurs */  	for (i = 0; i < rinfo->dir_nr; i++) {  		struct ceph_vino vino; @@ -1202,9 +1381,10 @@ retry_lookup:  				err = -ENOMEM;  				goto out;  			} -			err = ceph_init_dentry(dn); -			if (err < 0) { +			ret = ceph_init_dentry(dn); +			if (ret < 0) {  				dput(dn); +				err = ret;  				goto out;  			}  		} else if (dn->d_inode && @@ -1217,16 +1397,13 @@ retry_lookup:  			goto retry_lookup;  		} else {  			/* reorder parent's d_subdirs */ -			spin_lock(&dcache_lock); -			spin_lock(&dn->d_lock); +			spin_lock(&parent->d_lock); +			spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);  			list_move(&dn->d_u.d_child, &parent->d_subdirs);  			spin_unlock(&dn->d_lock); -			spin_unlock(&dcache_lock); +			spin_unlock(&parent->d_lock);  		} -		di = dn->d_fsdata; -		di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); -  		/* inode */  		if (dn->d_inode) {  			in = dn->d_inode; @@ -1234,31 +1411,44 @@ retry_lookup:  			in = ceph_get_inode(parent->d_sb, vino);  			if (IS_ERR(in)) {  				dout("new_inode badness\n"); -				d_delete(dn); +				d_drop(dn);  				dput(dn);  				err = PTR_ERR(in);  				goto out;  			} -			dn = splice_dentry(dn, in, NULL, false); -			if (IS_ERR(dn)) -				dn = NULL;  		}  		if (fill_inode(in, &rinfo->dir_in[i], NULL, session,  			       req->r_request_started, -1,  			       &req->r_caps_reservation) < 0) {  			pr_err("fill_inode badness on %p\n", in); +			if (!dn->d_inode) +				iput(in); +			d_drop(dn);  			goto next_item;  		} -		if (dn) -			update_dentry_lease(dn, rinfo->dir_dlease[i], -					    req->r_session, -					    req->r_request_started); + +		if (!dn->d_inode) { +			dn = splice_dentry(dn, in, NULL); +			if (IS_ERR(dn)) { +				err = PTR_ERR(dn); +				dn = NULL; +				goto next_item; +			} +		} + +		di = dn->d_fsdata; +		di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + +		update_dentry_lease(dn, rinfo->dir_dlease[i], +				    
req->r_session, +				    req->r_request_started);  next_item:  		if (dn)  			dput(dn);  	} -	req->r_did_prepopulate = true; +	if (err == 0) +		req->r_did_prepopulate = true;  out:  	if (snapdir) { @@ -1274,7 +1464,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)  	struct ceph_inode_info *ci = ceph_inode(inode);  	int ret = 0; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);  	inode->i_size = size;  	inode->i_blocks = (size + (1 << 9) - 1) >> 9; @@ -1284,7 +1474,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)  	    (ci->i_reported_size << 1) < ci->i_max_size)  		ret = 1; -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return ret;  } @@ -1294,12 +1484,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)   */  void ceph_queue_writeback(struct inode *inode)  { +	ihold(inode);  	if (queue_work(ceph_inode_to_client(inode)->wb_wq,  		       &ceph_inode(inode)->i_wb_work)) {  		dout("ceph_queue_writeback %p\n", inode); -		igrab(inode);  	} else {  		dout("ceph_queue_writeback %p failed\n", inode); +		iput(inode);  	}  } @@ -1319,55 +1510,13 @@ static void ceph_writeback_work(struct work_struct *work)   */  void ceph_queue_invalidate(struct inode *inode)  { +	ihold(inode);  	if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,  		       &ceph_inode(inode)->i_pg_inv_work)) {  		dout("ceph_queue_invalidate %p\n", inode); -		igrab(inode);  	} else {  		dout("ceph_queue_invalidate %p failed\n", inode); -	} -} - -/* - * invalidate any pages that are not dirty or under writeback.  this - * includes pages that are clean and mapped. 
- */ -static void ceph_invalidate_nondirty_pages(struct address_space *mapping) -{ -	struct pagevec pvec; -	pgoff_t next = 0; -	int i; - -	pagevec_init(&pvec, 0); -	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { -		for (i = 0; i < pagevec_count(&pvec); i++) { -			struct page *page = pvec.pages[i]; -			pgoff_t index; -			int skip_page = -				(PageDirty(page) || PageWriteback(page)); - -			if (!skip_page) -				skip_page = !trylock_page(page); - -			/* -			 * We really shouldn't be looking at the ->index of an -			 * unlocked page.  But we're not allowed to lock these -			 * pages.  So we rely upon nobody altering the ->index -			 * of this (pinned-by-us) page. -			 */ -			index = page->index; -			if (index > next) -				next = index; -			next++; - -			if (skip_page) -				continue; - -			generic_error_remove_page(mapping, page); -			unlock_page(page); -		} -		pagevec_release(&pvec); -		cond_resched(); +		iput(inode);  	}  } @@ -1383,44 +1532,47 @@ static void ceph_invalidate_work(struct work_struct *work)  	u32 orig_gen;  	int check = 0; -	spin_lock(&inode->i_lock); +	mutex_lock(&ci->i_truncate_mutex); +	spin_lock(&ci->i_ceph_lock);  	dout("invalidate_pages %p gen %d revoking %d\n", inode,  	     ci->i_rdcache_gen, ci->i_rdcache_revoking); -	if (ci->i_rdcache_gen == 0 || -	    ci->i_rdcache_revoking != ci->i_rdcache_gen) { -		BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); -		/* nevermind! 
*/ -		ci->i_rdcache_revoking = 0; -		spin_unlock(&inode->i_lock); +	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { +		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) +			check = 1; +		spin_unlock(&ci->i_ceph_lock); +		mutex_unlock(&ci->i_truncate_mutex);  		goto out;  	}  	orig_gen = ci->i_rdcache_gen; -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); -	ceph_invalidate_nondirty_pages(inode->i_mapping); +	truncate_pagecache(inode, 0); -	spin_lock(&inode->i_lock); -	if (orig_gen == ci->i_rdcache_gen) { +	spin_lock(&ci->i_ceph_lock); +	if (orig_gen == ci->i_rdcache_gen && +	    orig_gen == ci->i_rdcache_revoking) {  		dout("invalidate_pages %p gen %d successful\n", inode,  		     ci->i_rdcache_gen); -		ci->i_rdcache_gen = 0; -		ci->i_rdcache_revoking = 0; +		ci->i_rdcache_revoking--;  		check = 1;  	} else { -		dout("invalidate_pages %p gen %d raced, gen now %d\n", -		     inode, orig_gen, ci->i_rdcache_gen); +		dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", +		     inode, orig_gen, ci->i_rdcache_gen, +		     ci->i_rdcache_revoking); +		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) +			check = 1;  	} -	spin_unlock(&inode->i_lock); - +	spin_unlock(&ci->i_ceph_lock); +	mutex_unlock(&ci->i_truncate_mutex); +out:  	if (check)  		ceph_check_caps(ci, 0, NULL); -out:  	iput(inode);  }  /* - * called by trunc_wq; take i_mutex ourselves + * called by trunc_wq;   *   * We also truncate in a separate thread as well.   
*/ @@ -1431,9 +1583,7 @@ static void ceph_vmtruncate_work(struct work_struct *work)  	struct inode *inode = &ci->vfs_inode;  	dout("vmtruncate_work %p\n", inode); -	mutex_lock(&inode->i_mutex);  	__ceph_do_pending_vmtruncate(inode); -	mutex_unlock(&inode->i_mutex);  	iput(inode);  } @@ -1445,19 +1595,19 @@ void ceph_queue_vmtruncate(struct inode *inode)  {  	struct ceph_inode_info *ci = ceph_inode(inode); +	ihold(inode); +  	if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,  		       &ci->i_vmtruncate_work)) {  		dout("ceph_queue_vmtruncate %p\n", inode); -		igrab(inode);  	} else {  		dout("ceph_queue_vmtruncate %p failed, pending=%d\n",  		     inode, ci->i_truncate_pending); +		iput(inode);  	}  }  /* - * called with i_mutex held. - *   * Make sure any pending truncation is applied before doing anything   * that may depend on it.   */ @@ -1465,13 +1615,15 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)  {  	struct ceph_inode_info *ci = ceph_inode(inode);  	u64 to; -	int wrbuffer_refs, wake = 0; +	int wrbuffer_refs, finish = 0; +	mutex_lock(&ci->i_truncate_mutex);  retry: -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	if (ci->i_truncate_pending == 0) {  		dout("__do_pending_vmtruncate %p none pending\n", inode); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock); +		mutex_unlock(&ci->i_truncate_mutex);  		return;  	} @@ -1482,32 +1634,39 @@ retry:  	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {  		dout("__do_pending_vmtruncate %p flushing snaps first\n",  		     inode); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		filemap_write_and_wait_range(&inode->i_data, 0,  					     inode->i_sb->s_maxbytes);  		goto retry;  	} +	/* there should be no reader or writer */ +	WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); +  	to = ci->i_truncate_size;  	wrbuffer_refs = ci->i_wrbuffer_ref;  	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,  	     ci->i_truncate_pending, to); -	
spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); + +	truncate_pagecache(inode, to); -	truncate_inode_pages(inode->i_mapping, to); +	spin_lock(&ci->i_ceph_lock); +	if (to == ci->i_truncate_size) { +		ci->i_truncate_pending = 0; +		finish = 1; +	} +	spin_unlock(&ci->i_ceph_lock); +	if (!finish) +		goto retry; -	spin_lock(&inode->i_lock); -	ci->i_truncate_pending--; -	if (ci->i_truncate_pending == 0) -		wake = 1; -	spin_unlock(&inode->i_lock); +	mutex_unlock(&ci->i_truncate_mutex);  	if (wrbuffer_refs == 0)  		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); -	if (wake) -		wake_up_all(&ci->i_cap_wq); -} +	wake_up_all(&ci->i_cap_wq); +}  /*   * symlinks @@ -1522,6 +1681,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)  static const struct inode_operations ceph_symlink_iops = {  	.readlink = generic_readlink,  	.follow_link = ceph_sym_follow_link, +	.setattr = ceph_setattr, +	.getattr = ceph_getattr, +	.setxattr = ceph_setxattr, +	.getxattr = ceph_getxattr, +	.listxattr = ceph_listxattr, +	.removexattr = ceph_removexattr,  };  /* @@ -1531,7 +1696,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)  {  	struct inode *inode = dentry->d_inode;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct inode *parent_inode = dentry->d_parent->d_inode;  	const unsigned int ia_valid = attr->ia_valid;  	struct ceph_mds_request *req;  	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1539,12 +1703,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)  	int release = 0, dirtied = 0;  	int mask = 0;  	int err = 0; +	int inode_dirty_flags = 0;  	if (ceph_snap(inode) != CEPH_NOSNAP)  		return -EROFS; -	__ceph_do_pending_vmtruncate(inode); -  	err = inode_change_ok(inode, attr);  	if (err != 0)  		return err; @@ -1554,32 +1717,36 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)  	if (IS_ERR(req))  		return PTR_ERR(req); -	spin_lock(&inode->i_lock); +	
spin_lock(&ci->i_ceph_lock);  	issued = __ceph_caps_issued(ci, NULL);  	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));  	if (ia_valid & ATTR_UID) {  		dout("setattr %p uid %d -> %d\n", inode, -		     inode->i_uid, attr->ia_uid); +		     from_kuid(&init_user_ns, inode->i_uid), +		     from_kuid(&init_user_ns, attr->ia_uid));  		if (issued & CEPH_CAP_AUTH_EXCL) {  			inode->i_uid = attr->ia_uid;  			dirtied |= CEPH_CAP_AUTH_EXCL;  		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || -			   attr->ia_uid != inode->i_uid) { -			req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); +			   !uid_eq(attr->ia_uid, inode->i_uid)) { +			req->r_args.setattr.uid = cpu_to_le32( +				from_kuid(&init_user_ns, attr->ia_uid));  			mask |= CEPH_SETATTR_UID;  			release |= CEPH_CAP_AUTH_SHARED;  		}  	}  	if (ia_valid & ATTR_GID) {  		dout("setattr %p gid %d -> %d\n", inode, -		     inode->i_gid, attr->ia_gid); +		     from_kgid(&init_user_ns, inode->i_gid), +		     from_kgid(&init_user_ns, attr->ia_gid));  		if (issued & CEPH_CAP_AUTH_EXCL) {  			inode->i_gid = attr->ia_gid;  			dirtied |= CEPH_CAP_AUTH_EXCL;  		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || -			   attr->ia_gid != inode->i_gid) { -			req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); +			   !gid_eq(attr->ia_gid, inode->i_gid)) { +			req->r_args.setattr.gid = cpu_to_le32( +				from_kgid(&init_user_ns, attr->ia_gid));  			mask |= CEPH_SETATTR_GID;  			release |= CEPH_CAP_AUTH_SHARED;  		} @@ -1592,6 +1759,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)  			dirtied |= CEPH_CAP_AUTH_EXCL;  		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||  			   attr->ia_mode != inode->i_mode) { +			inode->i_mode = attr->ia_mode;  			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);  			mask |= CEPH_SETATTR_MODE;  			release |= CEPH_CAP_AUTH_SHARED; @@ -1697,28 +1865,40 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)  		dout("setattr %p ATTR_FILE ... 
hrm!\n", inode);  	if (dirtied) { -		__ceph_mark_dirty_caps(ci, dirtied); +		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);  		inode->i_ctime = CURRENT_TIME;  	}  	release &= issued; -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); + +	if (inode_dirty_flags) +		__mark_inode_dirty(inode, inode_dirty_flags); + +	if (ia_valid & ATTR_MODE) { +		err = posix_acl_chmod(inode, attr->ia_mode); +		if (err) +			goto out_put; +	}  	if (mask) { -		req->r_inode = igrab(inode); +		req->r_inode = inode; +		ihold(inode);  		req->r_inode_drop = release;  		req->r_args.setattr.mask = cpu_to_le32(mask);  		req->r_num_caps = 1; -		err = ceph_mdsc_do_request(mdsc, parent_inode, req); +		err = ceph_mdsc_do_request(mdsc, NULL, req);  	}  	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,  	     ceph_cap_string(dirtied), mask);  	ceph_mdsc_put_request(req); -	__ceph_do_pending_vmtruncate(inode); +	if (mask & CEPH_SETATTR_SIZE) +		__ceph_do_pending_vmtruncate(inode);  	return err;  out: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); +out_put:  	ceph_mdsc_put_request(req);  	return err;  } @@ -1739,14 +1919,15 @@ int ceph_do_getattr(struct inode *inode, int mask)  		return 0;  	} -	dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); +	dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);  	if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))  		return 0;  	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);  	if (IS_ERR(req))  		return PTR_ERR(req); -	req->r_inode = igrab(inode); +	req->r_inode = inode; +	ihold(inode);  	req->r_num_caps = 1;  	req->r_args.getattr.mask = cpu_to_le32(mask);  	err = ceph_mdsc_do_request(mdsc, NULL, req); @@ -1762,10 +1943,15 @@ int ceph_do_getattr(struct inode *inode, int mask)   */  int ceph_permission(struct inode *inode, int mask)  { -	int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); +	int err; + +	if (mask & 
MAY_NOT_BLOCK) +		return -ECHILD; + +	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);  	if (!err) -		err = generic_permission(inode, mask, NULL); +		err = generic_permission(inode, mask);  	return err;  } @@ -1783,13 +1969,17 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,  	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);  	if (!err) {  		generic_fillattr(inode, stat); -		stat->ino = inode->i_ino; +		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);  		if (ceph_snap(inode) != CEPH_NOSNAP)  			stat->dev = ceph_snap(inode);  		else  			stat->dev = 0;  		if (S_ISDIR(inode->i_mode)) { -			stat->size = ci->i_rbytes; +			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), +						RBYTES)) +				stat->size = ci->i_rbytes; +			else +				stat->size = ci->i_files + ci->i_subdirs;  			stat->blocks = 0;  			stat->blksize = 65536;  		} diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8888c9ba68d..a822a6e5829 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,9 +1,8 @@ +#include <linux/ceph/ceph_debug.h>  #include <linux/in.h>  #include "super.h"  #include "mds_client.h" -#include <linux/ceph/ceph_debug.h> -  #include "ioctl.h" @@ -16,18 +15,17 @@   */  static long ceph_ioctl_get_layout(struct file *file, void __user *arg)  { -	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); +	struct ceph_inode_info *ci = ceph_inode(file_inode(file));  	struct ceph_ioctl_layout l;  	int err; -	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); +	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);  	if (!err) {  		l.stripe_unit = ceph_file_layout_su(ci->i_layout);  		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);  		l.object_size = ceph_file_layout_object_size(ci->i_layout);  		l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); -		l.preferred_osd = -			(s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); +		l.preferred_osd = (s32)-1;  		if (copy_to_user(arg, &l, sizeof(l)))  			return 
-EFAULT;  	} @@ -35,45 +33,84 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)  	return err;  } +static long __validate_layout(struct ceph_mds_client *mdsc, +			      struct ceph_ioctl_layout *l) +{ +	int i, err; + +	/* validate striping parameters */ +	if ((l->object_size & ~PAGE_MASK) || +	    (l->stripe_unit & ~PAGE_MASK) || +	    (l->stripe_unit != 0 && +	     ((unsigned)l->object_size % (unsigned)l->stripe_unit))) +		return -EINVAL; + +	/* make sure it's a valid data pool */ +	mutex_lock(&mdsc->mutex); +	err = -EINVAL; +	for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) +		if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) { +			err = 0; +			break; +		} +	mutex_unlock(&mdsc->mutex); +	if (err) +		return err; + +	return 0; +} +  static long ceph_ioctl_set_layout(struct file *file, void __user *arg)  { -	struct inode *inode = file->f_dentry->d_inode; -	struct inode *parent_inode = file->f_dentry->d_parent->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;  	struct ceph_mds_request *req;  	struct ceph_ioctl_layout l; -	int err, i; +	struct ceph_inode_info *ci = ceph_inode(file_inode(file)); +	struct ceph_ioctl_layout nl; +	int err; -	/* copy and validate */  	if (copy_from_user(&l, arg, sizeof(l)))  		return -EFAULT; -	if ((l.object_size & ~PAGE_MASK) || -	    (l.stripe_unit & ~PAGE_MASK) || -	    !l.stripe_unit || -	    (l.object_size && -	     (unsigned)l.object_size % (unsigned)l.stripe_unit)) -		return -EINVAL; - -	/* make sure it's a valid data pool */ -	if (l.data_pool > 0) { -		mutex_lock(&mdsc->mutex); -		err = -EINVAL; -		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) -			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { -				err = 0; -				break; -			} -		mutex_unlock(&mdsc->mutex); -		if (err) -			return err; -	} +	/* validate changed params against current layout */ +	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); +	if 
(err) +		return err; + +	memset(&nl, 0, sizeof(nl)); +	if (l.stripe_count) +		nl.stripe_count = l.stripe_count; +	else +		nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); +	if (l.stripe_unit) +		nl.stripe_unit = l.stripe_unit; +	else +		nl.stripe_unit = ceph_file_layout_su(ci->i_layout); +	if (l.object_size) +		nl.object_size = l.object_size; +	else +		nl.object_size = ceph_file_layout_object_size(ci->i_layout); +	if (l.data_pool) +		nl.data_pool = l.data_pool; +	else +		nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); + +	/* this is obsolete, and always -1 */ +	nl.preferred_osd = le64_to_cpu(-1); + +	err = __validate_layout(mdsc, &nl); +	if (err) +		return err;  	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,  				       USE_AUTH_MDS);  	if (IS_ERR(req))  		return PTR_ERR(req); -	req->r_inode = igrab(inode); +	req->r_inode = inode; +	ihold(inode); +	req->r_num_caps = 1; +  	req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;  	req->r_args.setlayout.layout.fl_stripe_unit = @@ -83,10 +120,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)  	req->r_args.setlayout.layout.fl_object_size =  		cpu_to_le32(l.object_size);  	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); -	req->r_args.setlayout.layout.fl_pg_preferred = -		cpu_to_le32(l.preferred_osd); -	err = ceph_mdsc_do_request(mdsc, parent_inode, req); +	err = ceph_mdsc_do_request(mdsc, NULL, req);  	ceph_mdsc_put_request(req);  	return err;  } @@ -99,43 +134,28 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)   */  static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_mds_request *req;  	struct ceph_ioctl_layout l; -	int err, i; +	int err;  	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;  	/* copy and validate */  	if (copy_from_user(&l, arg, sizeof(l))) 
 		return -EFAULT; -	if ((l.object_size & ~PAGE_MASK) || -	    (l.stripe_unit & ~PAGE_MASK) || -	    !l.stripe_unit || -	    (l.object_size && -	        (unsigned)l.object_size % (unsigned)l.stripe_unit)) -		return -EINVAL; - -	/* make sure it's a valid data pool */ -	if (l.data_pool > 0) { -		mutex_lock(&mdsc->mutex); -		err = -EINVAL; -		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) -			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { -				err = 0; -				break; -			} -		mutex_unlock(&mdsc->mutex); -		if (err) -			return err; -	} +	err = __validate_layout(mdsc, &l); +	if (err) +		return err;  	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,  				       USE_AUTH_MDS);  	if (IS_ERR(req))  		return PTR_ERR(req); -	req->r_inode = igrab(inode); +	req->r_inode = inode; +	ihold(inode); +	req->r_num_caps = 1;  	req->r_args.setlayout.layout.fl_stripe_unit =  			cpu_to_le32(l.stripe_unit); @@ -145,8 +165,6 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)  			cpu_to_le32(l.object_size);  	req->r_args.setlayout.layout.fl_pg_pool =  			cpu_to_le32(l.data_pool); -	req->r_args.setlayout.layout.fl_pg_preferred = -			cpu_to_le32(l.preferred_osd);  	err = ceph_mdsc_do_request(mdsc, inode, req);  	ceph_mdsc_put_request(req); @@ -160,22 +178,29 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)  static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)  {  	struct ceph_ioctl_dataloc dl; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	struct ceph_osd_client *osdc =  		&ceph_sb_to_client(inode->i_sb)->client->osdc; +	struct ceph_object_locator oloc; +	struct ceph_object_id oid;  	u64 len = 1, olen;  	u64 tmp; -	struct ceph_object_layout ol;  	struct ceph_pg pgid; +	int r;  	/* copy and validate */  	if (copy_from_user(&dl, arg, sizeof(dl)))  		return -EFAULT;  	
down_read(&osdc->map_sem); -	ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, -				      &dl.object_no, &dl.object_offset, &olen); +	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, +					  &dl.object_no, &dl.object_offset, +					  &olen); +	if (r < 0) { +		up_read(&osdc->map_sem); +		return -EIO; +	}  	dl.file_offset -= dl.object_offset;  	dl.object_size = ceph_file_layout_object_size(ci->i_layout);  	dl.block_size = ceph_file_layout_su(ci->i_layout); @@ -186,10 +211,16 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)  	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",  		 ceph_ino(inode), dl.object_no); -	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, -				osdc->osdmap); -	pgid = ol.ol_pgid; +	oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); +	ceph_oid_set_name(&oid, dl.object_name); + +	r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); +	if (r < 0) { +		up_read(&osdc->map_sem); +		return r; +	} +  	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);  	if (dl.osd >= 0) {  		struct ceph_entity_addr *a = @@ -211,15 +242,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)  static long ceph_ioctl_lazyio(struct file *file)  {  	struct ceph_file_info *fi = file->private_data; -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct ceph_inode_info *ci = ceph_inode(inode);  	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		ci->i_nr_by_mode[fi->fmode]--;  		fi->fmode |= CEPH_FILE_MODE_LAZY;  		ci->i_nr_by_mode[fi->fmode]++; -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		dout("ioctl_layzio: file %p marked lazy\n", file);  		ceph_check_caps(ci, 0, NULL); @@ -229,6 +260,14 @@ static long ceph_ioctl_lazyio(struct file *file)  	return 0;  } +static long ceph_ioctl_syncio(struct file *file) +{ +	struct ceph_file_info *fi = 
file->private_data; + +	fi->flags |= CEPH_F_SYNC; +	return 0; +} +  long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  {  	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); @@ -247,6 +286,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  	case CEPH_IOC_LAZYIO:  		return ceph_ioctl_lazyio(file); + +	case CEPH_IOC_SYNCIO: +		return ceph_ioctl_syncio(file);  	}  	return -ENOTTY; diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index a6ce54e94eb..c77028afb1e 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -4,12 +4,38 @@  #include <linux/ioctl.h>  #include <linux/types.h> -#define CEPH_IOCTL_MAGIC 0x98 +#define CEPH_IOCTL_MAGIC 0x97 -/* just use u64 to align sanely on all archs */ +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors.  The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file.  This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */  struct ceph_ioctl_layout {  	__u64 stripe_unit, stripe_count, object_size;  	__u64 data_pool; + +	/* obsolete.  
new values ignored, always return -1 */  	__s64 preferred_osd;  }; @@ -21,6 +47,8 @@ struct ceph_ioctl_layout {  				   struct ceph_ioctl_layout)  /* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + *   * Extract identity, address of the OSD and object storing a given   * file offset.   */ @@ -39,6 +67,34 @@ struct ceph_ioctl_dataloc {  #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3,	\  				   struct ceph_ioctl_dataloc) +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write).  Reads and writes bypass the + * page cache and go directly to the OSD.  Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */  #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary.  This is + * essentially the opposite behavior of IOC_LAZYIO.  This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries).  It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). 
+ */ +#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) +  #endif diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 40abde93c34..fbc39c47bac 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -2,49 +2,97 @@  #include <linux/file.h>  #include <linux/namei.h> +#include <linux/random.h>  #include "super.h"  #include "mds_client.h"  #include <linux/ceph/pagelist.h> +static u64 lock_secret; + +static inline u64 secure_addr(void *addr) +{ +	u64 v = lock_secret ^ (u64)(unsigned long)addr; +	/* +	 * Set the most significant bit, so that MDS knows the 'owner' +	 * is sufficient to identify the owner of lock. (old code uses +	 * both 'owner' and 'pid') +	 */ +	v |= (1ULL << 63); +	return v; +} + +void __init ceph_flock_init(void) +{ +	get_random_bytes(&lock_secret, sizeof(lock_secret)); +} +  /**   * Implement fcntl and flock locking functions.   */  static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, -			     u64 pid, u64 pid_ns, -			     int cmd, u64 start, u64 length, u8 wait) +			     int cmd, u8 wait, struct file_lock *fl)  { -	struct inode *inode = file->f_dentry->d_inode; -	struct ceph_mds_client *mdsc = -		ceph_sb_to_client(inode->i_sb)->mdsc; +	struct inode *inode = file_inode(file); +	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;  	struct ceph_mds_request *req;  	int err; +	u64 length = 0; +	u64 owner;  	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);  	if (IS_ERR(req))  		return PTR_ERR(req); -	req->r_inode = igrab(inode); +	req->r_inode = inode; +	ihold(inode); +	req->r_num_caps = 1; -	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " -	     "length: %llu, wait: %d, type`: %d", (int)lock_type, -	     (int)operation, pid, start, length, wait, cmd); +	/* mds requires start and length rather than start and end */ +	if (LLONG_MAX == fl->fl_end) +		length = 0; +	else +		length = fl->fl_end - fl->fl_start + 1; + +	owner = secure_addr(fl->fl_owner); + +	dout("ceph_lock_message: rule: 
%d, op: %d, owner: %llx, pid: %llu, " +	     "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, +	     (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, +	     wait, fl->fl_type);  	req->r_args.filelock_change.rule = lock_type;  	req->r_args.filelock_change.type = cmd; -	req->r_args.filelock_change.pid = cpu_to_le64(pid); -	/* This should be adjusted, but I'm not sure if -	   namespaces actually get id numbers*/ -	req->r_args.filelock_change.pid_namespace = -		cpu_to_le64((u64)pid_ns); -	req->r_args.filelock_change.start = cpu_to_le64(start); +	req->r_args.filelock_change.owner = cpu_to_le64(owner); +	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); +	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);  	req->r_args.filelock_change.length = cpu_to_le64(length);  	req->r_args.filelock_change.wait = wait;  	err = ceph_mdsc_do_request(mdsc, inode, req); + +	if (operation == CEPH_MDS_OP_GETFILELOCK) { +		fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); +		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) +			fl->fl_type = F_RDLCK; +		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) +			fl->fl_type = F_WRLCK; +		else +			fl->fl_type = F_UNLCK; + +		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); +		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + +						 le64_to_cpu(req->r_reply_info.filelock_reply->length); +		if (length >= 1) +			fl->fl_end = length -1; +		else +			fl->fl_end = 0; + +	}  	ceph_mdsc_put_request(req);  	dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " -	     "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, -	     (int)operation, pid, start, length, wait, cmd, err); +	     "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, +	     (int)operation, (u64)fl->fl_pid, fl->fl_start, +	     length, wait, fl->fl_type, err);  	return err;  } @@ -54,20 +102,24 @@ static int 
ceph_lock_message(u8 lock_type, u16 operation, struct file *file,   */  int ceph_lock(struct file *file, int cmd, struct file_lock *fl)  { -	u64 length;  	u8 lock_cmd;  	int err;  	u8 wait = 0;  	u16 op = CEPH_MDS_OP_SETFILELOCK; -	fl->fl_nspid = get_pid(task_tgid(current)); -	dout("ceph_lock, fl_pid:%d", fl->fl_pid); +	if (!(fl->fl_flags & FL_POSIX)) +		return -ENOLCK; +	/* No mandatory locks */ +	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) +		return -ENOLCK; + +	dout("ceph_lock, fl_owner: %p", fl->fl_owner);  	/* set wait bit as appropriate, then make command as Ceph expects it*/ -	if (F_SETLKW == cmd) -		wait = 1; -	if (F_GETLK == cmd) +	if (IS_GETLK(cmd))  		op = CEPH_MDS_OP_GETFILELOCK; +	else if (IS_SETLKW(cmd)) +		wait = 1;  	if (F_RDLCK == fl->fl_type)  		lock_cmd = CEPH_LOCK_SHARED; @@ -76,87 +128,75 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)  	else  		lock_cmd = CEPH_LOCK_UNLOCK; -	if (LLONG_MAX == fl->fl_end) -		length = 0; -	else -		length = fl->fl_end - fl->fl_start + 1; - -	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, -				(u64)fl->fl_pid, -				(u64)(unsigned long)fl->fl_nspid, -				lock_cmd, fl->fl_start, -				length, wait); +	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);  	if (!err) { -		dout("mds locked, locking locally"); -		err = posix_lock_file(file, fl, NULL); -		if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { -			/* undo! This should only happen if the kernel detects -			 * local deadlock. */ -			ceph_lock_message(CEPH_LOCK_FCNTL, op, file, -					  (u64)fl->fl_pid, -					  (u64)(unsigned long)fl->fl_nspid, -					  CEPH_LOCK_UNLOCK, fl->fl_start, -					  length, 0); -			dout("got %d on posix_lock_file, undid lock", err); +		if (op != CEPH_MDS_OP_GETFILELOCK) { +			dout("mds locked, locking locally"); +			err = posix_lock_file(file, fl, NULL); +			if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { +				/* undo! 
This should only happen if +				 * the kernel detects local +				 * deadlock. */ +				ceph_lock_message(CEPH_LOCK_FCNTL, op, file, +						  CEPH_LOCK_UNLOCK, 0, fl); +				dout("got %d on posix_lock_file, undid lock", +				     err); +			}  		} -	} else { -		dout("mds returned error code %d", err); + +	} else if (err == -ERESTARTSYS) { +		dout("undoing lock\n"); +		ceph_lock_message(CEPH_LOCK_FCNTL, op, file, +				  CEPH_LOCK_UNLOCK, 0, fl);  	}  	return err;  }  int ceph_flock(struct file *file, int cmd, struct file_lock *fl)  { -	u64 length;  	u8 lock_cmd;  	int err; -	u8 wait = 1; - -	fl->fl_nspid = get_pid(task_tgid(current)); -	dout("ceph_flock, fl_pid:%d", fl->fl_pid); - -	/* set wait bit, then clear it out of cmd*/ -	if (cmd & LOCK_NB) -		wait = 0; -	cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); -	/* set command sequence that Ceph wants to see: -	   shared lock, exclusive lock, or unlock */ -	if (LOCK_SH == cmd) +	u8 wait = 0; + +	if (!(fl->fl_flags & FL_FLOCK)) +		return -ENOLCK; +	/* No mandatory locks */ +	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) +		return -ENOLCK; + +	dout("ceph_flock, fl_file: %p", fl->fl_file); + +	if (IS_SETLKW(cmd)) +		wait = 1; + +	if (F_RDLCK == fl->fl_type)  		lock_cmd = CEPH_LOCK_SHARED; -	else if (LOCK_EX == cmd) +	else if (F_WRLCK == fl->fl_type)  		lock_cmd = CEPH_LOCK_EXCL;  	else  		lock_cmd = CEPH_LOCK_UNLOCK; -	/* mds requires start and length rather than start and end */ -	if (LLONG_MAX == fl->fl_end) -		length = 0; -	else -		length = fl->fl_end - fl->fl_start + 1;  	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, -				file, (u64)fl->fl_pid, -				(u64)(unsigned long)fl->fl_nspid, -				lock_cmd, fl->fl_start, -				length, wait); +				file, lock_cmd, wait, fl);  	if (!err) {  		err = flock_lock_file_wait(file, fl);  		if (err) {  			ceph_lock_message(CEPH_LOCK_FLOCK,  					  CEPH_MDS_OP_SETFILELOCK, -					  file, (u64)fl->fl_pid, -					  (u64)(unsigned long)fl->fl_nspid, -					  
CEPH_LOCK_UNLOCK, fl->fl_start, -					  length, 0); +					  file, CEPH_LOCK_UNLOCK, 0, fl);  			dout("got %d on flock_lock_file_wait, undid lock", err);  		} -	} else { -		dout("mds error code %d", err); +	} else if (err == -ERESTARTSYS) { +		dout("undoing lock\n"); +		ceph_lock_message(CEPH_LOCK_FLOCK, +				  CEPH_MDS_OP_SETFILELOCK, +				  file, CEPH_LOCK_UNLOCK, 0, fl);  	}  	return err;  }  /** - * Must be called with BKL already held. Fills in the passed + * Must be called with lock_flocks() already held. Fills in the passed   * counter variables, so you can prepare pagelist metadata before calling   * ceph_encode_locks.   */ @@ -178,27 +218,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)  }  /** - * Encode the flock and fcntl locks for the given inode into the pagelist. - * Format is: #fcntl locks, sequential fcntl locks, #flock locks, - * sequential flock locks. - * Must be called with lock_flocks() already held. - * If we encounter more of a specific lock type than expected, - * we return the value 1. + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with inode->i_lock already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC.   
*/ -int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, -		      int num_fcntl_locks, int num_flock_locks) +int ceph_encode_locks_to_buffer(struct inode *inode, +				struct ceph_filelock *flocks, +				int num_fcntl_locks, int num_flock_locks)  {  	struct file_lock *lock; -	struct ceph_filelock cephlock;  	int err = 0;  	int seen_fcntl = 0;  	int seen_flock = 0; +	int l = 0;  	dout("encoding %d flock and %d fcntl locks", num_flock_locks,  	     num_fcntl_locks); -	err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); -	if (err) -		goto fail; +  	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {  		if (lock->fl_flags & FL_POSIX) {  			++seen_fcntl; @@ -206,19 +242,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,  				err = -ENOSPC;  				goto fail;  			} -			err = lock_to_ceph_filelock(lock, &cephlock); +			err = lock_to_ceph_filelock(lock, &flocks[l]);  			if (err)  				goto fail; -			err = ceph_pagelist_append(pagelist, &cephlock, -					   sizeof(struct ceph_filelock)); +			++l;  		} -		if (err) -			goto fail;  	} - -	err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); -	if (err) -		goto fail;  	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {  		if (lock->fl_flags & FL_FLOCK) {  			++seen_flock; @@ -226,19 +255,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,  				err = -ENOSPC;  				goto fail;  			} -			err = lock_to_ceph_filelock(lock, &cephlock); +			err = lock_to_ceph_filelock(lock, &flocks[l]);  			if (err)  				goto fail; -			err = ceph_pagelist_append(pagelist, &cephlock, -					   sizeof(struct ceph_filelock)); +			++l;  		} -		if (err) -			goto fail;  	}  fail:  	return err;  } +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. 
+ */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, +			   struct ceph_pagelist *pagelist, +			   int num_fcntl_locks, int num_flock_locks) +{ +	int err = 0; +	__le32 nlocks; + +	nlocks = cpu_to_le32(num_fcntl_locks); +	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); +	if (err) +		goto out_fail; + +	err = ceph_pagelist_append(pagelist, flocks, +				   num_fcntl_locks * sizeof(*flocks)); +	if (err) +		goto out_fail; + +	nlocks = cpu_to_le32(num_flock_locks); +	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); +	if (err) +		goto out_fail; + +	err = ceph_pagelist_append(pagelist, +				   &flocks[num_fcntl_locks], +				   num_flock_locks * sizeof(*flocks)); +out_fail: +	return err; +} +  /*   * Given a pointer to a lock, convert it to a ceph filelock   */ @@ -246,13 +307,11 @@ int lock_to_ceph_filelock(struct file_lock *lock,  			  struct ceph_filelock *cephlock)  {  	int err = 0; -  	cephlock->start = cpu_to_le64(lock->fl_start);  	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);  	cephlock->client = cpu_to_le64(0); -	cephlock->pid = cpu_to_le64(lock->fl_pid); -	cephlock->pid_namespace = -	        cpu_to_le64((u64)(unsigned long)lock->fl_nspid); +	cephlock->pid = cpu_to_le64((u64)lock->fl_pid); +	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));  	switch (lock->fl_type) {  	case F_RDLCK: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3142b15940c..92a2548278f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3,14 +3,15 @@  #include <linux/fs.h>  #include <linux/wait.h>  #include <linux/slab.h> +#include <linux/gfp.h>  #include <linux/sched.h>  #include <linux/debugfs.h>  #include <linux/seq_file.h> -#include <linux/smp_lock.h>  #include "super.h"  #include "mds_client.h" +#include <linux/ceph/ceph_features.h>  #include <linux/ceph/messenger.h>  #include <linux/ceph/decode.h>  #include <linux/ceph/pagelist.h> @@ -43,6 +44,7 @@   */  struct ceph_reconnect_state { +	int 
nr_caps;  	struct ceph_pagelist *pagelist;  	bool flock;  }; @@ -61,7 +63,8 @@ static const struct ceph_connection_operations mds_con_ops;   * parse individual inode info   */  static int parse_reply_info_in(void **p, void *end, -			       struct ceph_mds_reply_info_in *info) +			       struct ceph_mds_reply_info_in *info, +			       u64 features)  {  	int err = -EIO; @@ -75,6 +78,12 @@ static int parse_reply_info_in(void **p, void *end,  	info->symlink = *p;  	*p += info->symlink_len; +	if (features & CEPH_FEATURE_DIRLAYOUTHASH) +		ceph_decode_copy_safe(p, end, &info->dir_layout, +				      sizeof(info->dir_layout), bad); +	else +		memset(&info->dir_layout, 0, sizeof(info->dir_layout)); +  	ceph_decode_32_safe(p, end, info->xattr_len, bad);  	ceph_decode_need(p, end, info->xattr_len, bad);  	info->xattr_data = *p; @@ -89,12 +98,13 @@ bad:   * target inode.   */  static int parse_reply_info_trace(void **p, void *end, -				  struct ceph_mds_reply_info_parsed *info) +				  struct ceph_mds_reply_info_parsed *info, +				  u64 features)  {  	int err;  	if (info->head->is_dentry) { -		err = parse_reply_info_in(p, end, &info->diri); +		err = parse_reply_info_in(p, end, &info->diri, features);  		if (err < 0)  			goto out_bad; @@ -115,7 +125,7 @@ static int parse_reply_info_trace(void **p, void *end,  	}  	if (info->head->is_target) { -		err = parse_reply_info_in(p, end, &info->targeti); +		err = parse_reply_info_in(p, end, &info->targeti, features);  		if (err < 0)  			goto out_bad;  	} @@ -135,7 +145,8 @@ out_bad:   * parse readdir results   */  static int parse_reply_info_dir(void **p, void *end, -				struct ceph_mds_reply_info_parsed *info) +				struct ceph_mds_reply_info_parsed *info, +				u64 features)  {  	u32 num, i = 0;  	int err; @@ -155,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end,  	if (num == 0)  		goto done; -	/* alloc large array */ -	info->dir_nr = num; -	info->dir_in = kcalloc(num, sizeof(*info->dir_in) + -			       
sizeof(*info->dir_dname) + -			       sizeof(*info->dir_dname_len) + -			       sizeof(*info->dir_dlease), -			       GFP_NOFS); -	if (info->dir_in == NULL) { -		err = -ENOMEM; -		goto out_bad; -	} +	BUG_ON(!info->dir_in);  	info->dir_dname = (void *)(info->dir_in + num);  	info->dir_dname_len = (void *)(info->dir_dname + num);  	info->dir_dlease = (void *)(info->dir_dname_len + num); +	if ((unsigned long)(info->dir_dlease + num) > +	    (unsigned long)info->dir_in + info->dir_buf_size) { +		pr_err("dir contents are larger than expected\n"); +		WARN_ON(1); +		goto bad; +	} +	info->dir_nr = num;  	while (num) {  		/* dentry */  		ceph_decode_need(p, end, sizeof(u32)*2, bad); @@ -183,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,  		*p += sizeof(struct ceph_mds_reply_lease);  		/* inode */ -		err = parse_reply_info_in(p, end, &info->dir_in[i]); +		err = parse_reply_info_in(p, end, &info->dir_in[i], features);  		if (err < 0)  			goto out_bad;  		i++; @@ -203,10 +211,74 @@ out_bad:  }  /* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, +				     struct ceph_mds_reply_info_parsed *info, +				     u64 features) +{ +	if (*p + sizeof(*info->filelock_reply) > end) +		goto bad; + +	info->filelock_reply = *p; +	*p += sizeof(*info->filelock_reply); + +	if (unlikely(*p != end)) +		goto bad; +	return 0; + +bad: +	return -EIO; +} + +/* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, +				  struct ceph_mds_reply_info_parsed *info, +				  u64 features) +{ +	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { +		if (*p == end) { +			info->has_create_ino = false; +		} else { +			info->has_create_ino = true; +			info->ino = ceph_decode_64(p); +		} +	} + +	if (unlikely(*p != end)) +		goto bad; +	return 0; + +bad: +	return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, +				  struct ceph_mds_reply_info_parsed *info, +				  u64 
features) +{ +	if (info->head->op == CEPH_MDS_OP_GETFILELOCK) +		return parse_reply_info_filelock(p, end, info, features); +	else if (info->head->op == CEPH_MDS_OP_READDIR || +		 info->head->op == CEPH_MDS_OP_LSSNAP) +		return parse_reply_info_dir(p, end, info, features); +	else if (info->head->op == CEPH_MDS_OP_CREATE) +		return parse_reply_info_create(p, end, info, features); +	else +		return -EIO; +} + +/*   * parse entire mds reply   */  static int parse_reply_info(struct ceph_msg *msg, -			    struct ceph_mds_reply_info_parsed *info) +			    struct ceph_mds_reply_info_parsed *info, +			    u64 features)  {  	void *p, *end;  	u32 len; @@ -219,15 +291,17 @@ static int parse_reply_info(struct ceph_msg *msg,  	/* trace */  	ceph_decode_32_safe(&p, end, len, bad);  	if (len > 0) { -		err = parse_reply_info_trace(&p, p+len, info); +		ceph_decode_need(&p, end, len, bad); +		err = parse_reply_info_trace(&p, p+len, info, features);  		if (err < 0)  			goto out_bad;  	} -	/* dir content */ +	/* extra */  	ceph_decode_32_safe(&p, end, len, bad);  	if (len > 0) { -		err = parse_reply_info_dir(&p, p+len, info); +		ceph_decode_need(&p, end, len, bad); +		err = parse_reply_info_extra(&p, p+len, info, features);  		if (err < 0)  			goto out_bad;  	} @@ -251,7 +325,9 @@ out_bad:  static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)  { -	kfree(info->dir_in); +	if (!info->dir_in) +		return; +	free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));  } @@ -289,10 +365,10 @@ void ceph_put_mds_session(struct ceph_mds_session *s)  	dout("mdsc put_session %p %d -> %d\n", s,  	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);  	if (atomic_dec_and_test(&s->s_ref)) { -		if (s->s_authorizer) -		     s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( -			     s->s_mdsc->fsc->client->monc.auth, -			     s->s_authorizer); +		if (s->s_auth.authorizer) +			ceph_auth_destroy_authorizer( +				s->s_mdsc->fsc->client->monc.auth, +				
s->s_auth.authorizer);  		kfree(s);  	}  } @@ -339,6 +415,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,  {  	struct ceph_mds_session *s; +	if (mds >= mdsc->mdsmap->m_max_mds) +		return ERR_PTR(-EINVAL); +  	s = kzalloc(sizeof(*s), GFP_NOFS);  	if (!s)  		return ERR_PTR(-ENOMEM); @@ -349,15 +428,13 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,  	s->s_seq = 0;  	mutex_init(&s->s_mutex); -	ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); -	s->s_con.private = s; -	s->s_con.ops = &mds_con_ops; -	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; -	s->s_con.peer_name.num = cpu_to_le64(mds); +	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); -	spin_lock_init(&s->s_cap_lock); +	spin_lock_init(&s->s_gen_ttl_lock);  	s->s_cap_gen = 0; -	s->s_cap_ttl = 0; +	s->s_cap_ttl = jiffies - 1; + +	spin_lock_init(&s->s_cap_lock);  	s->s_renew_requested = 0;  	s->s_renew_seq = 0;  	INIT_LIST_HEAD(&s->s_caps); @@ -367,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,  	INIT_LIST_HEAD(&s->s_waiting);  	INIT_LIST_HEAD(&s->s_unsafe);  	s->s_num_cap_releases = 0; +	s->s_cap_reconnect = 0;  	s->s_cap_iterator = NULL;  	INIT_LIST_HEAD(&s->s_cap_releases);  	INIT_LIST_HEAD(&s->s_cap_releases_done); @@ -393,7 +471,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,  	mdsc->sessions[mds] = s;  	atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */ -	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); +	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, +		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));  	return s; @@ -433,29 +512,33 @@ void ceph_mdsc_release_request(struct kref *kref)  	struct ceph_mds_request *req = container_of(kref,  						    struct ceph_mds_request,  						    r_kref); +	destroy_reply_info(&req->r_reply_info);  	if (req->r_request)  		ceph_msg_put(req->r_request); -	if (req->r_reply) 
{ +	if (req->r_reply)  		ceph_msg_put(req->r_reply); -		destroy_reply_info(&req->r_reply_info); -	}  	if (req->r_inode) { -		ceph_put_cap_refs(ceph_inode(req->r_inode), -				  CEPH_CAP_PIN); +		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);  		iput(req->r_inode);  	}  	if (req->r_locked_dir) -		ceph_put_cap_refs(ceph_inode(req->r_locked_dir), -				  CEPH_CAP_PIN); +		ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);  	if (req->r_target_inode)  		iput(req->r_target_inode);  	if (req->r_dentry)  		dput(req->r_dentry); -	if (req->r_old_dentry) { -		ceph_put_cap_refs( -			ceph_inode(req->r_old_dentry->d_parent->d_inode), -			CEPH_CAP_PIN); +	if (req->r_old_dentry)  		dput(req->r_old_dentry); +	if (req->r_old_dentry_dir) { +		/* +		 * track (and drop pins for) r_old_dentry_dir +		 * separately, since r_old_dentry's d_parent may have +		 * changed between the dir mutex being dropped and +		 * this request being freed. +		 */ +		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), +				  CEPH_CAP_PIN); +		iput(req->r_old_dentry_dir);  	}  	kfree(req->r_path1);  	kfree(req->r_path2); @@ -529,9 +612,13 @@ static void __register_request(struct ceph_mds_client *mdsc,  	ceph_mdsc_get_request(req);  	__insert_request(mdsc, req); +	req->r_uid = current_fsuid(); +	req->r_gid = current_fsgid(); +  	if (dir) {  		struct ceph_inode_info *ci = ceph_inode(dir); +		ihold(dir);  		spin_lock(&ci->i_unsafe_lock);  		req->r_unsafe_dir = dir;  		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); @@ -552,8 +639,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,  		spin_lock(&ci->i_unsafe_lock);  		list_del_init(&req->r_unsafe_dir_item);  		spin_unlock(&ci->i_unsafe_lock); + +		iput(req->r_unsafe_dir); +		req->r_unsafe_dir = NULL;  	} +	complete_all(&req->r_safe_completion); +  	ceph_mdsc_put_request(req);  } @@ -565,8 +657,14 @@ static void __unregister_request(struct ceph_mds_client *mdsc,   *   * Called under mdsc->mutex.   
*/ -struct dentry *get_nonsnap_parent(struct dentry *dentry) +static struct dentry *get_nonsnap_parent(struct dentry *dentry)  { +	/* +	 * we don't need to worry about protecting the d_parent access +	 * here because we never renaming inside the snapped namespace +	 * except to resplice to another snapdir, and either the old or new +	 * result is a valid result. +	 */  	while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)  		dentry = dentry->d_parent;  	return dentry; @@ -602,7 +700,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,  	if (req->r_inode) {  		inode = req->r_inode;  	} else if (req->r_dentry) { -		struct inode *dir = req->r_dentry->d_parent->d_inode; +		/* ignore race with rename; old or new d_parent is okay */ +		struct dentry *parent = req->r_dentry->d_parent; +		struct inode *dir = parent->d_inode;  		if (dir->i_sb != mdsc->fsc->sb) {  			/* not this fs! */ @@ -610,18 +710,18 @@ static int __choose_mds(struct ceph_mds_client *mdsc,  		} else if (ceph_snap(dir) != CEPH_NOSNAP) {  			/* direct snapped/virtual snapdir requests  			 * based on parent dir inode */ -			struct dentry *dn = -				get_nonsnap_parent(req->r_dentry->d_parent); +			struct dentry *dn = get_nonsnap_parent(parent);  			inode = dn->d_inode;  			dout("__choose_mds using nonsnap parent %p\n", inode); -		} else if (req->r_dentry->d_inode) { +		} else {  			/* dentry target */  			inode = req->r_dentry->d_inode; -		} else { -			/* dir + name */ -			inode = dir; -			hash = req->r_dentry->d_name.hash; -			is_hash = true; +			if (!inode || mode == USE_AUTH_MDS) { +				/* dir + name */ +				inode = dir; +				hash = ceph_dentry_hash(dir, req->r_dentry); +				is_hash = true; +			}  		}  	} @@ -647,9 +747,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,  				dout("choose_mds %p %llx.%llx "  				     "frag %u mds%d (%d/%d)\n",  				     inode, ceph_vinop(inode), -				     frag.frag, frag.mds, +				     frag.frag, mds,  				     (int)r, frag.ndist); -				
return mds; +				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= +				    CEPH_MDS_STATE_ACTIVE) +					return mds;  			}  			/* since this file/dir wasn't known to be @@ -662,26 +764,28 @@ static int __choose_mds(struct ceph_mds_client *mdsc,  				dout("choose_mds %p %llx.%llx "  				     "frag %u mds%d (auth)\n",  				     inode, ceph_vinop(inode), frag.frag, mds); -				return mds; +				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= +				    CEPH_MDS_STATE_ACTIVE) +					return mds;  			}  		}  	} -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	cap = NULL;  	if (mode == USE_AUTH_MDS)  		cap = ci->i_auth_cap;  	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))  		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);  	if (!cap) { -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		goto random;  	}  	mds = cap->session->s_mds;  	dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",  	     inode, ceph_vinop(inode), mds,  	     cap == ci->i_auth_cap ? "auth " : "", cap); -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return mds;  random: @@ -699,7 +803,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)  	struct ceph_msg *msg;  	struct ceph_mds_session_head *h; -	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); +	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, +			   false);  	if (!msg) {  		pr_err("create_session_msg ENOMEM creating msg\n");  		return NULL; @@ -742,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc,   *   * called under mdsc->mutex   */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ +	struct ceph_mds_session *session; + +	session = __ceph_lookup_mds_session(mdsc, target); +	if (!session) { +		session = register_session(mdsc, target); +		if (IS_ERR(session)) +			return session; +	} +	if (session->s_state == CEPH_MDS_SESSION_NEW || +	    session->s_state == 
CEPH_MDS_SESSION_CLOSING) +		__open_session(mdsc, session); + +	return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ +	struct ceph_mds_session *session; + +	dout("open_export_target_session to mds%d\n", target); + +	mutex_lock(&mdsc->mutex); +	session = __open_export_target_session(mdsc, target); +	mutex_unlock(&mdsc->mutex); + +	return session; +} +  static void __open_export_target_sessions(struct ceph_mds_client *mdsc,  					  struct ceph_mds_session *session)  {  	struct ceph_mds_info *mi;  	struct ceph_mds_session *ts;  	int i, mds = session->s_mds; -	int target;  	if (mds >= mdsc->mdsmap->m_max_mds)  		return; +  	mi = &mdsc->mdsmap->m_info[mds];  	dout("open_export_target_sessions for mds%d (%d targets)\n",  	     session->s_mds, mi->num_export_targets);  	for (i = 0; i < mi->num_export_targets; i++) { -		target = mi->export_targets[i]; -		ts = __ceph_lookup_mds_session(mdsc, target); -		if (!ts) { -			ts = register_session(mdsc, target); -			if (IS_ERR(ts)) -				return; -		} -		if (session->s_state == CEPH_MDS_SESSION_NEW || -		    session->s_state == CEPH_MDS_SESSION_CLOSING) -			__open_session(mdsc, session); -		else -			dout(" mds%d target mds%d %p is %s\n", session->s_mds, -			     i, ts, session_state_name(ts->s_state)); -		ceph_put_mds_session(ts); +		ts = __open_export_target_session(mdsc, mi->export_targets[i]); +		if (!IS_ERR(ts)) +			ceph_put_mds_session(ts);  	}  } @@ -885,8 +1011,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,  	dout("removing cap %p, ci is %p, inode is %p\n",  	     cap, ci, &ci->vfs_inode); -	spin_lock(&inode->i_lock); -	__ceph_remove_cap(cap); +	spin_lock(&ci->i_ceph_lock); +	__ceph_remove_cap(cap, false);  	if (!__ceph_is_any_real_caps(ci)) {  		struct ceph_mds_client *mdsc =  			ceph_sb_to_client(inode->i_sb)->mdsc; @@ -918,7 +1044,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,  	
	}  		spin_unlock(&mdsc->cap_dirty_lock);  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	while (drop--)  		iput(inode);  	return 0; @@ -931,6 +1057,37 @@ static void remove_session_caps(struct ceph_mds_session *session)  {  	dout("remove_session_caps on %p\n", session);  	iterate_session_caps(session, remove_session_caps_cb, NULL); + +	spin_lock(&session->s_cap_lock); +	if (session->s_nr_caps > 0) { +		struct super_block *sb = session->s_mdsc->fsc->sb; +		struct inode *inode; +		struct ceph_cap *cap, *prev = NULL; +		struct ceph_vino vino; +		/* +		 * iterate_session_caps() skips inodes that are being +		 * deleted, we need to wait until deletions are complete. +		 * __wait_on_freeing_inode() is designed for the job, +		 * but it is not exported, so use lookup inode function +		 * to access it. +		 */ +		while (!list_empty(&session->s_caps)) { +			cap = list_entry(session->s_caps.next, +					 struct ceph_cap, session_caps); +			if (cap == prev) +				break; +			prev = cap; +			vino = cap->ci->i_vino; +			spin_unlock(&session->s_cap_lock); + +			inode = ceph_find_inode(sb, vino); +			iput(inode); + +			spin_lock(&session->s_cap_lock); +		} +	} +	spin_unlock(&session->s_cap_lock); +  	BUG_ON(session->s_nr_caps > 0);  	BUG_ON(!list_empty(&session->s_cap_flushing));  	cleanup_cap_releases(session); @@ -949,10 +1106,10 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,  	wake_up_all(&ci->i_cap_wq);  	if (arg) { -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		ci->i_wanted_max_size = 0;  		ci->i_requested_max_size = 0; -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  	}  	return 0;  } @@ -1001,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,  	return 0;  } +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, +			     struct ceph_mds_session *session, u64 seq) +{ +	struct ceph_msg *msg; + +	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", +	     session->s_mds, 
session_state_name(session->s_state), seq); +	msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); +	if (!msg) +		return -ENOMEM; +	ceph_con_send(&session->s_con, msg); +	return 0; +} + +  /*   * Note new cap ttl, and any transition from stale -> not stale (fresh?).   * @@ -1013,8 +1185,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,  	int wake = 0;  	spin_lock(&session->s_cap_lock); -	was_stale = is_renew && (session->s_cap_ttl == 0 || -				 time_after_eq(jiffies, session->s_cap_ttl)); +	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);  	session->s_cap_ttl = session->s_renew_requested +  		mdsc->mdsmap->m_session_timeout*HZ; @@ -1080,31 +1251,36 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)  {  	struct ceph_mds_session *session = arg;  	struct ceph_inode_info *ci = ceph_inode(inode); -	int used, oissued, mine; +	int used, wanted, oissued, mine;  	if (session->s_trim_caps <= 0)  		return -1; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	mine = cap->issued | cap->implemented;  	used = __ceph_caps_used(ci); +	wanted = __ceph_caps_file_wanted(ci);  	oissued = __ceph_caps_issued_other(ci, cap); -	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", +	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",  	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), -	     ceph_cap_string(used)); -	if (ci->i_dirty_caps) -		goto out;   /* dirty caps */ -	if ((used & ~oissued) & mine) +	     ceph_cap_string(used), ceph_cap_string(wanted)); +	if (cap == ci->i_auth_cap) { +		if (ci->i_dirty_caps | ci->i_flushing_caps) +			goto out; +		if ((used | wanted) & CEPH_CAP_ANY_WR) +			goto out; +	} +	if ((used | wanted) & ~oissued & mine)  		goto out;   /* we need these caps */  	session->s_trim_caps--;  	if (oissued) {  		/* we aren't the only cap.. 
just remove us */ -		__ceph_remove_cap(cap); +		__ceph_remove_cap(cap, true);  	} else {  		/* try to drop referring dentries */ -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		d_prune_aliases(inode);  		dout("trim_caps_cb %p cap %p  pruned, count now %d\n",  		     inode, cap, atomic_read(&inode->i_count)); @@ -1112,7 +1288,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)  	}  out: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return 0;  } @@ -1135,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,  			trim_caps - session->s_trim_caps);  		session->s_trim_caps = 0;  	} + +	ceph_add_cap_releases(mdsc, session); +	ceph_send_cap_releases(mdsc, session);  	return 0;  } @@ -1175,7 +1354,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,  	while (session->s_num_cap_releases < session->s_nr_caps + extra) {  		spin_unlock(&session->s_cap_lock);  		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, -				   GFP_NOFS); +				   GFP_NOFS, false);  		if (!msg)  			goto out_unlocked;  		dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1230,7 +1409,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)  					   i_flushing_item);  			struct inode *inode = &ci->vfs_inode; -			spin_lock(&inode->i_lock); +			spin_lock(&ci->i_ceph_lock);  			if (ci->i_cap_flush_seq <= want_flush_seq) {  				dout("check_cap_flush still flushing %p "  				     "seq %lld <= %lld to mds%d\n", inode, @@ -1238,7 +1417,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)  				     session->s_mds);  				ret = 0;  			} -			spin_unlock(&inode->i_lock); +			spin_unlock(&ci->i_ceph_lock);  		}  		mutex_unlock(&session->s_mutex);  		ceph_put_mds_session(session); @@ -1284,16 +1463,19 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,  	unsigned num;  	dout("discard_cap_releases mds%d\n", session->s_mds); -	
spin_lock(&session->s_cap_lock); -	/* zero out the in-progress message */ -	msg = list_first_entry(&session->s_cap_releases, -			       struct ceph_msg, list_head); -	head = msg->front.iov_base; -	num = le32_to_cpu(head->num); -	dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); -	head->num = cpu_to_le32(0); -	session->s_num_cap_releases += num; +	if (!list_empty(&session->s_cap_releases)) { +		/* zero out the in-progress message */ +		msg = list_first_entry(&session->s_cap_releases, +					struct ceph_msg, list_head); +		head = msg->front.iov_base; +		num = le32_to_cpu(head->num); +		dout("discard_cap_releases mds%d %p %u\n", +		     session->s_mds, msg, num); +		head->num = cpu_to_le32(0); +		msg->front.iov_len = sizeof(*head); +		session->s_num_cap_releases += num; +	}  	/* requeue completed messages */  	while (!list_empty(&session->s_cap_releases_done)) { @@ -1310,14 +1492,49 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,  		msg->front.iov_len = sizeof(*head);  		list_add(&msg->list_head, &session->s_cap_releases);  	} - -	spin_unlock(&session->s_cap_lock);  }  /*   * requests   */ +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, +				    struct inode *dir) +{ +	struct ceph_inode_info *ci = ceph_inode(dir); +	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; +	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; +	size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + +		      sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); +	int order, num_entries; + +	spin_lock(&ci->i_ceph_lock); +	num_entries = ci->i_files + ci->i_subdirs; +	spin_unlock(&ci->i_ceph_lock); +	num_entries = max(num_entries, 1); +	num_entries = min(num_entries, opt->max_readdir); + +	order = get_order(size * num_entries); +	while (order >= 0) { +		rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, +							order); +		if (rinfo->dir_in) +			break; +		order--; +	} +	if 
(!rinfo->dir_in) +		return -ENOMEM; + +	num_entries = (PAGE_SIZE << order) / size; +	num_entries = min(num_entries, opt->max_readdir); + +	rinfo->dir_buf_size = PAGE_SIZE << order; +	req->r_num_caps = num_entries + 1; +	req->r_args.readdir.max_entries = cpu_to_le32(num_entries); +	req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); +	return 0; +} +  /*   * Create an mds request.   */ @@ -1341,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)  	init_completion(&req->r_safe_completion);  	INIT_LIST_HEAD(&req->r_unsafe_item); +	req->r_stamp = CURRENT_TIME; +  	req->r_op = op;  	req->r_direct_mode = mode;  	return req; @@ -1384,12 +1603,15 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,  	struct dentry *temp;  	char *path;  	int len, pos; +	unsigned seq;  	if (dentry == NULL)  		return ERR_PTR(-EINVAL);  retry:  	len = 0; +	seq = read_seqbegin(&rename_lock); +	rcu_read_lock();  	for (temp = dentry; !IS_ROOT(temp);) {  		struct inode *inode = temp->d_inode;  		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) @@ -1400,11 +1622,8 @@ retry:  		else  			len += 1 + temp->d_name.len;  		temp = temp->d_parent; -		if (temp == NULL) { -			pr_err("build_path corrupt dentry %p\n", dentry); -			return ERR_PTR(-EINVAL); -		}  	} +	rcu_read_unlock();  	if (len)  		len--;  /* no leading '/' */ @@ -1413,32 +1632,35 @@ retry:  		return ERR_PTR(-ENOMEM);  	pos = len;  	path[pos] = 0;	/* trailing null */ +	rcu_read_lock();  	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { -		struct inode *inode = temp->d_inode; +		struct inode *inode; +		spin_lock(&temp->d_lock); +		inode = temp->d_inode;  		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {  			dout("build_path path+%d: %p SNAPDIR\n",  			     pos, temp);  		} else if (stop_on_nosnap && inode &&  			   ceph_snap(inode) == CEPH_NOSNAP) { +			spin_unlock(&temp->d_lock);  			break;  		} else {  			pos -= temp->d_name.len; -			if (pos < 0) +			if (pos < 0) { +				
spin_unlock(&temp->d_lock);  				break; +			}  			strncpy(path + pos, temp->d_name.name,  				temp->d_name.len);  		} +		spin_unlock(&temp->d_lock);  		if (pos)  			path[--pos] = '/';  		temp = temp->d_parent; -		if (temp == NULL) { -			pr_err("build_path corrupt dentry\n"); -			kfree(path); -			return ERR_PTR(-EINVAL); -		}  	} -	if (pos != 0) { +	rcu_read_unlock(); +	if (pos != 0 || read_seqretry(&rename_lock, seq)) {  		pr_err("build_path did not end path lookup where "  		       "expected, namelen is %d, pos is %d\n", len, pos);  		/* presumably this is only possible if racing with a @@ -1452,7 +1674,7 @@ retry:  	*base = ceph_ino(temp->d_inode);  	*plen = len;  	dout("build_path on %p %d built %llx '%.*s'\n", -	     dentry, atomic_read(&dentry->d_count), *base, len, path); +	     dentry, d_count(dentry), *base, len, path);  	return path;  } @@ -1517,10 +1739,10 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,  		r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);  		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,  		     *ppath); -	} else if (rpath) { +	} else if (rpath || rino) {  		*ino = rino;  		*ppath = rpath; -		*pathlen = strlen(rpath); +		*pathlen = rpath ? 
strlen(rpath) : 0;  		dout(" path %.*s\n", *pathlen, rpath);  	} @@ -1563,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,  	}  	len = sizeof(*head) + -		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); +		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + +		sizeof(struct timespec);  	/* calculate (max) length for cap releases */  	len += sizeof(struct ceph_mds_request_release) * @@ -1574,12 +1797,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,  	if (req->r_old_dentry_drop)  		len += req->r_old_dentry->d_name.len; -	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); +	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);  	if (!msg) {  		msg = ERR_PTR(-ENOMEM);  		goto out_free2;  	} +	msg->hdr.version = 2;  	msg->hdr.tid = cpu_to_le64(req->r_tid);  	head = msg->front.iov_base; @@ -1588,8 +1812,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,  	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);  	head->op = cpu_to_le32(req->r_op); -	head->caller_uid = cpu_to_le32(current_fsuid()); -	head->caller_gid = cpu_to_le32(current_fsgid()); +	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); +	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));  	head->args = req->r_args;  	ceph_encode_filepath(&p, end, ino1, path1); @@ -1616,12 +1840,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,  		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);  	head->num_releases = cpu_to_le16(releases); +	/* time stamp */ +	ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); +  	BUG_ON(p > end);  	msg->front.iov_len = p - msg->front.iov_base;  	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); -	msg->pages = req->r_pages; -	msg->nr_pages = req->r_num_pages; +	if (req->r_data_len) { +		/* outbound data set only by ceph_sync_setxattr() */ +		
BUG_ON(!req->r_pages); +		ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); +	} +  	msg->hdr.data_len = cpu_to_le32(req->r_data_len);  	msg->hdr.data_off = cpu_to_le16(0); @@ -1659,7 +1890,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,  	struct ceph_msg *msg;  	int flags = 0; -	req->r_mds = mds;  	req->r_attempts++;  	if (req->r_inode) {  		struct ceph_cap *cap = @@ -1736,8 +1966,11 @@ static int __do_request(struct ceph_mds_client *mdsc,  	int mds = -1;  	int err = -EAGAIN; -	if (req->r_err || req->r_got_result) +	if (req->r_err || req->r_got_result) { +		if (req->r_aborted) +			__unregister_request(mdsc, req);  		goto out; +	}  	if (req->r_timeout &&  	    time_after_eq(jiffies, req->r_started + req->r_timeout)) { @@ -1746,6 +1979,8 @@ static int __do_request(struct ceph_mds_client *mdsc,  		goto finish;  	} +	put_request_session(req); +  	mds = __choose_mds(mdsc, req);  	if (mds < 0 ||  	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { @@ -1763,6 +1998,8 @@ static int __do_request(struct ceph_mds_client *mdsc,  			goto finish;  		}  	} +	req->r_session = get_session(session); +  	dout("do_request mds%d session %p state %s\n", mds, session,  	     session_state_name(session->s_state));  	if (session->s_state != CEPH_MDS_SESSION_OPEN && @@ -1775,7 +2012,6 @@ static int __do_request(struct ceph_mds_client *mdsc,  	}  	/* send request */ -	req->r_session = get_session(session);  	req->r_resend_mds = -1;   /* forget any previous mds hint */  	if (req->r_request_started == 0)   /* note request start time */ @@ -1804,10 +2040,16 @@ finish:  static void __wake_requests(struct ceph_mds_client *mdsc,  			    struct list_head *head)  { -	struct ceph_mds_request *req, *nreq; +	struct ceph_mds_request *req; +	LIST_HEAD(tmp_list); + +	list_splice_init(head, &tmp_list); -	list_for_each_entry_safe(req, nreq, head, r_wait) { +	while (!list_empty(&tmp_list)) { +		req = list_entry(tmp_list.next, +				 struct 
ceph_mds_request, r_wait);  		list_del_init(&req->r_wait); +		dout(" wake request %p tid %llu\n", req, req->r_tid);  		__do_request(mdsc, req);  	}  } @@ -1829,7 +2071,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)  		if (req->r_session &&  		    req->r_session->s_mds == mds) {  			dout(" kicking tid %llu\n", req->r_tid); -			put_request_session(req);  			__do_request(mdsc, req);  		}  	} @@ -1862,10 +2103,9 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,  		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);  	if (req->r_locked_dir)  		ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); -	if (req->r_old_dentry) -		ceph_get_cap_refs( -			ceph_inode(req->r_old_dentry->d_parent->d_inode), -			CEPH_CAP_PIN); +	if (req->r_old_dentry_dir) +		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), +				  CEPH_CAP_PIN);  	/* issue */  	mutex_lock(&mdsc->mutex); @@ -1923,20 +2163,16 @@ out:  }  /* - * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir's completeness, dentry lease state on an aborted MDS   * namespace request.   */  void ceph_invalidate_dir_request(struct ceph_mds_request *req)  {  	struct inode *inode = req->r_locked_dir; -	struct ceph_inode_info *ci = ceph_inode(inode); -	dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); -	spin_lock(&inode->i_lock); -	ci->i_ceph_flags &= ~CEPH_I_COMPLETE; -	ci->i_release_count++; -	spin_unlock(&inode->i_lock); +	dout("invalidate_dir_request %p (complete, lease(s))\n", inode); +	ceph_dir_clear_complete(inode);  	if (req->r_dentry)  		ceph_invalidate_dentry_lease(req->r_dentry);  	if (req->r_old_dentry) @@ -1989,13 +2225,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)  	/* dup? 
*/  	if ((req->r_got_unsafe && !head->safe) ||  	    (req->r_got_safe && head->safe)) { -		pr_warning("got a dup %s reply on %llu from mds%d\n", +		pr_warn("got a dup %s reply on %llu from mds%d\n",  			   head->safe ? "safe" : "unsafe", tid, mds);  		mutex_unlock(&mdsc->mutex);  		goto out;  	}  	if (req->r_got_safe && !head->safe) { -		pr_warning("got unsafe after safe on %llu from mds%d\n", +		pr_warn("got unsafe after safe on %llu from mds%d\n",  			   tid, mds);  		mutex_unlock(&mdsc->mutex);  		goto out; @@ -2012,23 +2248,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)  	 */  	if (result == -ESTALE) {  		dout("got ESTALE on request %llu", req->r_tid); -		if (!req->r_inode) { -			/* do nothing; not an authority problem */ -		} else if (req->r_direct_mode != USE_AUTH_MDS) { +		if (req->r_direct_mode != USE_AUTH_MDS) {  			dout("not using auth, setting for that now");  			req->r_direct_mode = USE_AUTH_MDS;  			__do_request(mdsc, req);  			mutex_unlock(&mdsc->mutex);  			goto out;  		} else  { -			struct ceph_inode_info *ci = ceph_inode(req->r_inode); -			struct ceph_cap *cap = -				ceph_get_cap_for_mds(ci, req->r_mds);; - -			dout("already using auth"); -			if ((!cap || cap != ci->i_auth_cap) || -			    (cap->mseq != req->r_sent_on_mseq)) { -				dout("but cap changed, so resending"); +			int mds = __choose_mds(mdsc, req); +			if (mds >= 0 && mds != req->r_session->s_mds) { +				dout("but auth changed, so resending");  				__do_request(mdsc, req);  				mutex_unlock(&mdsc->mutex);  				goto out; @@ -2041,7 +2270,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)  	if (head->safe) {  		req->r_got_safe = true;  		__unregister_request(mdsc, req); -		complete_all(&req->r_safe_completion);  		if (req->r_got_unsafe) {  			/* @@ -2067,12 +2295,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)  	dout("handle_reply tid %lld result %d\n", tid, result);  	rinfo = 
&req->r_reply_info; -	err = parse_reply_info(msg, rinfo); +	err = parse_reply_info(msg, rinfo, session->s_con.peer_features);  	mutex_unlock(&mdsc->mutex);  	mutex_lock(&session->s_mutex);  	if (err < 0) { -		pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); +		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);  		ceph_msg_dump(msg);  		goto out_err;  	} @@ -2092,7 +2320,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)  	mutex_lock(&req->r_fill_mutex);  	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);  	if (err == 0) { -		if (result == 0 && rinfo->dir_nr) +		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || +				    req->r_op == CEPH_MDS_OP_LSSNAP))  			ceph_readdir_prepopulate(req, req->r_session);  		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);  	} @@ -2242,10 +2471,10 @@ static void handle_session(struct ceph_mds_session *session,  	case CEPH_SESSION_STALE:  		pr_info("mds%d caps went stale, renewing\n",  			session->s_mds); -		spin_lock(&session->s_cap_lock); +		spin_lock(&session->s_gen_ttl_lock);  		session->s_cap_gen++; -		session->s_cap_ttl = 0; -		spin_unlock(&session->s_cap_lock); +		session->s_cap_ttl = jiffies - 1; +		spin_unlock(&session->s_gen_ttl_lock);  		send_renew_caps(mdsc, session);  		break; @@ -2253,6 +2482,10 @@ static void handle_session(struct ceph_mds_session *session,  		trim_caps(mdsc, session, le32_to_cpu(h->max_caps));  		break; +	case CEPH_SESSION_FLUSHMSG: +		send_flushmsg_ack(mdsc, session, seq); +		break; +  	default:  		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);  		WARN_ON(1); @@ -2339,9 +2572,11 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,  	if (err)  		goto out_free; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	cap->seq = 0;        /* reset cap seq */  	cap->issue_seq = 0;  /* and issue_seq */ +	cap->mseq = 0;       /* and migrate_seq */ +	cap->cap_gen = 
cap->session->s_cap_gen;  	if (recon_state->flock) {  		rec.v2.cap_id = cpu_to_le64(cap->cap_id); @@ -2362,43 +2597,50 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,  		rec.v1.pathbase = cpu_to_le64(pathbase);  		reclen = sizeof(rec.v1);  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	if (recon_state->flock) {  		int num_fcntl_locks, num_flock_locks; -		struct ceph_pagelist_cursor trunc_point; - -		ceph_pagelist_set_cursor(pagelist, &trunc_point); -		do { -			lock_flocks(); -			ceph_count_locks(inode, &num_fcntl_locks, -					 &num_flock_locks); -			rec.v2.flock_len = (2*sizeof(u32) + -					    (num_fcntl_locks+num_flock_locks) * -					    sizeof(struct ceph_filelock)); -			unlock_flocks(); - -			/* pre-alloc pagelist */ -			ceph_pagelist_truncate(pagelist, &trunc_point); -			err = ceph_pagelist_append(pagelist, &rec, reclen); -			if (!err) -				err = ceph_pagelist_reserve(pagelist, -							    rec.v2.flock_len); - -			/* encode locks */ -			if (!err) { -				lock_flocks(); -				err = ceph_encode_locks(inode, -							pagelist, -							num_fcntl_locks, -							num_flock_locks); -				unlock_flocks(); -			} -		} while (err == -ENOSPC); +		struct ceph_filelock *flocks; + +encode_again: +		spin_lock(&inode->i_lock); +		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); +		spin_unlock(&inode->i_lock); +		flocks = kmalloc((num_fcntl_locks+num_flock_locks) * +				 sizeof(struct ceph_filelock), GFP_NOFS); +		if (!flocks) { +			err = -ENOMEM; +			goto out_free; +		} +		spin_lock(&inode->i_lock); +		err = ceph_encode_locks_to_buffer(inode, flocks, +						  num_fcntl_locks, +						  num_flock_locks); +		spin_unlock(&inode->i_lock); +		if (err) { +			kfree(flocks); +			if (err == -ENOSPC) +				goto encode_again; +			goto out_free; +		} +		/* +		 * number of encoded locks is stable, so copy to pagelist +		 */ +		rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + +				    (num_fcntl_locks+num_flock_locks) * +				    sizeof(struct 
ceph_filelock)); +		err = ceph_pagelist_append(pagelist, &rec, reclen); +		if (!err) +			err = ceph_locks_to_pagelist(flocks, pagelist, +						     num_fcntl_locks, +						     num_flock_locks); +		kfree(flocks);  	} else {  		err = ceph_pagelist_append(pagelist, &rec, reclen);  	} +	recon_state->nr_caps++;  out_free:  	kfree(path);  out_dput: @@ -2426,6 +2668,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,  	struct rb_node *p;  	int mds = session->s_mds;  	int err = -ENOMEM; +	int s_nr_caps;  	struct ceph_pagelist *pagelist;  	struct ceph_reconnect_state recon_state; @@ -2436,7 +2679,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,  		goto fail_nopagelist;  	ceph_pagelist_init(pagelist); -	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); +	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);  	if (!reply)  		goto fail_nomsg; @@ -2444,7 +2687,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,  	session->s_state = CEPH_MDS_SESSION_RECONNECTING;  	session->s_seq = 0; +	ceph_con_close(&session->s_con);  	ceph_con_open(&session->s_con, +		      CEPH_ENTITY_TYPE_MDS, mds,  		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));  	/* replay unsafe requests */ @@ -2455,20 +2700,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,  	dout("session %p state %s\n", session,  	     session_state_name(session->s_state)); +	spin_lock(&session->s_gen_ttl_lock); +	session->s_cap_gen++; +	spin_unlock(&session->s_gen_ttl_lock); + +	spin_lock(&session->s_cap_lock); +	/* +	 * notify __ceph_remove_cap() that we are composing cap reconnect. +	 * If a cap get released before being added to the cap reconnect, +	 * __ceph_remove_cap() should skip queuing cap release. 
+	 */ +	session->s_cap_reconnect = 1;  	/* drop old cap expires; we're about to reestablish that state */  	discard_cap_releases(mdsc, session); +	spin_unlock(&session->s_cap_lock);  	/* traverse this session's caps */ -	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); +	s_nr_caps = session->s_nr_caps; +	err = ceph_pagelist_encode_32(pagelist, s_nr_caps);  	if (err)  		goto fail; +	recon_state.nr_caps = 0;  	recon_state.pagelist = pagelist;  	recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;  	err = iterate_session_caps(session, encode_caps_cb, &recon_state);  	if (err < 0)  		goto fail; +	spin_lock(&session->s_cap_lock); +	session->s_cap_reconnect = 0; +	spin_unlock(&session->s_cap_lock); +  	/*  	 * snaprealms.  we provide mds with the ino, seq (version), and  	 * parent for all of our realms.  If the mds has any newer info, @@ -2489,11 +2752,20 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,  			goto fail;  	} -	reply->pagelist = pagelist;  	if (recon_state.flock)  		reply->hdr.version = cpu_to_le16(2); + +	/* raced with cap release? */ +	if (s_nr_caps != recon_state.nr_caps) { +		struct page *page = list_first_entry(&pagelist->head, +						     struct page, lru); +		__le32 *addr = kmap_atomic(page); +		*addr = cpu_to_le32(recon_state.nr_caps); +		kunmap_atomic(addr); +	} +  	reply->hdr.data_len = cpu_to_le32(pagelist->length); -	reply->nr_pages = calc_pages_for(0, pagelist->length); +	ceph_msg_data_add_pagelist(reply, pagelist);  	ceph_con_send(&session->s_con, reply);  	mutex_unlock(&session->s_mutex); @@ -2549,7 +2821,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,  		     ceph_mdsmap_is_laggy(newmap, i) ? 
" (laggy)" : "",  		     session_state_name(s->s_state)); -		if (memcmp(ceph_mdsmap_get_addr(oldmap, i), +		if (i >= newmap->m_max_mds || +		    memcmp(ceph_mdsmap_get_addr(oldmap, i),  			   ceph_mdsmap_get_addr(newmap, i),  			   sizeof(struct ceph_entity_addr))) {  			if (s->s_state == CEPH_MDS_SESSION_OPENING) { @@ -2636,14 +2909,12 @@ static void handle_lease(struct ceph_mds_client *mdsc,  {  	struct super_block *sb = mdsc->fsc->sb;  	struct inode *inode; -	struct ceph_inode_info *ci;  	struct dentry *parent, *dentry;  	struct ceph_dentry_info *di;  	int mds = session->s_mds;  	struct ceph_mds_lease *h = msg->front.iov_base;  	u32 seq;  	struct ceph_vino vino; -	int mask;  	struct qstr dname;  	int release = 0; @@ -2654,7 +2925,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,  		goto bad;  	vino.ino = le64_to_cpu(h->ino);  	vino.snap = CEPH_NOSNAP; -	mask = le16_to_cpu(h->mask);  	seq = le32_to_cpu(h->seq);  	dname.name = (void *)h + sizeof(*h) + sizeof(u32);  	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); @@ -2666,14 +2936,13 @@ static void handle_lease(struct ceph_mds_client *mdsc,  	/* lookup inode */  	inode = ceph_find_inode(sb, vino); -	dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", -	     ceph_lease_op_name(h->action), mask, vino.ino, inode, +	dout("handle_lease %s, ino %llx %p %.*s\n", +	     ceph_lease_op_name(h->action), vino.ino, inode,  	     dname.len, dname.name);  	if (inode == NULL) {  		dout("handle_lease no inode %llx\n", vino.ino);  		goto release;  	} -	ci = ceph_inode(inode);  	/* dentry */  	parent = d_find_alias(inode); @@ -2692,7 +2961,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,  	di = ceph_dentry(dentry);  	switch (h->action) {  	case CEPH_MDS_LEASE_REVOKE: -		if (di && di->lease_session == session) { +		if (di->lease_session == session) {  			if (ceph_seq_cmp(di->lease_seq, seq) > 0)  				h->seq = cpu_to_le32(di->lease_seq);  			__ceph_mdsc_drop_dentry_lease(dentry); @@ -2701,7 +2970,7 @@ 
static void handle_lease(struct ceph_mds_client *mdsc,  		break;  	case CEPH_MDS_LEASE_RENEW: -		if (di && di->lease_session == session && +		if (di->lease_session == session &&  		    di->lease_gen == session->s_cap_gen &&  		    di->lease_renew_from &&  		    di->lease_renew_after == 0) { @@ -2753,12 +3022,11 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,  	dnamelen = dentry->d_name.len;  	len += dnamelen; -	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); +	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);  	if (!msg)  		return;  	lease = msg->front.iov_base;  	lease->action = action; -	lease->mask = cpu_to_le16(1);  	lease->ino = cpu_to_le64(ceph_vino(inode).ino);  	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);  	lease->seq = cpu_to_le32(seq); @@ -2780,7 +3048,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,   * Pass @inode always, @dentry is optional.   */  void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, -			     struct dentry *dentry, int mask) +			     struct dentry *dentry)  {  	struct ceph_dentry_info *di;  	struct ceph_mds_session *session; @@ -2788,7 +3056,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,  	BUG_ON(inode == NULL);  	BUG_ON(dentry == NULL); -	BUG_ON(mask == 0);  	/* is dentry lease valid? 
*/  	spin_lock(&dentry->d_lock); @@ -2798,8 +3065,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,  	    di->lease_gen != di->lease_session->s_cap_gen ||  	    !time_before(jiffies, dentry->d_time)) {  		dout("lease_release inode %p dentry %p -- " -		     "no lease on %d\n", -		     inode, dentry, mask); +		     "no lease\n", +		     inode, dentry);  		spin_unlock(&dentry->d_lock);  		return;  	} @@ -2810,8 +3077,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,  	__ceph_mdsc_drop_dentry_lease(dentry);  	spin_unlock(&dentry->d_lock); -	dout("lease_release inode %p dentry %p mask %d to mds%d\n", -	     inode, dentry, mask, session->s_mds); +	dout("lease_release inode %p dentry %p to mds%d\n", +	     inode, dentry, session->s_mds);  	ceph_mdsc_lease_send_msg(session, inode, dentry,  				 CEPH_MDS_LEASE_RELEASE, seq);  	ceph_put_mds_session(session); @@ -2924,8 +3191,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)  	fsc->mdsc = mdsc;  	mutex_init(&mdsc->mutex);  	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); -	if (mdsc->mdsmap == NULL) +	if (mdsc->mdsmap == NULL) { +		kfree(mdsc);  		return -ENOMEM; +	}  	init_completion(&mdsc->safe_umount_waiters);  	init_waitqueue_head(&mdsc->session_close_wq); @@ -2947,6 +3216,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)  	spin_lock_init(&mdsc->snap_flush_lock);  	mdsc->cap_flush_seq = 0;  	INIT_LIST_HEAD(&mdsc->cap_dirty); +	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);  	mdsc->num_cap_flushing = 0;  	spin_lock_init(&mdsc->cap_dirty_lock);  	init_waitqueue_head(&mdsc->cap_flushing_wq); @@ -3076,7 +3346,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)  /*   * true if all sessions are closed, or we force unmount   */ -bool done_closing_sessions(struct ceph_mds_client *mdsc) +static bool done_closing_sessions(struct ceph_mds_client *mdsc)  {  	int i, n = 0; @@ -3160,9 +3430,15 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)  {  	
struct ceph_mds_client *mdsc = fsc->mdsc; +	dout("mdsc_destroy %p\n", mdsc);  	ceph_mdsc_stop(mdsc); + +	/* flush out any connection work with references to us */ +	ceph_msgr_flush(); +  	fsc->mdsc = NULL;  	kfree(mdsc); +	dout("mdsc_destroy %p done\n", mdsc);  } @@ -3243,8 +3519,8 @@ static void con_put(struct ceph_connection *con)  {  	struct ceph_mds_session *s = con->private; +	dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);  	ceph_put_mds_session(s); -	dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));  }  /* @@ -3256,7 +3532,7 @@ static void peer_reset(struct ceph_connection *con)  	struct ceph_mds_session *s = con->private;  	struct ceph_mds_client *mdsc = s->s_mdsc; -	pr_warning("mds%d closed our session\n", s->s_mds); +	pr_warn("mds%d closed our session\n", s->s_mds);  	send_mds_reconnect(mdsc, s);  } @@ -3307,39 +3583,37 @@ out:  /*   * authentication   */ -static int get_authorizer(struct ceph_connection *con, -			  void **buf, int *len, int *proto, -			  void **reply_buf, int *reply_len, int force_new) + +/* + * Note: returned pointer is the address of a structure that's + * managed separately.  Caller must *not* attempt to free it. 
+ */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, +					int *proto, int force_new)  {  	struct ceph_mds_session *s = con->private;  	struct ceph_mds_client *mdsc = s->s_mdsc;  	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; -	int ret = 0; - -	if (force_new && s->s_authorizer) { -		ac->ops->destroy_authorizer(ac, s->s_authorizer); -		s->s_authorizer = NULL; -	} -	if (s->s_authorizer == NULL) { -		if (ac->ops->create_authorizer) { -			ret = ac->ops->create_authorizer( -				ac, CEPH_ENTITY_TYPE_MDS, -				&s->s_authorizer, -				&s->s_authorizer_buf, -				&s->s_authorizer_buf_len, -				&s->s_authorizer_reply_buf, -				&s->s_authorizer_reply_buf_len); -			if (ret) -				return ret; -		} -	} +	struct ceph_auth_handshake *auth = &s->s_auth; +	if (force_new && auth->authorizer) { +		ceph_auth_destroy_authorizer(ac, auth->authorizer); +		auth->authorizer = NULL; +	} +	if (!auth->authorizer) { +		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, +						      auth); +		if (ret) +			return ERR_PTR(ret); +	} else { +		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, +						      auth); +		if (ret) +			return ERR_PTR(ret); +	}  	*proto = ac->protocol; -	*buf = s->s_authorizer_buf; -	*len = s->s_authorizer_buf_len; -	*reply_buf = s->s_authorizer_reply_buf; -	*reply_len = s->s_authorizer_reply_buf_len; -	return 0; + +	return auth;  } @@ -3349,7 +3623,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)  	struct ceph_mds_client *mdsc = s->s_mdsc;  	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; -	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); +	return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);  }  static int invalidate_authorizer(struct ceph_connection *con) @@ -3358,12 +3632,32 @@ static int invalidate_authorizer(struct ceph_connection *con)  	struct ceph_mds_client *mdsc = s->s_mdsc;  	struct ceph_auth_client *ac = 
mdsc->fsc->client->monc.auth; -	if (ac->ops->invalidate_authorizer) -		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); +	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);  	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);  } +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, +				struct ceph_msg_header *hdr, int *skip) +{ +	struct ceph_msg *msg; +	int type = (int) le16_to_cpu(hdr->type); +	int front_len = (int) le32_to_cpu(hdr->front_len); + +	if (con->in_msg) +		return con->in_msg; + +	*skip = 0; +	msg = ceph_msg_new(type, front_len, GFP_NOFS, false); +	if (!msg) { +		pr_err("unable to allocate msg type %d len %d\n", +		       type, front_len); +		return NULL; +	} + +	return msg; +} +  static const struct ceph_connection_operations mds_con_ops = {  	.get = con_get,  	.put = con_put, @@ -3372,6 +3666,7 @@ static const struct ceph_connection_operations mds_con_ops = {  	.verify_authorizer_reply = verify_authorizer_reply,  	.invalidate_authorizer = invalidate_authorizer,  	.peer_reset = peer_reset, +	.alloc_msg = mds_alloc_msg,  };  /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d66d63c7235..e00737cf523 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -11,6 +11,7 @@  #include <linux/ceph/types.h>  #include <linux/ceph/messenger.h>  #include <linux/ceph/mdsmap.h> +#include <linux/ceph/auth.h>  /*   * Some lock dependencies: @@ -20,7 +21,7 @@   *   *         mdsc->snap_rwsem   * - *         inode->i_lock + *         ci->i_ceph_lock   *                 mdsc->snap_flush_lock   *                 mdsc->cap_delay_lock   * @@ -35,6 +36,7 @@ struct ceph_cap;   */  struct ceph_mds_reply_info_in {  	struct ceph_mds_reply_inode *in; +	struct ceph_dir_layout dir_layout;  	u32 symlink_len;  	char *symlink;  	u32 xattr_len; @@ -42,26 +44,44 @@ struct ceph_mds_reply_info_in {  };  /* - * parsed info about an mds reply, including information about the - * target inode and/or its parent directory and 
dentry, and directory - * contents (for readdir results). + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results).   */  struct ceph_mds_reply_info_parsed {  	struct ceph_mds_reply_head    *head; +	/* trace */  	struct ceph_mds_reply_info_in diri, targeti;  	struct ceph_mds_reply_dirfrag *dirfrag;  	char                          *dname;  	u32                           dname_len;  	struct ceph_mds_reply_lease   *dlease; -	struct ceph_mds_reply_dirfrag *dir_dir; -	int                           dir_nr; -	char                          **dir_dname; -	u32                           *dir_dname_len; -	struct ceph_mds_reply_lease   **dir_dlease; -	struct ceph_mds_reply_info_in *dir_in; -	u8                            dir_complete, dir_end; +	/* extra */ +	union { +		/* for fcntl F_GETLK results */ +		struct ceph_filelock *filelock_reply; + +		/* for readdir results */ +		struct { +			struct ceph_mds_reply_dirfrag *dir_dir; +			size_t			      dir_buf_size; +			int                           dir_nr; +			char                          **dir_dname; +			u32                           *dir_dname_len; +			struct ceph_mds_reply_lease   **dir_dlease; +			struct ceph_mds_reply_info_in *dir_in; +			u8                            dir_complete, dir_end; +		}; + +		/* for create results */ +		struct { +			bool has_create_ino; +			u64 ino; +		}; +	};  	/* encoded blob describing snapshot contexts for certain  	   operations (e.g., open) */ @@ -101,17 +121,19 @@ struct ceph_mds_session {  	struct ceph_connection s_con; -	struct ceph_authorizer *s_authorizer; -	void             *s_authorizer_buf, *s_authorizer_reply_buf; -	size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len; +	struct ceph_auth_handshake s_auth; -	/* protected by s_cap_lock */ -	spinlock_t        s_cap_lock; +	/* protected by 
s_gen_ttl_lock */ +	spinlock_t        s_gen_ttl_lock;  	u32               s_cap_gen;  /* inc each time we get mds stale msg */  	unsigned long     s_cap_ttl;  /* when session caps expire */ + +	/* protected by s_cap_lock */ +	spinlock_t        s_cap_lock;  	struct list_head  s_caps;     /* all caps issued by this session */  	int               s_nr_caps, s_trim_caps;  	int               s_num_cap_releases; +	int		  s_cap_reconnect;  	struct list_head  s_cap_releases; /* waiting cap_release messages */  	struct list_head  s_cap_releases_done; /* ready to send */  	struct ceph_cap  *s_cap_iterator; @@ -154,12 +176,12 @@ struct ceph_mds_request {  	struct ceph_mds_client *r_mdsc;  	int r_op;                    /* mds op code */ -	int r_mds;  	/* operation on what? */  	struct inode *r_inode;              /* arg1 */  	struct dentry *r_dentry;            /* arg1 */  	struct dentry *r_old_dentry;        /* arg2: rename from or link from */ +	struct inode *r_old_dentry_dir;     /* arg2: old dentry's parent dir */  	char *r_path1, *r_path2;  	struct ceph_vino r_ino1, r_ino2; @@ -170,6 +192,9 @@ struct ceph_mds_request {  	union ceph_mds_request_args r_args;  	int r_fmode;        /* file mode, if expecting cap */ +	kuid_t r_uid; +	kgid_t r_gid; +	struct timespec r_stamp;  	/* for choosing which mds to send this request to */  	int r_direct_mode; @@ -265,6 +290,7 @@ struct ceph_mds_client {  	u64               cap_flush_seq;  	struct list_head  cap_dirty;        /* inodes with dirty caps */ +	struct list_head  cap_dirty_migrating; /* ...that are migration... 
*/  	int               num_cap_flushing; /* # caps we are flushing */  	spinlock_t        cap_dirty_lock;   /* protects above items */  	wait_queue_head_t cap_flushing_wq; @@ -319,10 +345,11 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);  extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,  				    struct inode *inode, -				    struct dentry *dn, int mask); +				    struct dentry *dn);  extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); - +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, +					   struct inode *dir);  extern struct ceph_mds_request *  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, @@ -359,6 +386,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,  extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,  				 struct ceph_msg *msg); +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);  extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,  					  struct ceph_mds_session *session); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 73b7d44e8a3..261531e55e9 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)  {  	int n = 0;  	int i; -	char r; + +	/* special case for one mds */ +	if (1 == m->m_max_mds && m->m_info[0].state > 0) +		return 0;  	/* count */  	for (i = 0; i < m->m_max_mds; i++) @@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)  		return -1;  	/* pick */ -	get_random_bytes(&r, 1); -	n = r % n; +	n = prandom_u32() % n;  	i = 0;  	for (i = 0; n > 0; i++, n--)  		while (m->m_info[i].state <= 0) @@ -59,6 +61,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)  		return ERR_PTR(-ENOMEM);  	ceph_decode_16_safe(p, end, version, bad); +	if (version > 3) { +		
pr_warn("got mdsmap version %d > 3, failing", version); +		goto bad; +	}  	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);  	m->m_epoch = ceph_decode_32(p); @@ -86,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)  		u32 num_export_targets;  		void *pexport_targets = NULL;  		struct ceph_timespec laggy_since; +		struct ceph_mds_info *info;  		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);  		global_id = ceph_decode_64(p); @@ -120,37 +127,40 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)  		     i+1, n, global_id, mds, inc,  		     ceph_pr_addr(&addr.in_addr),  		     ceph_mds_state_name(state)); -		if (mds >= 0 && mds < m->m_max_mds && state > 0) { -			m->m_info[mds].global_id = global_id; -			m->m_info[mds].state = state; -			m->m_info[mds].addr = addr; -			m->m_info[mds].laggy = -				(laggy_since.tv_sec != 0 || -				 laggy_since.tv_nsec != 0); -			m->m_info[mds].num_export_targets = num_export_targets; -			if (num_export_targets) { -				m->m_info[mds].export_targets = -					kcalloc(num_export_targets, sizeof(u32), -						GFP_NOFS); -				for (j = 0; j < num_export_targets; j++) -					m->m_info[mds].export_targets[j] = -					       ceph_decode_32(&pexport_targets); -			} else { -				m->m_info[mds].export_targets = NULL; -			} + +		if (mds < 0 || mds >= m->m_max_mds || state <= 0) +			continue; + +		info = &m->m_info[mds]; +		info->global_id = global_id; +		info->state = state; +		info->addr = addr; +		info->laggy = (laggy_since.tv_sec != 0 || +			       laggy_since.tv_nsec != 0); +		info->num_export_targets = num_export_targets; +		if (num_export_targets) { +			info->export_targets = kcalloc(num_export_targets, +						       sizeof(u32), GFP_NOFS); +			if (info->export_targets == NULL) +				goto badmem; +			for (j = 0; j < num_export_targets; j++) +				info->export_targets[j] = +				       ceph_decode_32(&pexport_targets); +		} else { +			info->export_targets = NULL;  		}  	}  	/* pg_pools */  	
ceph_decode_32_safe(p, end, n, bad);  	m->m_num_data_pg_pools = n; -	m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); +	m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);  	if (!m->m_data_pg_pools)  		goto badmem; -	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); +	ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);  	for (i = 0; i < n; i++) -		m->m_data_pg_pools[i] = ceph_decode_32(p); -	m->m_cas_pg_pool = ceph_decode_32(p); +		m->m_data_pg_pools[i] = ceph_decode_64(p); +	m->m_cas_pg_pool = ceph_decode_64(p);  	/* ok, we don't care about the rest. */  	dout("mdsmap_decode success epoch %u\n", m->m_epoch); @@ -164,7 +174,7 @@ bad:  		       DUMP_PREFIX_OFFSET, 16, 1,  		       start, end - start, true);  	ceph_mdsmap_destroy(m); -	return ERR_PTR(-EINVAL); +	return ERR_PTR(err);  }  void ceph_mdsmap_destroy(struct ceph_mdsmap *m) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 39c243acd06..f01645a2775 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -206,7 +206,7 @@ void ceph_put_snap_realm(struct ceph_mds_client *mdsc,  		up_write(&mdsc->snap_rwsem);  	} else {  		spin_lock(&mdsc->snap_empty_lock); -		list_add(&mdsc->snap_empty, &realm->empty_item); +		list_add(&realm->empty_item, &mdsc->snap_empty);  		spin_unlock(&mdsc->snap_empty_lock);  	}  } @@ -296,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)  	struct ceph_snap_realm *parent = realm->parent;  	struct ceph_snap_context *snapc;  	int err = 0; -	int i; -	int num = realm->num_prior_parent_snaps + realm->num_snaps; +	u32 num = realm->num_prior_parent_snaps + realm->num_snaps;  	/*  	 * build parent context, if it hasn't been built. 
@@ -321,28 +320,29 @@ static int build_snap_context(struct ceph_snap_realm *realm)  	    realm->cached_context->seq == realm->seq &&  	    (!parent ||  	     realm->cached_context->seq >= parent->cached_context->seq)) { -		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" +		dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"  		     " (unchanged)\n",  		     realm->ino, realm, realm->cached_context,  		     realm->cached_context->seq, -		     realm->cached_context->num_snaps); +		     (unsigned int) realm->cached_context->num_snaps);  		return 0;  	}  	/* alloc new snap context */  	err = -ENOMEM; -	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) +	if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))  		goto fail; -	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); +	snapc = ceph_create_snap_context(num, GFP_NOFS);  	if (!snapc)  		goto fail; -	atomic_set(&snapc->nref, 1);  	/* build (reverse sorted) snap vector */  	num = 0;  	snapc->seq = realm->seq;  	if (parent) { -		/* include any of parent's snaps occuring _after_ my +		u32 i; + +		/* include any of parent's snaps occurring _after_ my  		   parent became my parent */  		for (i = 0; i < parent->cached_context->num_snaps; i++)  			if (parent->cached_context->snaps[i] >= @@ -361,8 +361,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)  	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);  	snapc->num_snaps = num; -	dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", -	     realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); +	dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", +	     realm->ino, realm, snapc, snapc->seq, +	     (unsigned int) snapc->num_snaps);  	if (realm->cached_context)  		ceph_put_snap_context(realm->cached_context); @@ -402,9 +403,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm)   * helper to allocate and decode an array of snapids.  free prior   * instance, if any.   
*/ -static int dup_array(u64 **dst, __le64 *src, int num) +static int dup_array(u64 **dst, __le64 *src, u32 num)  { -	int i; +	u32 i;  	kfree(*dst);  	if (num) { @@ -446,9 +447,18 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)  		return;  	} -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	used = __ceph_caps_used(ci);  	dirty = __ceph_caps_dirty(ci); + +	/* +	 * If there is a write in progress, treat that as a dirty Fw, +	 * even though it hasn't completed yet; by the time we finish +	 * up this capsnap it will be. +	 */ +	if (used & CEPH_CAP_FILE_WR) +		dirty |= CEPH_CAP_FILE_WR; +  	if (__ceph_have_pending_cap_snap(ci)) {  		/* there is no point in queuing multiple "pending" cap_snaps,  		   as no new writes are allowed to start when pending, so any @@ -456,15 +466,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)  		   cap_snap.  lucky us. */  		dout("queue_cap_snap %p already pending\n", inode);  		kfree(capsnap); -	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || -		   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| -			     CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { +	} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| +			    CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {  		struct ceph_snap_context *snapc = ci->i_head_snapc; -		dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, -		     capsnap, snapc); -		igrab(inode); -		 +		/* +		 * if we are a sync write, we may need to go to the snaprealm +		 * to get the current snapc. 
+		 */ +		if (!snapc) +			snapc = ci->i_snap_realm->cached_context; + +		dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", +		     inode, capsnap, snapc, ceph_cap_string(dirty)); +		ihold(inode); +  		atomic_set(&capsnap->nref, 1);  		capsnap->ci = ci;  		INIT_LIST_HEAD(&capsnap->ci_item); @@ -513,7 +529,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)  		kfree(capsnap);  	} -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  }  /* @@ -522,7 +538,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)   *   * If capsnap can now be flushed, add to snap_flush list, and return 1.   * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock.   */  int __ceph_finish_cap_snap(struct ceph_inode_info *ci,  			    struct ceph_cap_snap *capsnap) @@ -584,10 +600,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)  	if (lastinode)  		iput(lastinode); -	dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); -	list_for_each_entry(child, &realm->children, child_item) -		queue_realm_cap_snaps(child); +	list_for_each_entry(child, &realm->children, child_item) { +		dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", +		     realm, realm->ino, child, child->ino); +		list_del_init(&child->dirty_item); +		list_add(&child->dirty_item, &realm->dirty_item); +	} +	list_del_init(&realm->dirty_item);  	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);  } @@ -683,7 +703,9 @@ more:  	 * queue cap snaps _after_ we've built the new snap contexts,  	 * so that i_head_snapc can be set appropriately.  	 
*/ -	list_for_each_entry(realm, &dirty_realms, dirty_item) { +	while (!list_empty(&dirty_realms)) { +		realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, +					 dirty_item);  		queue_realm_cap_snaps(realm);  	} @@ -716,11 +738,11 @@ static void flush_snaps(struct ceph_mds_client *mdsc)  		ci = list_first_entry(&mdsc->snap_flush_list,  				struct ceph_inode_info, i_snap_flush_item);  		inode = &ci->vfs_inode; -		igrab(inode); +		ihold(inode);  		spin_unlock(&mdsc->snap_flush_lock); -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		__ceph_flush_snaps(ci, &session, 0); -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		iput(inode);  		spin_lock(&mdsc->snap_flush_lock);  	} @@ -826,7 +848,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,  				continue;  			ci = ceph_inode(inode); -			spin_lock(&inode->i_lock); +			spin_lock(&ci->i_ceph_lock);  			if (!ci->i_snap_realm)  				goto skip_inode;  			/* @@ -855,7 +877,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,  			oldrealm = ci->i_snap_realm;  			ci->i_snap_realm = realm;  			spin_unlock(&realm->inodes_with_caps_lock); -			spin_unlock(&inode->i_lock); +			spin_unlock(&ci->i_ceph_lock);  			ceph_get_snap_realm(mdsc, realm);  			ceph_put_snap_realm(mdsc, oldrealm); @@ -864,7 +886,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,  			continue;  skip_inode: -			spin_unlock(&inode->i_lock); +			spin_unlock(&ci->i_ceph_lock);  			iput(inode);  		} diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index cd5097d7c80..51cc23e4811 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)  	case CEPH_MDS_STATE_BOOT:       return "up:boot";  	case CEPH_MDS_STATE_STANDBY:    return "up:standby";  	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay"; +	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";  	case CEPH_MDS_STATE_CREATING:   return "up:creating";  	case CEPH_MDS_STATE_STARTING:  
 return "up:starting";  		/* up and in */ @@ -40,6 +41,8 @@ const char *ceph_session_op_name(int op)  	case CEPH_SESSION_RENEWCAPS: return "renewcaps";  	case CEPH_SESSION_STALE: return "stale";  	case CEPH_SESSION_RECALL_STATE: return "recall_state"; +	case CEPH_SESSION_FLUSHMSG: return "flushmsg"; +	case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";  	}  	return "???";  } @@ -50,10 +53,14 @@ const char *ceph_mds_op_name(int op)  	case CEPH_MDS_OP_LOOKUP:  return "lookup";  	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";  	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent"; +	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino"; +	case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";  	case CEPH_MDS_OP_GETATTR:  return "getattr";  	case CEPH_MDS_OP_SETXATTR: return "setxattr";  	case CEPH_MDS_OP_SETATTR: return "setattr";  	case CEPH_MDS_OP_RMXATTR: return "rmxattr"; +	case CEPH_MDS_OP_SETLAYOUT: return "setlayou"; +	case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";  	case CEPH_MDS_OP_READDIR: return "readdir";  	case CEPH_MDS_OP_MKNOD: return "mknod";  	case CEPH_MDS_OP_LINK: return "link"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 08b460ae053..06150fd745a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -17,7 +17,9 @@  #include "super.h"  #include "mds_client.h" +#include "cache.h" +#include <linux/ceph/ceph_features.h>  #include <linux/ceph/decode.h>  #include <linux/ceph/mon_client.h>  #include <linux/ceph/auth.h> @@ -70,17 +72,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)  	/*  	 * express utilization in terms of large blocks to avoid  	 * overflow on 32-bit machines. +	 * +	 * NOTE: for the time being, we make bsize == frsize to humor +	 * not-yet-ancient versions of glibc that are broken. +	 * Someday, we will probably want to report a real block +	 * size...  whatever that may mean for a network file system!  	 
*/  	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; +	buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;  	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); -	buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> -		(CEPH_BLOCK_SHIFT-10); +	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);  	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);  	buf->f_files = le64_to_cpu(st.num_objects);  	buf->f_ffree = -1;  	buf->f_namelen = NAME_MAX; -	buf->f_frsize = PAGE_CACHE_SIZE;  	/* leave fsid little-endian, regardless of host endianness */  	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); @@ -115,6 +121,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)  enum {  	Opt_wsize,  	Opt_rsize, +	Opt_rasize,  	Opt_caps_wanted_delay_min,  	Opt_caps_wanted_delay_max,  	Opt_cap_release_safety, @@ -130,12 +137,24 @@ enum {  	Opt_nodirstat,  	Opt_rbytes,  	Opt_norbytes, +	Opt_asyncreaddir,  	Opt_noasyncreaddir, +	Opt_dcache, +	Opt_nodcache, +	Opt_ino32, +	Opt_noino32, +	Opt_fscache, +	Opt_nofscache, +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	Opt_acl, +#endif +	Opt_noacl  };  static match_table_t fsopt_tokens = {  	{Opt_wsize, "wsize=%d"},  	{Opt_rsize, "rsize=%d"}, +	{Opt_rasize, "rasize=%d"},  	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},  	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},  	{Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -149,7 +168,18 @@ static match_table_t fsopt_tokens = {  	{Opt_nodirstat, "nodirstat"},  	{Opt_rbytes, "rbytes"},  	{Opt_norbytes, "norbytes"}, +	{Opt_asyncreaddir, "asyncreaddir"},  	{Opt_noasyncreaddir, "noasyncreaddir"}, +	{Opt_dcache, "dcache"}, +	{Opt_nodcache, "nodcache"}, +	{Opt_ino32, "ino32"}, +	{Opt_noino32, "noino32"}, +	{Opt_fscache, "fsc"}, +	{Opt_nofscache, "nofsc"}, +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	{Opt_acl, "acl"}, +#endif +	{Opt_noacl, "noacl"},  	{-1, NULL}  }; @@ -195,6 +225,9 @@ static int parse_fsopt_token(char *c, void *private)  	case Opt_rsize:  		
fsopt->rsize = intval;  		break; +	case Opt_rasize: +		fsopt->rasize = intval; +		break;  	case Opt_caps_wanted_delay_min:  		fsopt->caps_wanted_delay_min = intval;  		break; @@ -222,9 +255,38 @@ static int parse_fsopt_token(char *c, void *private)  	case Opt_norbytes:  		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;  		break; +	case Opt_asyncreaddir: +		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; +		break;  	case Opt_noasyncreaddir:  		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;  		break; +	case Opt_dcache: +		fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; +		break; +	case Opt_nodcache: +		fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; +		break; +	case Opt_ino32: +		fsopt->flags |= CEPH_MOUNT_OPT_INO32; +		break; +	case Opt_noino32: +		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; +		break; +	case Opt_fscache: +		fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; +		break; +	case Opt_nofscache: +		fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; +		break; +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	case Opt_acl: +		fsopt->sb_flags |= MS_POSIXACL; +		break; +#endif +	case Opt_noacl: +		fsopt->sb_flags &= ~MS_POSIXACL; +		break;  	default:  		BUG_ON(token);  	} @@ -277,7 +339,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,  {  	struct ceph_mount_options *fsopt;  	const char *dev_name_end; -	int err = -ENOMEM; +	int err; + +	if (!dev_name || !*dev_name) +		return -EINVAL;  	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);  	if (!fsopt) @@ -285,37 +350,54 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,  	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); -        fsopt->sb_flags = flags; -        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - -        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; -        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); -        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; -        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; -        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; -      
  fsopt->congestion_kb = default_congestion_kb(); -	 -        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ -        err = -EINVAL; -        if (!dev_name) -                goto out; -        *path = strstr(dev_name, ":/"); -        if (*path == NULL) { -                pr_err("device name is missing path (no :/ in %s)\n", -                       dev_name); -                goto out; -        } -	dev_name_end = *path; -	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); +	fsopt->sb_flags = flags; +	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + +	fsopt->rsize = CEPH_RSIZE_DEFAULT; +	fsopt->rasize = CEPH_RASIZE_DEFAULT; +	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); +	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; +	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; +	fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; +	fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; +	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; +	fsopt->congestion_kb = default_congestion_kb(); -	/* path on server */ -	*path += 2; +	/* +	 * Distinguish the server list from the path in "dev_name". +	 * Internally we do not include the leading '/' in the path. 
+	 * +	 * "dev_name" will look like: +	 *     <server_spec>[,<server_spec>...]:[<path>] +	 * where +	 *     <server_spec> is <ip>[:<port>] +	 *     <path> is optional, but if present must begin with '/' +	 */ +	dev_name_end = strchr(dev_name, '/'); +	if (dev_name_end) { +		/* skip over leading '/' for path */ +		*path = dev_name_end + 1; +	} else { +		/* path is empty */ +		dev_name_end = dev_name + strlen(dev_name); +		*path = dev_name_end; +	} +	err = -EINVAL; +	dev_name_end--;		/* back up to ':' separator */ +	if (dev_name_end < dev_name || *dev_name_end != ':') { +		pr_err("device name is missing path (no : separator in %s)\n", +				dev_name); +		goto out; +	} +	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);  	dout("server path '%s'\n", *path); -	err = ceph_parse_options(popt, options, dev_name, dev_name_end, +	*popt = ceph_parse_options(options, dev_name, dev_name_end,  				 parse_fsopt_token, (void *)fsopt); -	if (err) +	if (IS_ERR(*popt)) { +		err = PTR_ERR(*popt);  		goto out; +	}  	/* success */  	*pfsopt = fsopt; @@ -329,11 +411,11 @@ out:  /**   * ceph_show_options - Show mount options in /proc/mounts   * @m: seq_file to write to - * @mnt: mount descriptor + * @root: root of that (sub)tree   */ -static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) +static int ceph_show_options(struct seq_file *m, struct dentry *root)  { -	struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); +	struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);  	struct ceph_mount_options *fsopt = fsc->mount_options;  	struct ceph_options *opt = fsc->client->options; @@ -346,15 +428,13 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)  	if (opt->name)  		seq_printf(m, ",name=%s", opt->name); -	if (opt->secret) +	if (opt->key)  		seq_puts(m, ",secret=<hidden>");  	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)  		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);  	if (opt->osd_idle_ttl != 
CEPH_OSD_IDLE_TTL_DEFAULT)  		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); -	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) -		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);  	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)  		seq_printf(m, ",osdkeepalivetimeout=%d",  			   opt->osd_keepalive_timeout); @@ -365,11 +445,28 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)  		seq_puts(m, ",norbytes");  	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)  		seq_puts(m, ",noasyncreaddir"); +	if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) +		seq_puts(m, ",dcache"); +	else +		seq_puts(m, ",nodcache"); +	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) +		seq_puts(m, ",fsc"); +	else +		seq_puts(m, ",nofsc"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	if (fsopt->sb_flags & MS_POSIXACL) +		seq_puts(m, ",acl"); +	else +		seq_puts(m, ",noacl"); +#endif  	if (fsopt->wsize)  		seq_printf(m, ",wsize=%d", fsopt->wsize); -	if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) +	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)  		seq_printf(m, ",rsize=%d", fsopt->rsize); +	if (fsopt->rasize != CEPH_RASIZE_DEFAULT) +		seq_printf(m, ",rasize=%d", fsopt->rasize);  	if (fsopt->congestion_kb != default_congestion_kb())  		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);  	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) @@ -412,23 +509,29 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)  /*   * create a new fs client   */ -struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,  					struct ceph_options *opt)  {  	struct ceph_fs_client *fsc; +	const u64 supported_features = +		CEPH_FEATURE_FLOCK | +		CEPH_FEATURE_DIRLAYOUTHASH; +	const u64 required_features = 0; +	int page_count; +	size_t size;  	int err = -ENOMEM;  	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);  	if (!fsc)  		return ERR_PTR(-ENOMEM); -	
fsc->client = ceph_create_client(opt, fsc); +	fsc->client = ceph_create_client(opt, fsc, supported_features, +					 required_features);  	if (IS_ERR(fsc->client)) {  		err = PTR_ERR(fsc->client);  		goto fail;  	}  	fsc->client->extra_mon_dispatch = extra_mon_dispatch; -	fsc->client->supported_features |= CEPH_FEATURE_FLOCK;  	fsc->client->monc.want_mdsmap = 1;  	fsc->mount_options = fsopt; @@ -443,28 +546,40 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,  		goto fail_client;  	err = -ENOMEM; -	fsc->wb_wq = create_workqueue("ceph-writeback"); +	/* +	 * The number of concurrent works can be high but they don't need +	 * to be processed in parallel, limit concurrency. +	 */ +	fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);  	if (fsc->wb_wq == NULL)  		goto fail_bdi; -	fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); +	fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);  	if (fsc->pg_inv_wq == NULL)  		goto fail_wb_wq; -	fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); +	fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);  	if (fsc->trunc_wq == NULL)  		goto fail_pg_inv_wq;  	/* set up mempools */  	err = -ENOMEM; -	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, -			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); +	page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; +	size = sizeof (struct page *) * (page_count ? 
page_count : 1); +	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);  	if (!fsc->wb_pagevec_pool)  		goto fail_trunc_wq; +	/* setup fscache */ +	if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) && +	    (ceph_fscache_register_fs(fsc) != 0)) +		goto fail_fscache; +  	/* caps */  	fsc->min_caps = fsopt->max_readdir;  	return fsc; +fail_fscache: +	ceph_fscache_unregister_fs(fsc);  fail_trunc_wq:  	destroy_workqueue(fsc->trunc_wq);  fail_pg_inv_wq: @@ -480,10 +595,12 @@ fail:  	return ERR_PTR(err);  } -void destroy_fs_client(struct ceph_fs_client *fsc) +static void destroy_fs_client(struct ceph_fs_client *fsc)  {  	dout("destroy_fs_client %p\n", fsc); +	ceph_fscache_unregister_fs(fsc); +  	destroy_workqueue(fsc->wb_wq);  	destroy_workqueue(fsc->pg_inv_wq);  	destroy_workqueue(fsc->trunc_wq); @@ -518,6 +635,8 @@ static void ceph_inode_init_once(void *foo)  static int __init init_caches(void)  { +	int error = -ENOMEM; +  	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",  				      sizeof(struct ceph_inode_info),  				      __alignof__(struct ceph_inode_info), @@ -541,23 +660,33 @@ static int __init init_caches(void)  	if (ceph_file_cachep == NULL)  		goto bad_file; -	return 0; +	if ((error = ceph_fscache_register())) +		goto bad_file; +	return 0;  bad_file:  	kmem_cache_destroy(ceph_dentry_cachep);  bad_dentry:  	kmem_cache_destroy(ceph_cap_cachep);  bad_cap:  	kmem_cache_destroy(ceph_inode_cachep); -	return -ENOMEM; +	return error;  }  static void destroy_caches(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier(); +  	kmem_cache_destroy(ceph_inode_cachep);  	kmem_cache_destroy(ceph_cap_cachep);  	kmem_cache_destroy(ceph_dentry_cachep);  	kmem_cache_destroy(ceph_file_cachep); + +	ceph_fscache_unregister();  } @@ -580,6 +709,7 @@ static const struct super_operations ceph_super_ops = {  	.alloc_inode	= ceph_alloc_inode,  	.destroy_inode	= ceph_destroy_inode,  	.write_inode    = ceph_write_inode, +	.drop_inode	= ceph_drop_inode,  	.sync_fs        = ceph_sync_fs,  	.put_super	= ceph_put_super,  	.show_options   = ceph_show_options, @@ -614,17 +744,25 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,  	req->r_num_caps = 2;  	err = ceph_mdsc_do_request(mdsc, NULL, req);  	if (err == 0) { -		dout("open_root_inode success\n"); -		if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && -		    fsc->sb->s_root == NULL) -			root = d_alloc_root(req->r_target_inode); -		else -			root = d_obtain_alias(req->r_target_inode); +		struct inode *inode = req->r_target_inode;  		req->r_target_inode = NULL; +		dout("open_root_inode success\n"); +		if (ceph_ino(inode) == CEPH_INO_ROOT && +		    fsc->sb->s_root == NULL) { +			root = d_make_root(inode); +			if (!root) { +				root = ERR_PTR(-ENOMEM); +				goto out; +			} +		} else { +			root = d_obtain_alias(inode); +		} +		ceph_init_dentry(root);  		dout("open_root_inode success, root dentry is %p\n", root);  	} else {  		root = ERR_PTR(err);  	} +out:  	ceph_mdsc_put_request(req);  	return root;  } @@ -705,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data)  	s->s_flags = fsc->mount_options->sb_flags;  	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */ +	s->s_xattr = ceph_xattr_handlers;  	s->s_fs_info = fsc;  	fsc->sb = s; @@ -763,12 +902,16 @@ static int ceph_register_bdi(struct super_block *sb,  {  	int err; -	/* set ra_pages based on rsize mount option? */ -	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) +	/* set ra_pages based on rasize mount option? 
*/ +	if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)  		fsc->backing_dev_info.ra_pages = -			(fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) +			(fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)  			>> PAGE_SHIFT; -	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", +	else +		fsc->backing_dev_info.ra_pages = +			default_backing_dev_info.ra_pages; + +	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",  			   atomic_long_inc_return(&bdi_seq));  	if (!err)  		sb->s_bdi = &fsc->backing_dev_info; @@ -788,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,  	struct ceph_options *opt = NULL;  	dout("ceph_mount\n"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	flags |= MS_POSIXACL; +#endif  	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);  	if (err < 0) {  		res = ERR_PTR(err); @@ -798,8 +945,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,  	fsc = create_fs_client(fsopt, opt);  	if (IS_ERR(fsc)) {  		res = ERR_CAST(fsc); -		kfree(fsopt); -		kfree(opt); +		destroy_mount_options(fsopt); +		ceph_destroy_options(opt);  		goto out_final;  	} @@ -811,7 +958,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,  	if (ceph_test_opt(fsc->client, NOSHARE))  		compare_super = NULL; -	sb = sget(fs_type, compare_super, ceph_set_super, fsc); +	sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);  	if (IS_ERR(sb)) {  		res = ERR_CAST(sb);  		goto out; @@ -868,6 +1015,7 @@ static struct file_system_type ceph_fs_type = {  	.kill_sb	= ceph_kill_sb,  	.fs_flags	= FS_RENAME_DOES_D_MOVE,  }; +MODULE_ALIAS_FS("ceph");  #define _STRINGIFY(x) #x  #define STRINGIFY(x) _STRINGIFY(x) @@ -878,6 +1026,8 @@ static int __init init_ceph(void)  	if (ret)  		goto out; +	ceph_flock_init(); +	ceph_xattr_init();  	ret = register_filesystem(&ceph_fs_type);  	if (ret)  		goto out_icache; @@ -887,6 +1037,7 @@ static int __init init_ceph(void)  	return 0;  out_icache: +	ceph_xattr_exit();  	
destroy_caches();  out:  	return ret; @@ -896,6 +1047,7 @@ static void __exit exit_ceph(void)  {  	dout("exit_ceph\n");  	unregister_filesystem(&ceph_fs_type); +	ceph_xattr_exit();  	destroy_caches();  } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1886294e12f..12b20744e38 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -13,20 +13,28 @@  #include <linux/wait.h>  #include <linux/writeback.h>  #include <linux/slab.h> +#include <linux/posix_acl.h>  #include <linux/ceph/libceph.h> +#ifdef CONFIG_CEPH_FSCACHE +#include <linux/fscache.h> +#endif +  /* f_type in struct statfs */  #define CEPH_SUPER_MAGIC 0x00c36400  /* large granularity for statfs utilization stats to facilitate   * large volume sizes on 32-bit machines. */ -#define CEPH_BLOCK_SHIFT   20  /* 1 MB */ +#define CEPH_BLOCK_SHIFT   22  /* 4 MB */  #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)  #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */  #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */  #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_INO32           (1<<8) /* 32 bit inos */ +#define CEPH_MOUNT_OPT_DCACHE          (1<<9) /* use dcache for readdir etc */ +#define CEPH_MOUNT_OPT_FSCACHE         (1<<10) /* use fscache */  #define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES) @@ -35,6 +43,8 @@  #define ceph_test_mount_opt(fsc, opt) \  	(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) +#define CEPH_RSIZE_DEFAULT             0           /* max read size */ +#define CEPH_RASIZE_DEFAULT            (8192*1024) /* readahead */  #define CEPH_MAX_READDIR_DEFAULT        1024  #define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)  #define CEPH_SNAPDIRNAME_DEFAULT        ".snap" @@ -43,8 +53,9 @@ struct ceph_mount_options {  	int flags;  	int sb_flags; -	int wsize; -	int rsize;            /* max readahead */ +	int wsize;            /* max write size */ +	int rsize;            /* max read size */ +	
int rasize;           /* max readahead */  	int congestion_kb;    /* max writeback in flight */  	int caps_wanted_delay_min, caps_wanted_delay_max;  	int cap_release_safety; @@ -85,6 +96,11 @@ struct ceph_fs_client {  	struct dentry *debugfs_bdi;  	struct dentry *debugfs_mdsc, *debugfs_mdsmap;  #endif + +#ifdef CONFIG_CEPH_FSCACHE +	struct fscache_cookie *fscache; +	struct workqueue_struct *revalidate_wq; +#endif  }; @@ -132,9 +148,9 @@ struct ceph_cap_snap {  	int issued, dirty;  	struct ceph_snap_context *context; -	mode_t mode; -	uid_t uid; -	gid_t gid; +	umode_t mode; +	kuid_t uid; +	kgid_t gid;  	struct ceph_buffer *xattr_blob;  	u64 xattr_version; @@ -233,12 +249,16 @@ struct ceph_inode_xattrs_info {  struct ceph_inode_info {  	struct ceph_vino i_vino;   /* ceph ino + snap */ +	spinlock_t i_ceph_lock; +  	u64 i_version;  	u32 i_time_warp_seq;  	unsigned i_ceph_flags; -	unsigned long i_release_count; +	atomic_t i_release_count; +	atomic_t i_complete_count; +	struct ceph_dir_layout i_dir_layout;  	struct ceph_file_layout i_layout;  	char *i_symlink; @@ -246,14 +266,13 @@ struct ceph_inode_info {  	struct timespec i_rctime;  	u64 i_rbytes, i_rfiles, i_rsubdirs;  	u64 i_files, i_subdirs; -	u64 i_max_offset;  /* largest readdir offset, set with I_COMPLETE */  	struct rb_root i_fragtree;  	struct mutex i_fragtree_mutex;  	struct ceph_inode_xattrs_info i_xattrs; -	/* capabilities.  protected _both_ by i_lock and cap->session's +	/* capabilities.  protected _both_ by i_ceph_lock and cap->session's  	 * s_mutex. */  	struct rb_root i_caps;           /* cap list */  	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */ @@ -268,9 +287,6 @@ struct ceph_inode_info {  	unsigned long i_hold_caps_min; /* jiffies */  	unsigned long i_hold_caps_max; /* jiffies */  	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */ -	int i_cap_exporting_mds;         /* to handle cap migration between */ -	unsigned i_cap_exporting_mseq;   /*  mds's. 
*/ -	unsigned i_cap_exporting_issued;  	struct ceph_cap_reservation i_cap_migration_resv;  	struct list_head i_cap_snaps;   /* snapped state pending flush to mds */  	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or @@ -279,6 +295,7 @@ struct ceph_inode_info {  	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */ +	struct mutex i_truncate_mutex;  	u32 i_truncate_seq;        /* last truncate to smaller size */  	u64 i_truncate_size;       /*  and the size we last truncated down to */  	int i_truncate_pending;    /*  still need to call vmtruncate */ @@ -290,12 +307,10 @@ struct ceph_inode_info {  	/* held references to caps */  	int i_pin_ref; -	int i_rd_ref, i_rdcache_ref, i_wr_ref; +	int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;  	int i_wrbuffer_ref, i_wrbuffer_ref_head;  	u32 i_shared_gen;       /* increment each time we get FILE_SHARED */ -	u32 i_rdcache_gen;      /* we increment this each time we get -				   FILE_CACHE.  If it's non-zero, we -				   _may_ have cached pages. */ +	u32 i_rdcache_gen;      /* incremented each time we get FILE_CACHE. 
*/  	u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */  	struct list_head i_unsafe_writes; /* uncommitted sync writes */ @@ -312,6 +327,11 @@ struct ceph_inode_info {  	struct work_struct i_vmtruncate_work; +#ifdef CONFIG_CEPH_FSCACHE +	struct fscache_cookie *fscache; +	u32 i_fscache_gen; /* sequence, for delayed fscache validate */ +	struct work_struct i_revalidate_work; +#endif  	struct inode vfs_inode; /* at end */  }; @@ -320,6 +340,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)  	return container_of(inode, struct ceph_inode_info, vfs_inode);  } +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +{ +	return (struct ceph_fs_client *)inode->i_sb->s_fs_info; +} + +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +{ +	return (struct ceph_fs_client *)sb->s_fs_info; +} +  static inline struct ceph_vino ceph_vino(struct inode *inode)  {  	return ceph_inode(inode)->i_vino; @@ -328,18 +358,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)  /*   * ino_t is <64 bits on many architectures, blech.   * - * don't include snap in ino hash, at least for now. 
+ *               i_ino (kernel inode)   st_ino (userspace) + * i386          32                     32 + * x86_64+ino32  64                     32 + * x86_64        64                     64 + */ +static inline u32 ceph_ino_to_ino32(__u64 vino) +{ +	u32 ino = vino & 0xffffffff; +	ino ^= vino >> 32; +	if (!ino) +		ino = 2; +	return ino; +} + +/* + * kernel i_ino value   */  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)  { -	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */  #if BITS_PER_LONG == 32 -	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; -	if (!ino) -		ino = 1; +	return ceph_ino_to_ino32(vino.ino); +#else +	return (ino_t)vino.ino;  #endif +} + +/* + * user-visible ino (stat, filldir) + */ +#if BITS_PER_LONG == 32 +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ +	return ino; +} +#else +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ +	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) +		ino = ceph_ino_to_ino32(ino);  	return ino;  } +#endif +  /* for printf-style formatting */  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap @@ -372,38 +433,35 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,  /*   * Ceph inode.   
*/ -#define CEPH_I_COMPLETE  1  /* we have complete directory cached */  #define CEPH_I_NODELAY   4  /* do not delay cap release */  #define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */  #define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */ -static inline void ceph_i_clear(struct inode *inode, unsigned mask) +static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, +					   int release_count)  { -	struct ceph_inode_info *ci = ceph_inode(inode); - -	spin_lock(&inode->i_lock); -	ci->i_ceph_flags &= ~mask; -	spin_unlock(&inode->i_lock); +	atomic_set(&ci->i_complete_count, release_count);  } -static inline void ceph_i_set(struct inode *inode, unsigned mask) +static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)  { -	struct ceph_inode_info *ci = ceph_inode(inode); +	atomic_inc(&ci->i_release_count); +} -	spin_lock(&inode->i_lock); -	ci->i_ceph_flags |= mask; -	spin_unlock(&inode->i_lock); +static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) +{ +	return atomic_read(&ci->i_complete_count) == +		atomic_read(&ci->i_release_count);  } -static inline bool ceph_i_test(struct inode *inode, unsigned mask) +static inline void ceph_dir_clear_complete(struct inode *inode)  { -	struct ceph_inode_info *ci = ceph_inode(inode); -	bool r; +	__ceph_dir_clear_complete(ceph_inode(inode)); +} -	spin_lock(&inode->i_lock); -	r = (ci->i_ceph_flags & mask) == mask; -	spin_unlock(&inode->i_lock); -	return r; +static inline bool ceph_dir_is_complete(struct inode *inode) +{ +	return __ceph_dir_is_complete(ceph_inode(inode));  } @@ -429,13 +487,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)  	return ((loff_t)frag << 32) | (loff_t)off;  } -static inline int ceph_set_ino_cb(struct inode *inode, void *data) -{ -	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; -	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); -	return 0; -} -  /*   * caps helpers   */ @@ -452,9 +503,9 @@ extern int 
__ceph_caps_issued_other(struct ceph_inode_info *ci,  static inline int ceph_caps_issued(struct ceph_inode_info *ci)  {  	int issued; -	spin_lock(&ci->vfs_inode.i_lock); +	spin_lock(&ci->i_ceph_lock);  	issued = __ceph_caps_issued(ci, NULL); -	spin_unlock(&ci->vfs_inode.i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return issued;  } @@ -462,9 +513,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,  					int touch)  {  	int r; -	spin_lock(&ci->vfs_inode.i_lock); +	spin_lock(&ci->i_ceph_lock);  	r = __ceph_caps_issued_mask(ci, mask, touch); -	spin_unlock(&ci->vfs_inode.i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return r;  } @@ -472,8 +523,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)  {  	return ci->i_dirty_caps | ci->i_flushing_caps;  } -extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, +				      struct ceph_cap *ocap, int mask);  extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);  extern int __ceph_caps_used(struct ceph_inode_info *ci); @@ -496,7 +549,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);  extern void ceph_caps_init(struct ceph_mds_client *mdsc);  extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);  extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); -extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,  			     struct ceph_cap_reservation *ctx, int need);  extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,  			       struct ceph_cap_reservation *ctx); @@ -504,34 +557,28 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,  				    int *total, int *avail, int *used,  				    int *reserved, int *min); -static inline struct ceph_fs_client *ceph_inode_to_client(struct inode 
*inode) -{ -	return (struct ceph_fs_client *)inode->i_sb->s_fs_info; -} - -static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) -{ -	return (struct ceph_fs_client *)sb->s_fs_info; -}  /*   * we keep buffered readdir results attached to file->private_data   */ +#define CEPH_F_SYNC     1 +#define CEPH_F_ATEND    2 +  struct ceph_file_info { -	int fmode;     /* initialized on open */ +	short fmode;     /* initialized on open */ +	short flags;     /* CEPH_F_* */  	/* readdir: position within the dir */  	u32 frag;  	struct ceph_mds_request *last_readdir; -	int at_end;  	/* readdir: position within a frag */  	unsigned offset;       /* offset of last chunk, adjusted for . and .. */ -	u64 next_offset;       /* offset of next chunk (last_name's + 1) */ +	unsigned next_offset;  /* offset of next chunk (last_name's + 1) */  	char *last_name;       /* last entry in previous chunk */  	struct dentry *dentry; /* next dentry (for dcache readdir) */ -	unsigned long dir_release_count; +	int dir_release_count;  	/* used for -o dirstat read() on directory thing */  	char *dir_info; @@ -559,9 +606,9 @@ struct ceph_snap_realm {  	u64 parent_since;   /* snapid when our current parent became so */  	u64 *prior_parent_snaps;      /* snaps inherited from any parents we */ -	int num_prior_parent_snaps;   /*  had prior to parent_since */ +	u32 num_prior_parent_snaps;   /*  had prior to parent_since */  	u64 *snaps;                   /* snaps specific to this realm */ -	int num_snaps; +	u32 num_snaps;  	struct ceph_snap_realm *parent;  	struct list_head children;       /* list of child realms */ @@ -642,6 +689,7 @@ extern const struct inode_operations ceph_file_iops;  extern struct inode *ceph_alloc_inode(struct super_block *sb);  extern void ceph_destroy_inode(struct inode *inode); +extern int ceph_drop_inode(struct inode *inode);  extern struct inode *ceph_get_inode(struct super_block *sb,  				    struct ceph_vino vino); @@ -675,35 +723,75 @@ extern int 
ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,  /* xattr.c */  extern int ceph_setxattr(struct dentry *, const char *, const void *,  			 size_t, int); +int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +int __ceph_removexattr(struct dentry *, const char *);  extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);  extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);  extern int ceph_removexattr(struct dentry *, const char *);  extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);  extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern void __init ceph_xattr_init(void); +extern void ceph_xattr_exit(void); + +/* acl.c */ +extern const struct xattr_handler *ceph_xattr_handlers[]; + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_init_acl(struct dentry *, struct inode *, struct inode *); + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +       forget_all_cached_acls(inode); +} + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, +				struct inode *dir) +{ +	return 0; +} + +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ +	return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif  /* caps.c */  extern const char *ceph_cap_string(int c);  extern void ceph_handle_caps(struct ceph_mds_session *session,  			     struct ceph_msg *msg); -extern int ceph_add_cap(struct inode *inode, -			struct ceph_mds_session *session, u64 cap_id, -			int fmode, unsigned issued, unsigned wanted, -			unsigned cap, unsigned seq, u64 realmino, int flags, -			struct ceph_cap_reservation *caps_reservation); -extern void 
__ceph_remove_cap(struct ceph_cap *cap); -static inline void ceph_remove_cap(struct ceph_cap *cap) -{ -	struct inode *inode = &cap->ci->vfs_inode; -	spin_lock(&inode->i_lock); -	__ceph_remove_cap(cap); -	spin_unlock(&inode->i_lock); -} +extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, +				     struct ceph_cap_reservation *ctx); +extern void ceph_add_cap(struct inode *inode, +			 struct ceph_mds_session *session, u64 cap_id, +			 int fmode, unsigned issued, unsigned wanted, +			 unsigned cap, unsigned seq, u64 realmino, int flags, +			 struct ceph_cap **new_cap); +extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);  extern void ceph_put_cap(struct ceph_mds_client *mdsc,  			 struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); +extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, +				u64 cap_id, u32 migrate_seq, u32 issue_seq);  extern void ceph_queue_caps_release(struct inode *inode);  extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); -extern int ceph_fsync(struct file *file, int datasync); +extern int ceph_fsync(struct file *file, loff_t start, loff_t end, +		      int datasync);  extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,  				    struct ceph_mds_session *session);  extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, @@ -743,17 +831,11 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);  /* file.c */  extern const struct file_operations ceph_file_fops;  extern const struct address_space_operations ceph_aops; -extern int ceph_copy_to_page_vector(struct page **pages, -				    const char *data, -				    loff_t off, size_t len); -extern int ceph_copy_from_page_vector(struct page **pages, -				    char *data, -				    loff_t off, size_t len); -extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); +  extern int ceph_open(struct inode *inode, struct file *file); -extern 
struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, -				       struct nameidata *nd, int mode, -				       int locked_dir); +extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, +			    struct file *file, unsigned flags, umode_t mode, +			    int *opened);  extern int ceph_release(struct inode *inode, struct file *filp);  /* dir.c */ @@ -763,6 +845,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,  	ceph_snapdir_dentry_ops;  extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, +			       struct dentry *dentry, int err);  extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,  					 struct dentry *dentry, int err); @@ -770,6 +854,8 @@ extern void ceph_dentry_lru_add(struct dentry *dn);  extern void ceph_dentry_lru_touch(struct dentry *dn);  extern void ceph_dentry_lru_del(struct dentry *dn);  extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); +extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);  /*   * our d_ops vary depending on whether the inode is live, @@ -785,21 +871,19 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);  extern const struct export_operations ceph_export_ops;  /* locks.c */ +extern __init void ceph_flock_init(void);  extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);  extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);  extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); -extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, -			     int p_locks, int f_locks); +extern int ceph_encode_locks_to_buffer(struct inode *inode, +				       struct ceph_filelock *flocks, +				       int num_fcntl_locks, +				       int num_flock_locks); +extern int 
ceph_locks_to_pagelist(struct ceph_filelock *flocks, +				  struct ceph_pagelist *pagelist, +				  int num_fcntl_locks, int num_flock_locks);  extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); -static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) -{ -	if (dentry && dentry->d_parent) -		return dentry->d_parent->d_inode; - -	return NULL; -} -  /* debugfs.c */  extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);  extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 6e12a6ba5f7..c9c2b887381 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -6,13 +6,33 @@  #include <linux/ceph/decode.h>  #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h>  #include <linux/slab.h> +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) + +static int __remove_xattr(struct ceph_inode_info *ci, +			  struct ceph_inode_xattr *xattr); + +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	&posix_acl_access_xattr_handler, +	&posix_acl_default_xattr_handler, +#endif +	NULL, +}; +  static bool ceph_is_valid_xattr(const char *name)  { -	return !strncmp(name, "ceph.", 5) || +	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||  	       !strncmp(name, XATTR_SECURITY_PREFIX,  			XATTR_SECURITY_PREFIX_LEN) || +	       !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||  	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||  	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);  } @@ -21,101 +41,235 @@ static bool ceph_is_valid_xattr(const char *name)   * These define virtual xattrs exposing the recursive directory   * statistics and layout metadata.   
*/ -struct ceph_vxattr_cb { -	bool readonly; +struct ceph_vxattr {  	char *name; +	size_t name_size;	/* strlen(name) + 1 (for '\0') */  	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,  			      size_t size); +	bool readonly, hidden; +	bool (*exists_cb)(struct ceph_inode_info *ci);  }; +/* layouts */ + +static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) +{ +	size_t s; +	char *p = (char *)&ci->i_layout; + +	for (s = 0; s < sizeof(ci->i_layout); s++, p++) +		if (*p) +			return true; +	return false; +} + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, +				   size_t size) +{ +	int ret; +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_osd_client *osdc = &fsc->client->osdc; +	s64 pool = ceph_file_layout_pg_pool(ci->i_layout); +	const char *pool_name; +	char buf[128]; + +	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); +	down_read(&osdc->map_sem); +	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); +	if (pool_name) { +		size_t len = strlen(pool_name); +		ret = snprintf(buf, sizeof(buf), +		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", +		(unsigned long long)ceph_file_layout_su(ci->i_layout), +		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), +	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); +		if (!size) { +			ret += len; +		} else if (ret + len > size) { +			ret = -ERANGE; +		} else { +			memcpy(val, buf, ret); +			memcpy(val + ret, pool_name, len); +			ret += len; +		} +	} else { +		ret = snprintf(buf, sizeof(buf), +		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", +		(unsigned long long)ceph_file_layout_su(ci->i_layout), +		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), +	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout), +		(unsigned long long)pool); +		if (size) { +			if (ret <= size) +				memcpy(val, buf, ret); +			else +				ret = -ERANGE; +		} +	} +	
up_read(&osdc->map_sem); +	return ret; +} + +static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, +					       char *val, size_t size) +{ +	return snprintf(val, size, "%lld", +			(unsigned long long)ceph_file_layout_su(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, +						char *val, size_t size) +{ +	return snprintf(val, size, "%lld", +	       (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, +					       char *val, size_t size) +{ +	return snprintf(val, size, "%lld", +	       (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, +					char *val, size_t size) +{ +	int ret; +	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); +	struct ceph_osd_client *osdc = &fsc->client->osdc; +	s64 pool = ceph_file_layout_pg_pool(ci->i_layout); +	const char *pool_name; + +	down_read(&osdc->map_sem); +	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); +	if (pool_name) +		ret = snprintf(val, size, "%s", pool_name); +	else +		ret = snprintf(val, size, "%lld", (unsigned long long)pool); +	up_read(&osdc->map_sem); +	return ret; +} +  /* directories */ -static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,  					size_t size)  {  	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);  } -static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,  				      size_t size)  {  	return snprintf(val, size, "%lld", ci->i_files);  } -static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,  					size_t size)  {  	return 
snprintf(val, size, "%lld", ci->i_subdirs);  } -static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,  					 size_t size)  {  	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);  } -static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,  				       size_t size)  {  	return snprintf(val, size, "%lld", ci->i_rfiles);  } -static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,  					 size_t size)  {  	return snprintf(val, size, "%lld", ci->i_rsubdirs);  } -static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,  				       size_t size)  {  	return snprintf(val, size, "%lld", ci->i_rbytes);  } -static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,  				       size_t size)  { -	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, +	return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,  			(long)ci->i_rctime.tv_nsec);  } -static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { -	{ true, "ceph.dir.entries", ceph_vxattrcb_entries}, -	{ true, "ceph.dir.files", ceph_vxattrcb_files}, -	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, -	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, -	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, -	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, -	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, -	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, -	{ true, NULL, NULL } -}; -/* files */ +#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." 
#_name +#define CEPH_XATTR_NAME2(_type, _name, _name2)	\ +	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, -				   size_t size) -{ -	int ret; +#define XATTR_NAME_CEPH(_type, _name)					\ +	{								\ +		.name = CEPH_XATTR_NAME(_type, _name),			\ +		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ +		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ +		.readonly = true,				\ +		.hidden = false,				\ +		.exists_cb = NULL,			\ +	} +#define XATTR_LAYOUT_FIELD(_type, _name, _field)			\ +	{								\ +		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\ +		.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ +		.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ +		.readonly = false,				\ +		.hidden = true,			\ +		.exists_cb = ceph_vxattrcb_layout_exists,	\ +	} -	ret = snprintf(val, size, -		"chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", -		(unsigned long long)ceph_file_layout_su(ci->i_layout), -		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), -		(unsigned long long)ceph_file_layout_object_size(ci->i_layout)); -	if (ceph_file_layout_pg_preferred(ci->i_layout)) -		ret += snprintf(val + ret, size, "preferred_osd=%lld\n", -			    (unsigned long long)ceph_file_layout_pg_preferred( -				    ci->i_layout)); -	return ret; -} +static struct ceph_vxattr ceph_dir_vxattrs[] = { +	{ +		.name = "ceph.dir.layout", +		.name_size = sizeof("ceph.dir.layout"), +		.getxattr_cb = ceph_vxattrcb_layout, +		.readonly = false, +		.hidden = true, +		.exists_cb = ceph_vxattrcb_layout_exists, +	}, +	XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), +	XATTR_LAYOUT_FIELD(dir, layout, stripe_count), +	XATTR_LAYOUT_FIELD(dir, layout, object_size), +	XATTR_LAYOUT_FIELD(dir, layout, pool), +	XATTR_NAME_CEPH(dir, entries), +	XATTR_NAME_CEPH(dir, files), +	XATTR_NAME_CEPH(dir, subdirs), +	XATTR_NAME_CEPH(dir, rentries), +	XATTR_NAME_CEPH(dir, rfiles), +	XATTR_NAME_CEPH(dir, 
rsubdirs), +	XATTR_NAME_CEPH(dir, rbytes), +	XATTR_NAME_CEPH(dir, rctime), +	{ .name = NULL, 0 }	/* Required table terminator */ +}; +static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */ + +/* files */ -static struct ceph_vxattr_cb ceph_file_vxattrs[] = { -	{ true, "ceph.layout", ceph_vxattrcb_layout}, -	{ NULL, NULL } +static struct ceph_vxattr ceph_file_vxattrs[] = { +	{ +		.name = "ceph.file.layout", +		.name_size = sizeof("ceph.file.layout"), +		.getxattr_cb = ceph_vxattrcb_layout, +		.readonly = false, +		.hidden = true, +		.exists_cb = ceph_vxattrcb_layout_exists, +	}, +	XATTR_LAYOUT_FIELD(file, layout, stripe_unit), +	XATTR_LAYOUT_FIELD(file, layout, stripe_count), +	XATTR_LAYOUT_FIELD(file, layout, object_size), +	XATTR_LAYOUT_FIELD(file, layout, pool), +	{ .name = NULL, 0 }	/* Required table terminator */  }; +static size_t ceph_file_vxattrs_name_size;	/* total size of all names */ -static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)  {  	if (S_ISDIR(inode->i_mode))  		return ceph_dir_vxattrs; @@ -124,22 +278,67 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)  	return NULL;  } -static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, +static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ +	if (vxattrs == ceph_dir_vxattrs) +		return ceph_dir_vxattrs_name_size; +	if (vxattrs == ceph_file_vxattrs) +		return ceph_file_vxattrs_name_size; +	BUG(); + +	return 0; +} + +/* + * Compute the aggregate size (including terminating '\0') of all + * virtual extended attribute names in the given vxattr table. 
+ */ +static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ +	struct ceph_vxattr *vxattr; +	size_t size = 0; + +	for (vxattr = vxattrs; vxattr->name; vxattr++) +		if (!vxattr->hidden) +			size += vxattr->name_size; + +	return size; +} + +/* Routines called at initialization and exit time */ + +void __init ceph_xattr_init(void) +{ +	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); +	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); +} + +void ceph_xattr_exit(void) +{ +	ceph_dir_vxattrs_name_size = 0; +	ceph_file_vxattrs_name_size = 0; +} + +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,  						const char *name)  { -	do { -		if (strcmp(vxattr->name, name) == 0) -			return vxattr; -		vxattr++; -	} while (vxattr->name); +	struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); + +	if (vxattr) { +		while (vxattr->name) { +			if (!strcmp(vxattr->name, name)) +				return vxattr; +			vxattr++; +		} +	} +  	return NULL;  }  static int __set_xattr(struct ceph_inode_info *ci,  			   const char *name, int name_len,  			   const char *val, int val_len, -			   int dirty, -			   int should_free_name, int should_free_val, +			   int flags, int update_xattr,  			   struct ceph_inode_xattr **newxattr)  {  	struct rb_node **p; @@ -168,12 +367,31 @@ static int __set_xattr(struct ceph_inode_info *ci,  		xattr = NULL;  	} +	if (update_xattr) { +		int err = 0; +		if (xattr && (flags & XATTR_CREATE)) +			err = -EEXIST; +		else if (!xattr && (flags & XATTR_REPLACE)) +			err = -ENODATA; +		if (err) { +			kfree(name); +			kfree(val); +			return err; +		} +		if (update_xattr < 0) { +			if (xattr) +				__remove_xattr(ci, xattr); +			kfree(name); +			return 0; +		} +	} +  	if (!xattr) {  		new = 1;  		xattr = *newxattr;  		xattr->name = name;  		xattr->name_len = name_len; -		xattr->should_free_name = should_free_name; +		xattr->should_free_name = update_xattr;  		ci->i_xattrs.count++;  		dout("__set_xattr count=%d\n", 
ci->i_xattrs.count); @@ -183,7 +401,7 @@ static int __set_xattr(struct ceph_inode_info *ci,  		if (xattr->should_free_val)  			kfree((void *)xattr->val); -		if (should_free_name) { +		if (update_xattr) {  			kfree((void *)name);  			name = xattr->name;  		} @@ -198,8 +416,8 @@ static int __set_xattr(struct ceph_inode_info *ci,  		xattr->val = "";  	xattr->val_len = val_len; -	xattr->dirty = dirty; -	xattr->should_free_val = (val && should_free_val); +	xattr->dirty = update_xattr; +	xattr->should_free_val = (val && update_xattr);  	if (new) {  		rb_link_node(&xattr->node, parent, p); @@ -219,6 +437,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,  	struct rb_node **p;  	struct rb_node *parent = NULL;  	struct ceph_inode_xattr *xattr = NULL; +	int name_len = strlen(name);  	int c;  	p = &ci->i_xattrs.index.rb_node; @@ -226,6 +445,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,  		parent = *p;  		xattr = rb_entry(parent, struct ceph_inode_xattr, node);  		c = strncmp(name, xattr->name, xattr->name_len); +		if (c == 0 && name_len > xattr->name_len) +			c = 1;  		if (c < 0)  			p = &(*p)->rb_left;  		else if (c > 0) @@ -258,7 +479,7 @@ static int __remove_xattr(struct ceph_inode_info *ci,  			  struct ceph_inode_xattr *xattr)  {  	if (!xattr) -		return -EOPNOTSUPP; +		return -ENODATA;  	rb_erase(&xattr->node, &ci->i_xattrs.index); @@ -340,8 +561,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)  }  static int __build_xattrs(struct inode *inode) -	__releases(inode->i_lock) -	__acquires(inode->i_lock) +	__releases(ci->i_ceph_lock) +	__acquires(ci->i_ceph_lock)  {  	u32 namelen;  	u32 numattr = 0; @@ -369,7 +590,7 @@ start:  		end = p + ci->i_xattrs.blob->vec.iov_len;  		ceph_decode_32_safe(&p, end, numattr, bad);  		xattr_version = ci->i_xattrs.version; -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),  				 GFP_NOFS); @@ -384,12 
+605,13 @@ start:  				goto bad_lock;  		} -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		if (ci->i_xattrs.version != xattr_version) {  			/* lost a race, retry */  			for (i = 0; i < numattr; i++)  				kfree(xattrs[i]);  			kfree(xattrs); +			xattrs = NULL;  			goto start;  		}  		err = -EIO; @@ -403,7 +625,7 @@ start:  			p += len;  			err = __set_xattr(ci, name, namelen, val, len, -					  0, 0, 0, &xattrs[numattr]); +					  0, 0, &xattrs[numattr]);  			if (err < 0)  				goto bad; @@ -415,7 +637,7 @@ start:  	return err;  bad_lock: -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  bad:  	if (xattrs) {  		for (i = 0; i < numattr; i++) @@ -492,24 +714,25 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)  	}  } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,  		      size_t size)  { -	struct inode *inode = dentry->d_inode;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);  	int err;  	struct ceph_inode_xattr *xattr; -	struct ceph_vxattr_cb *vxattr = NULL; +	struct ceph_vxattr *vxattr = NULL;  	if (!ceph_is_valid_xattr(name))  		return -ENODATA;  	/* let's see if a virtual xattr was requested */ -	if (vxattrs) -		vxattr = ceph_match_vxattr(vxattrs, name); +	vxattr = ceph_match_vxattr(inode, name); +	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { +		err = vxattr->getxattr_cb(ci, value, size); +		return err; +	} -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,  	     ci->i_xattrs.version, ci->i_xattrs.index_version); @@ -517,19 +740,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,  	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {  		goto get_xattr;  	} else { -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		/* get 
xattrs from mds (if we don't already have them) */  		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);  		if (err)  			return err;  	} -	spin_lock(&inode->i_lock); - -	if (vxattr && vxattr->readonly) { -		err = vxattr->getxattr_cb(ci, value, size); -		goto out; -	} +	spin_lock(&ci->i_ceph_lock);  	err = __build_xattrs(inode);  	if (err < 0) @@ -538,11 +756,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,  get_xattr:  	err = -ENODATA;  /* == ENOATTR */  	xattr = __get_xattr(ci, name); -	if (!xattr) { -		if (vxattr) -			err = vxattr->getxattr_cb(ci, value, size); +	if (!xattr)  		goto out; -	}  	err = -ERANGE;  	if (size && size < xattr->val_len) @@ -555,22 +770,31 @@ get_xattr:  	memcpy(value, xattr->val, xattr->val_len);  out: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return err;  } +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +		      size_t size) +{ +	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) +		return generic_getxattr(dentry, name, value, size); + +	return __ceph_getxattr(dentry->d_inode, name, value, size); +} +  ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)  {  	struct inode *inode = dentry->d_inode;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); +	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);  	u32 vir_namelen = 0;  	u32 namelen;  	int err;  	u32 len;  	int i; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	dout("listxattr %p ver=%lld index_ver=%lld\n", inode,  	     ci->i_xattrs.version, ci->i_xattrs.index_version); @@ -578,45 +802,53 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)  	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {  		goto list_xattr;  	} else { -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);  		if (err)  			
return err;  	} -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  	err = __build_xattrs(inode);  	if (err < 0)  		goto out;  list_xattr: -	vir_namelen = 0; -	/* include virtual dir xattrs */ -	if (vxattrs) -		for (i = 0; vxattrs[i].name; i++) -			vir_namelen += strlen(vxattrs[i].name) + 1; +	/* +	 * Start with virtual dir xattr names (if any) (including +	 * terminating '\0' characters for each). +	 */ +	vir_namelen = ceph_vxattrs_name_size(vxattrs); +  	/* adding 1 byte per each variable due to the null termination */ -	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; +	namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;  	err = -ERANGE; -	if (size && namelen > size) +	if (size && vir_namelen + namelen > size)  		goto out; -	err = namelen; +	err = namelen + vir_namelen;  	if (size == 0)  		goto out;  	names = __copy_xattr_names(ci, names);  	/* virtual xattr names, too */ -	if (vxattrs) +	err = namelen; +	if (vxattrs) {  		for (i = 0; vxattrs[i].name; i++) { -			len = sprintf(names, "%s", vxattrs[i].name); -			names += len + 1; +			if (!vxattrs[i].hidden && +			    !(vxattrs[i].exists_cb && +			      !vxattrs[i].exists_cb(ci))) { +				len = sprintf(names, "%s", vxattrs[i].name); +				names += len + 1; +				err += len + 1; +			}  		} +	}  out: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock);  	return err;  } @@ -626,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,  	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);  	struct inode *inode = dentry->d_inode;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct inode *parent_inode = dentry->d_parent->d_inode;  	struct ceph_mds_request *req;  	struct ceph_mds_client *mdsc = fsc->mdsc;  	int err; @@ -655,6 +886,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,  	dout("setxattr value=%.*s\n", (int)size, value); +	if (!value) +		flags |= CEPH_XATTR_REMOVE; +  	/* do request */  	req = 
ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,  				       USE_AUTH_MDS); @@ -662,7 +896,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,  		err = PTR_ERR(req);  		goto out;  	} -	req->r_inode = igrab(inode); +	req->r_inode = inode; +	ihold(inode);  	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;  	req->r_num_caps = 1;  	req->r_args.setxattr.flags = cpu_to_le32(flags); @@ -673,7 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,  	req->r_data_len = size;  	dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); -	err = ceph_mdsc_do_request(mdsc, parent_inode, req); +	err = ceph_mdsc_do_request(mdsc, NULL, req);  	ceph_mdsc_put_request(req);  	dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -686,33 +921,32 @@ out:  	return err;  } -int ceph_setxattr(struct dentry *dentry, const char *name, -		  const void *value, size_t size, int flags) +int __ceph_setxattr(struct dentry *dentry, const char *name, +			const void *value, size_t size, int flags)  {  	struct inode *inode = dentry->d_inode; +	struct ceph_vxattr *vxattr;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); +	int issued;  	int err; +	int dirty = 0;  	int name_len = strlen(name);  	int val_len = size;  	char *newname = NULL;  	char *newval = NULL;  	struct ceph_inode_xattr *xattr = NULL; -	int issued;  	int required_blob_size; -	if (ceph_snap(inode) != CEPH_NOSNAP) -		return -EROFS; -  	if (!ceph_is_valid_xattr(name))  		return -EOPNOTSUPP; -	if (vxattrs) { -		struct ceph_vxattr_cb *vxattr = -			ceph_match_vxattr(vxattrs, name); -		if (vxattr && vxattr->readonly) -			return -EOPNOTSUPP; -	} +	vxattr = ceph_match_vxattr(inode, name); +	if (vxattr && vxattr->readonly) +		return -EOPNOTSUPP; + +	/* pass any unhandled ceph.* xattrs through to the MDS */ +	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) +		goto do_sync_unlocked;  	/* preallocate memory for xattr name, 
value, index node */  	err = -ENOMEM; @@ -721,20 +955,19 @@ int ceph_setxattr(struct dentry *dentry, const char *name,  		goto out;  	if (val_len) { -		newval = kmalloc(val_len + 1, GFP_NOFS); +		newval = kmemdup(value, val_len, GFP_NOFS);  		if (!newval)  			goto out; -		memcpy(newval, value, val_len); -		newval[val_len] = '\0';  	}  	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);  	if (!xattr)  		goto out; -	spin_lock(&inode->i_lock); +	spin_lock(&ci->i_ceph_lock);  retry:  	issued = __ceph_caps_issued(ci, NULL); +	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));  	if (!(issued & CEPH_CAP_XATTR_EXCL))  		goto do_sync;  	__build_xattrs(inode); @@ -743,32 +976,37 @@ retry:  	if (!ci->i_xattrs.prealloc_blob ||  	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { -		struct ceph_buffer *blob = NULL; +		struct ceph_buffer *blob; -		spin_unlock(&inode->i_lock); +		spin_unlock(&ci->i_ceph_lock);  		dout(" preaallocating new blob size=%d\n", required_blob_size);  		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);  		if (!blob)  			goto out; -		spin_lock(&inode->i_lock); +		spin_lock(&ci->i_ceph_lock);  		if (ci->i_xattrs.prealloc_blob)  			ceph_buffer_put(ci->i_xattrs.prealloc_blob);  		ci->i_xattrs.prealloc_blob = blob;  		goto retry;  	} -	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); -	err = __set_xattr(ci, newname, name_len, newval, -			  val_len, 1, 1, 1, &xattr); -	__ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); -	ci->i_xattrs.dirty = true; -	inode->i_ctime = CURRENT_TIME; -	spin_unlock(&inode->i_lock); +	err = __set_xattr(ci, newname, name_len, newval, val_len, +			  flags, value ? 
1 : -1, &xattr); + +	if (!err) { +		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); +		ci->i_xattrs.dirty = true; +		inode->i_ctime = CURRENT_TIME; +	} +	spin_unlock(&ci->i_ceph_lock); +	if (dirty) +		__mark_inode_dirty(inode, dirty);  	return err;  do_sync: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked:  	err = ceph_sync_setxattr(dentry, name, value, size, flags);  out:  	kfree(newname); @@ -777,12 +1015,23 @@ out:  	return err;  } +int ceph_setxattr(struct dentry *dentry, const char *name, +		  const void *value, size_t size, int flags) +{ +	if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) +		return -EROFS; + +	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) +		return generic_setxattr(dentry, name, value, size, flags); + +	return __ceph_setxattr(dentry, name, value, size, flags); +} +  static int ceph_send_removexattr(struct dentry *dentry, const char *name)  {  	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);  	struct ceph_mds_client *mdsc = fsc->mdsc;  	struct inode *inode = dentry->d_inode; -	struct inode *parent_inode = dentry->d_parent->d_inode;  	struct ceph_mds_request *req;  	int err; @@ -790,56 +1039,90 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)  				       USE_AUTH_MDS);  	if (IS_ERR(req))  		return PTR_ERR(req); -	req->r_inode = igrab(inode); +	req->r_inode = inode; +	ihold(inode);  	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;  	req->r_num_caps = 1;  	req->r_path2 = kstrdup(name, GFP_NOFS); -	err = ceph_mdsc_do_request(mdsc, parent_inode, req); +	err = ceph_mdsc_do_request(mdsc, NULL, req);  	ceph_mdsc_put_request(req);  	return err;  } -int ceph_removexattr(struct dentry *dentry, const char *name) +int __ceph_removexattr(struct dentry *dentry, const char *name)  {  	struct inode *inode = dentry->d_inode; +	struct ceph_vxattr *vxattr;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);  	
int issued;  	int err; - -	if (ceph_snap(inode) != CEPH_NOSNAP) -		return -EROFS; +	int required_blob_size; +	int dirty;  	if (!ceph_is_valid_xattr(name))  		return -EOPNOTSUPP; -	if (vxattrs) { -		struct ceph_vxattr_cb *vxattr = -			ceph_match_vxattr(vxattrs, name); -		if (vxattr && vxattr->readonly) -			return -EOPNOTSUPP; -	} +	vxattr = ceph_match_vxattr(inode, name); +	if (vxattr && vxattr->readonly) +		return -EOPNOTSUPP; -	spin_lock(&inode->i_lock); -	__build_xattrs(inode); +	/* pass any unhandled ceph.* xattrs through to the MDS */ +	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) +		goto do_sync_unlocked; + +	err = -ENOMEM; +	spin_lock(&ci->i_ceph_lock); +retry:  	issued = __ceph_caps_issued(ci, NULL);  	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));  	if (!(issued & CEPH_CAP_XATTR_EXCL))  		goto do_sync; +	__build_xattrs(inode); + +	required_blob_size = __get_required_blob_size(ci, 0, 0); + +	if (!ci->i_xattrs.prealloc_blob || +	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { +		struct ceph_buffer *blob; + +		spin_unlock(&ci->i_ceph_lock); +		dout(" preaallocating new blob size=%d\n", required_blob_size); +		blob = ceph_buffer_new(required_blob_size, GFP_NOFS); +		if (!blob) +			goto out; +		spin_lock(&ci->i_ceph_lock); +		if (ci->i_xattrs.prealloc_blob) +			ceph_buffer_put(ci->i_xattrs.prealloc_blob); +		ci->i_xattrs.prealloc_blob = blob; +		goto retry; +	}  	err = __remove_xattr_by_name(ceph_inode(inode), name); -	__ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + +	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);  	ci->i_xattrs.dirty = true;  	inode->i_ctime = CURRENT_TIME; - -	spin_unlock(&inode->i_lock); - +	spin_unlock(&ci->i_ceph_lock); +	if (dirty) +		__mark_inode_dirty(inode, dirty);  	return err;  do_sync: -	spin_unlock(&inode->i_lock); +	spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked:  	err = ceph_send_removexattr(dentry, name); +out:  	return err;  } +int ceph_removexattr(struct 
dentry *dentry, const char *name) +{ +	if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) +		return -EROFS; + +	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) +		return generic_removexattr(dentry, name); + +	return __ceph_removexattr(dentry, name); +}  | 
