diff options
Diffstat (limited to 'fs/fuse')
| -rw-r--r-- | fs/fuse/Kconfig | 16 | ||||
| -rw-r--r-- | fs/fuse/control.c | 21 | ||||
| -rw-r--r-- | fs/fuse/cuse.c | 95 | ||||
| -rw-r--r-- | fs/fuse/dev.c | 559 | ||||
| -rw-r--r-- | fs/fuse/dir.c | 944 | ||||
| -rw-r--r-- | fs/fuse/file.c | 1713 | ||||
| -rw-r--r-- | fs/fuse/fuse_i.h | 216 | ||||
| -rw-r--r-- | fs/fuse/inode.c | 314 | 
8 files changed, 2997 insertions, 881 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0cf160a94ed..1b2f6c2c3aa 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -4,12 +4,24 @@ config FUSE_FS  	  With FUSE it is possible to implement a fully functional filesystem  	  in a userspace program. -	  There's also companion library: libfuse.  This library along with -	  utilities is available from the FUSE homepage: +	  There's also a companion library: libfuse2.  This library is available +	  from the FUSE homepage:  	  <http://fuse.sourceforge.net/> +	  although chances are your distribution already has that library +	  installed if you've installed the "fuse" package itself.  	  See <file:Documentation/filesystems/fuse.txt> for more information.  	  See <file:Documentation/Changes> for needed library/utility version.  	  If you want to develop a userspace FS, or if you want to use  	  a filesystem based on FUSE, answer Y or M. + +config CUSE +	tristate "Character device in Userspace support" +	depends on FUSE_FS +	help +	  This FUSE extension allows character devices to be +	  implemented in userspace. + +	  If you want to develop or use a userspace character device +	  based on CUSE, answer Y or M. diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 85542a7daf4..205e0d5d530 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)  {  	struct fuse_conn *fc;  	mutex_lock(&fuse_mutex); -	fc = file->f_path.dentry->d_inode->i_private; +	fc = file_inode(file)->i_private;  	if (fc)  		fc = fuse_conn_get(fc);  	mutex_unlock(&fuse_mutex); @@ -75,19 +75,13 @@ static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,  				     unsigned global_limit)  {  	unsigned long t; -	char tmp[32];  	unsigned limit = (1 << 16) - 1;  	int err; -	if (*ppos || count >= sizeof(tmp) - 1) -		return -EINVAL; - -	if (copy_from_user(tmp, buf, count)) +	if (*ppos)  		return -EINVAL; -	tmp[count] = '\0'; - -	err = strict_strtoul(tmp, 0, &t); +	err = kstrtoul_from_user(buf, count, 0, &t);  	if (err)  		return err; @@ -123,7 +117,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,  					      const char __user *buf,  					      size_t count, loff_t *ppos)  { -	unsigned val; +	unsigned uninitialized_var(val);  	ssize_t ret;  	ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -160,7 +154,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,  						    const char __user *buf,  						    size_t count, loff_t *ppos)  { -	unsigned val; +	unsigned uninitialized_var(val);  	ssize_t ret;  	ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -231,7 +225,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,  	if (iop)  		inode->i_op = iop;  	inode->i_fop = fop; -	inode->i_nlink = nlink; +	set_nlink(inode, nlink);  	inode->i_private = fc;  	d_add(dentry, inode);  	return dentry; @@ -347,13 +341,14 @@ static struct file_system_type fuse_ctl_fs_type = {  	.mount		= fuse_ctl_mount,  	.kill_sb	= fuse_ctl_kill_sb,  }; +MODULE_ALIAS_FS("fusectl");  int __init fuse_ctl_init(void)  {  	return register_filesystem(&fuse_ctl_fs_type);  } -void fuse_ctl_cleanup(void) +void __exit fuse_ctl_cleanup(void)  {  	unregister_filesystem(&fuse_ctl_fs_type);  } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 3e87cce5837..966ace8b243 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,6 +38,7 @@  #include <linux/device.h>  #include <linux/file.h>  #include <linux/fs.h> +#include <linux/aio.h>  #include <linux/kdev_t.h>  #include <linux/kthread.h>  #include <linux/list.h> @@ -45,8 +46,8 @@  #include <linux/miscdevice.h>  #include <linux/mutex.h>  #include <linux/slab.h> -#include <linux/spinlock.h>  #include <linux/stat.h> +#include <linux/module.h>  #include "fuse_i.h" @@ -62,7 +63,7 @@ struct cuse_conn {  	bool			unrestricted_ioctl;  }; -static DEFINE_SPINLOCK(cuse_lock);		/* protects cuse_conntbl */ +static DEFINE_MUTEX(cuse_lock);		/* protects registration */  static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];  static struct class *cuse_class; @@ -91,19 +92,29 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,  			 loff_t *ppos)  {  	loff_t pos = 0; +	struct iovec iov = { .iov_base = buf, .iov_len = count }; +	struct fuse_io_priv io = { .async = 0, .file = file }; +	struct iov_iter ii; +	iov_iter_init(&ii, READ, &iov, 1, count); -	return fuse_direct_io(file, buf, count, &pos, 0); +	return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_CUSE);  }  static ssize_t cuse_write(struct file *file, const char __user *buf,  			  size_t count, loff_t *ppos)  {  	loff_t pos = 0; +	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; +	struct fuse_io_priv io = { .async = 0, .file = file }; +	struct iov_iter ii; +	iov_iter_init(&ii, WRITE, &iov, 1, count); +  	/*  	 * No locking or generic_write_checks(), the server is  	 * responsible for locking and sanity checks.  	 */ -	return fuse_direct_io(file, buf, count, &pos, 1); +	return fuse_direct_io(&io, &ii, &pos, +			      FUSE_DIO_WRITE | FUSE_DIO_CUSE);  }  static int cuse_open(struct inode *inode, struct file *file) @@ -113,14 +124,14 @@ static int cuse_open(struct inode *inode, struct file *file)  	int rc;  	/* look up and get the connection */ -	spin_lock(&cuse_lock); +	mutex_lock(&cuse_lock);  	list_for_each_entry(pos, cuse_conntbl_head(devt), list)  		if (pos->dev->devt == devt) {  			fuse_conn_get(&pos->fc);  			cc = pos;  			break;  		} -	spin_unlock(&cuse_lock); +	mutex_unlock(&cuse_lock);  	/* dead? */  	if (!cc) @@ -266,7 +277,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)  static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)  {  	char *end = p + len; -	char *key, *val; +	char *uninitialized_var(key), *uninitialized_var(val);  	int rc;  	while (true) { @@ -304,14 +315,14 @@ static void cuse_gendev_release(struct device *dev)   */  static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  { -	struct cuse_conn *cc = fc_to_cc(fc); -	struct cuse_init_out *arg = &req->misc.cuse_init_out; +	struct cuse_conn *cc = fc_to_cc(fc), *pos; +	struct cuse_init_out *arg = req->out.args[0].value;  	struct page *page = req->pages[0];  	struct cuse_devinfo devinfo = { };  	struct device *dev;  	struct cdev *cdev;  	dev_t devt; -	int rc; +	int rc, i;  	if (req->out.h.error ||  	    arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { @@ -355,15 +366,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  	dev_set_drvdata(dev, cc);  	dev_set_name(dev, "%s", devinfo.name); +	mutex_lock(&cuse_lock); + +	/* make sure the device-name is unique */ +	for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { +		list_for_each_entry(pos, &cuse_conntbl[i], list) +			if (!strcmp(dev_name(pos->dev), dev_name(dev))) +				goto err_unlock; +	} +  	rc = device_add(dev);  	if (rc) -		goto err_device; +		goto err_unlock;  	/* register cdev */  	rc = -ENOMEM;  	cdev = cdev_alloc();  	if (!cdev) -		goto err_device; +		goto err_unlock;  	cdev->owner = THIS_MODULE;  	cdev->ops = &cuse_frontend_fops; @@ -376,25 +396,26 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  	cc->cdev = cdev;  	/* make the device available */ -	spin_lock(&cuse_lock);  	list_add(&cc->list, cuse_conntbl_head(devt)); -	spin_unlock(&cuse_lock); +	mutex_unlock(&cuse_lock);  	/* announce device availability */  	dev_set_uevent_suppress(dev, 0);  	kobject_uevent(&dev->kobj, KOBJ_ADD);  out: +	kfree(arg);  	__free_page(page);  	return;  err_cdev:  	cdev_del(cdev); -err_device: +err_unlock: +	mutex_unlock(&cuse_lock);  	put_device(dev);  err_region:  	unregister_chrdev_region(devt, 1);  err: -	fc->conn_error = 1; +	fuse_conn_kill(fc);  	goto out;  } @@ -405,10 +426,11 @@ static int cuse_send_init(struct cuse_conn *cc)  	struct page *page;  	struct fuse_conn *fc = &cc->fc;  	struct cuse_init_in *arg; +	void *outarg;  	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); -	req = fuse_get_req(fc); +	req = fuse_get_req_for_background(fc, 1);  	if (IS_ERR(req)) {  		rc = PTR_ERR(req);  		goto err; @@ -419,6 +441,10 @@ static int cuse_send_init(struct cuse_conn *cc)  	if (!page)  		goto err_put_req; +	outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); +	if (!outarg) +		goto err_free_page; +  	arg = &req->misc.cuse_init_in;  	arg->major = FUSE_KERNEL_VERSION;  	arg->minor = FUSE_KERNEL_MINOR_VERSION; @@ -429,17 +455,20 @@ static int cuse_send_init(struct cuse_conn *cc)  	req->in.args[0].value = arg;  	req->out.numargs = 2;  	req->out.args[0].size = sizeof(struct cuse_init_out); -	req->out.args[0].value = &req->misc.cuse_init_out; +	req->out.args[0].value = outarg;  	req->out.args[1].size = CUSE_INIT_INFO_MAX;  	req->out.argvar = 1;  	req->out.argpages = 1;  	req->pages[0] = page; +	req->page_descs[0].length = req->out.args[1].size;  	req->num_pages = 1;  	req->end = cuse_process_init_reply;  	fuse_request_send_background(fc, req);  	return 0; +err_free_page: +	__free_page(page);  err_put_req:  	fuse_put_request(fc, req);  err: @@ -449,7 +478,7 @@ err:  static void cuse_fc_release(struct fuse_conn *fc)  {  	struct cuse_conn *cc = fc_to_cc(fc); -	kfree(cc); +	kfree_rcu(cc, fc.rcu);  }  /** @@ -458,7 +487,7 @@ static void cuse_fc_release(struct fuse_conn *fc)   * @file: file struct being opened   *   * Userland CUSE server can create a CUSE device by opening /dev/cuse - * and replying to the initilaization request kernel sends.  This + * and replying to the initialization request kernel sends.  This   * function is responsible for handling CUSE device initialization.   * Because the fd opened by this function is used during   * initialization, this function only creates cuse_conn and sends @@ -483,7 +512,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)  	cc->fc.release = cuse_fc_release;  	cc->fc.connected = 1; -	cc->fc.blocked = 0; +	cc->fc.initialized = 1;  	rc = cuse_send_init(cc);  	if (rc) {  		fuse_conn_put(&cc->fc); @@ -511,9 +540,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file)  	int rc;  	/* remove from the conntbl, no more access from this point on */ -	spin_lock(&cuse_lock); +	mutex_lock(&cuse_lock);  	list_del_init(&cc->list); -	spin_unlock(&cuse_lock); +	mutex_unlock(&cuse_lock);  	/* remove device */  	if (cc->dev) @@ -523,8 +552,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)  		cdev_del(cc->cdev);  	} -	/* kill connection and shutdown channel */ -	fuse_conn_kill(&cc->fc);  	rc = fuse_dev_release(inode, file);	/* puts the base reference */  	return rc; @@ -546,6 +573,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev,  	return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));  } +static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);  static ssize_t cuse_class_abort_store(struct device *dev,  				      struct device_attribute *attr, @@ -556,19 +584,24 @@ static ssize_t cuse_class_abort_store(struct device *dev,  	fuse_abort_conn(&cc->fc);  	return count;  } +static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); -static struct device_attribute cuse_class_dev_attrs[] = { -	__ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL), -	__ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store), -	{ } +static struct attribute *cuse_class_dev_attrs[] = { +	&dev_attr_waiting.attr, +	&dev_attr_abort.attr, +	NULL,  }; +ATTRIBUTE_GROUPS(cuse_class_dev);  static struct miscdevice cuse_miscdev = { -	.minor		= MISC_DYNAMIC_MINOR, +	.minor		= CUSE_MINOR,  	.name		= "cuse",  	.fops		= &cuse_channel_fops,  }; +MODULE_ALIAS_MISCDEV(CUSE_MINOR); +MODULE_ALIAS("devname:cuse"); +  static int __init cuse_init(void)  {  	int i, rc; @@ -587,7 +620,7 @@ static int __init cuse_init(void)  	if (IS_ERR(cuse_class))  		return PTR_ERR(cuse_class); -	cuse_class->dev_attrs = cuse_class_dev_attrs; +	cuse_class->dev_groups = cuse_class_dev_groups;  	rc = misc_register(&cuse_miscdev);  	if (rc) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6e07696308d..ca887314aba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,6 +19,7 @@  #include <linux/pipe_fs_i.h>  #include <linux/swap.h>  #include <linux/splice.h> +#include <linux/aio.h>  MODULE_ALIAS_MISCDEV(FUSE_MINOR);  MODULE_ALIAS("devname:fuse"); @@ -34,34 +35,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file)  	return file->private_data;  } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_req *req, struct page **pages, +			      struct fuse_page_desc *page_descs, +			      unsigned npages)  {  	memset(req, 0, sizeof(*req)); +	memset(pages, 0, sizeof(*pages) * npages); +	memset(page_descs, 0, sizeof(*page_descs) * npages);  	INIT_LIST_HEAD(&req->list);  	INIT_LIST_HEAD(&req->intr_entry);  	init_waitqueue_head(&req->waitq);  	atomic_set(&req->count, 1); +	req->pages = pages; +	req->page_descs = page_descs; +	req->max_pages = npages;  } -struct fuse_req *fuse_request_alloc(void) +static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)  { -	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); -	if (req) -		fuse_request_init(req); +	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); +	if (req) { +		struct page **pages; +		struct fuse_page_desc *page_descs; + +		if (npages <= FUSE_REQ_INLINE_PAGES) { +			pages = req->inline_pages; +			page_descs = req->inline_page_descs; +		} else { +			pages = kmalloc(sizeof(struct page *) * npages, flags); +			page_descs = kmalloc(sizeof(struct fuse_page_desc) * +					     npages, flags); +		} + +		if (!pages || !page_descs) { +			kfree(pages); +			kfree(page_descs); +			kmem_cache_free(fuse_req_cachep, req); +			return NULL; +		} + +		fuse_request_init(req, pages, page_descs, npages); +	}  	return req;  } + +struct fuse_req *fuse_request_alloc(unsigned npages) +{ +	return __fuse_request_alloc(npages, GFP_KERNEL); +}  EXPORT_SYMBOL_GPL(fuse_request_alloc); -struct fuse_req *fuse_request_alloc_nofs(void) +struct fuse_req *fuse_request_alloc_nofs(unsigned npages)  { -	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); -	if (req) -		fuse_request_init(req); -	return req; +	return __fuse_request_alloc(npages, GFP_NOFS);  }  void fuse_request_free(struct fuse_req *req)  { +	if (req->pages != req->inline_pages) { +		kfree(req->pages); +		kfree(req->page_descs); +	}  	kmem_cache_free(fuse_req_cachep, req);  } @@ -78,7 +112,7 @@ static void restore_sigs(sigset_t *oldset)  	sigprocmask(SIG_SETMASK, oldset, NULL);  } -static void __fuse_get_request(struct fuse_req *req) +void __fuse_get_request(struct fuse_req *req)  {  	atomic_inc(&req->count);  } @@ -92,45 +126,71 @@ static void __fuse_put_request(struct fuse_req *req)  static void fuse_req_init_context(struct fuse_req *req)  { -	req->in.h.uid = current_fsuid(); -	req->in.h.gid = current_fsgid(); +	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); +	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());  	req->in.h.pid = current->pid;  } -struct fuse_req *fuse_get_req(struct fuse_conn *fc) +static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) +{ +	return !fc->initialized || (for_background && fc->blocked); +} + +static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, +				       bool for_background)  {  	struct fuse_req *req; -	sigset_t oldset; -	int intr;  	int err; -  	atomic_inc(&fc->num_waiting); -	block_sigs(&oldset); -	intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); -	restore_sigs(&oldset); -	err = -EINTR; -	if (intr) -		goto out; + +	if (fuse_block_alloc(fc, for_background)) { +		sigset_t oldset; +		int intr; + +		block_sigs(&oldset); +		intr = wait_event_interruptible_exclusive(fc->blocked_waitq, +				!fuse_block_alloc(fc, for_background)); +		restore_sigs(&oldset); +		err = -EINTR; +		if (intr) +			goto out; +	}  	err = -ENOTCONN;  	if (!fc->connected)  		goto out; -	req = fuse_request_alloc(); +	req = fuse_request_alloc(npages);  	err = -ENOMEM; -	if (!req) +	if (!req) { +		if (for_background) +			wake_up(&fc->blocked_waitq);  		goto out; +	}  	fuse_req_init_context(req);  	req->waiting = 1; +	req->background = for_background;  	return req;   out:  	atomic_dec(&fc->num_waiting);  	return ERR_PTR(err);  } + +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) +{ +	return __fuse_get_req(fc, npages, false); +}  EXPORT_SYMBOL_GPL(fuse_get_req); +struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, +					     unsigned npages) +{ +	return __fuse_get_req(fc, npages, true); +} +EXPORT_SYMBOL_GPL(fuse_get_req_for_background); +  /*   * Return request in fuse_file->reserved_req.  However that may   * currently be in use.  If that is the case, wait for it to become @@ -148,8 +208,7 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,  		if (ff->reserved_req) {  			req = ff->reserved_req;  			ff->reserved_req = NULL; -			get_file(file); -			req->stolen_file = file; +			req->stolen_file = get_file(file);  		}  		spin_unlock(&fc->lock);  	} while (!req); @@ -166,7 +225,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)  	struct fuse_file *ff = file->private_data;  	spin_lock(&fc->lock); -	fuse_request_init(req); +	fuse_request_init(req, req->pages, req->page_descs, req->max_pages);  	BUG_ON(ff->reserved_req);  	ff->reserved_req = req;  	wake_up_all(&fc->reserved_req_waitq); @@ -187,24 +246,37 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)   * filesystem should not have it's own file open.  If deadlock is   * intentional, it can still be broken by "aborting" the filesystem.   */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, +					     struct file *file)  {  	struct fuse_req *req;  	atomic_inc(&fc->num_waiting); -	wait_event(fc->blocked_waitq, !fc->blocked); -	req = fuse_request_alloc(); +	wait_event(fc->blocked_waitq, fc->initialized); +	req = fuse_request_alloc(0);  	if (!req)  		req = get_reserved_req(fc, file);  	fuse_req_init_context(req);  	req->waiting = 1; +	req->background = 0;  	return req;  }  void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)  {  	if (atomic_dec_and_test(&req->count)) { +		if (unlikely(req->background)) { +			/* +			 * We get here in the unlikely case that a background +			 * request was allocated but not sent +			 */ +			spin_lock(&fc->lock); +			if (!fc->blocked) +				wake_up(&fc->blocked_waitq); +			spin_unlock(&fc->lock); +		} +  		if (req->waiting)  			atomic_dec(&fc->num_waiting); @@ -251,6 +323,24 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)  	kill_fasync(&fc->fasync, SIGIO, POLL_IN);  } +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, +		       u64 nodeid, u64 nlookup) +{ +	forget->forget_one.nodeid = nodeid; +	forget->forget_one.nlookup = nlookup; + +	spin_lock(&fc->lock); +	if (fc->connected) { +		fc->forget_list_tail->next = forget; +		fc->forget_list_tail = forget; +		wake_up(&fc->waitq); +		kill_fasync(&fc->fasync, SIGIO, POLL_IN); +	} else { +		kfree(forget); +	} +	spin_unlock(&fc->lock); +} +  static void flush_bg_queue(struct fuse_conn *fc)  {  	while (fc->active_background < fc->max_background && @@ -284,10 +374,15 @@ __releases(fc->lock)  	list_del(&req->intr_entry);  	req->state = FUSE_REQ_FINISHED;  	if (req->background) { -		if (fc->num_background == fc->max_background) { +		req->background = 0; + +		if (fc->num_background == fc->max_background)  			fc->blocked = 0; -			wake_up_all(&fc->blocked_waitq); -		} + +		/* Wake up next waiter, if any */ +		if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) +			wake_up(&fc->blocked_waitq); +  		if (fc->num_background == fc->congestion_threshold &&  		    fc->connected && fc->bdi_initialized) {  			clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); @@ -389,9 +484,9 @@ __acquires(fc->lock)  	}  } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)  { -	req->isreply = 1; +	BUG_ON(req->background);  	spin_lock(&fc->lock);  	if (!fc->connected)  		req->out.h.error = -ENOTCONN; @@ -408,12 +503,18 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)  	}  	spin_unlock(&fc->lock);  } + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ +	req->isreply = 1; +	__fuse_request_send(fc, req); +}  EXPORT_SYMBOL_GPL(fuse_request_send);  static void fuse_request_send_nowait_locked(struct fuse_conn *fc,  					    struct fuse_req *req)  { -	req->background = 1; +	BUG_ON(!req->background);  	fc->num_background++;  	if (fc->num_background == fc->max_background)  		fc->blocked = 1; @@ -438,12 +539,6 @@ static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)  	}  } -void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) -{ -	req->isreply = 0; -	fuse_request_send_nowait(fc, req); -} -  void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)  {  	req->isreply = 1; @@ -480,6 +575,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,  	fuse_request_send_nowait_locked(fc, req);  } +void fuse_force_forget(struct file *file, u64 nodeid) +{ +	struct inode *inode = file_inode(file); +	struct fuse_conn *fc = get_fuse_conn(inode); +	struct fuse_req *req; +	struct fuse_forget_in inarg; + +	memset(&inarg, 0, sizeof(inarg)); +	inarg.nlookup = 1; +	req = fuse_get_req_nofail_nopages(fc, file); +	req->in.h.opcode = FUSE_FORGET; +	req->in.h.nodeid = nodeid; +	req->in.numargs = 1; +	req->in.args[0].size = sizeof(inarg); +	req->in.args[0].value = &inarg; +	req->isreply = 0; +	__fuse_request_send(fc, req); +	/* ignore errors */ +	fuse_put_request(fc, req); +} +  /*   * Lock the request.  Up to the next unlock_request() there mustn't be   * anything that could cause a page-fault.  If the request was already @@ -527,9 +643,8 @@ struct fuse_copy_state {  	unsigned long seglen;  	unsigned long addr;  	struct page *pg; -	void *mapaddr; -	void *buf;  	unsigned len; +	unsigned offset;  	unsigned move_pages:1;  }; @@ -550,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)  	if (cs->currbuf) {  		struct pipe_buffer *buf = cs->currbuf; -		if (!cs->write) { -			buf->ops->unmap(cs->pipe, buf, cs->mapaddr); -		} else { -			kunmap(buf->page); +		if (cs->write)  			buf->len = PAGE_SIZE - cs->len; -		}  		cs->currbuf = NULL; -		cs->mapaddr = NULL; -	} else if (cs->mapaddr) { -		kunmap(cs->pg); +	} else if (cs->pg) {  		if (cs->write) {  			flush_dcache_page(cs->pg);  			set_page_dirty_lock(cs->pg);  		}  		put_page(cs->pg); -		cs->mapaddr = NULL;  	} +	cs->pg = NULL;  }  /* @@ -575,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)   */  static int fuse_copy_fill(struct fuse_copy_state *cs)  { -	unsigned long offset; +	struct page *page;  	int err;  	unlock_request(cs->fc, cs->req); @@ -590,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)  			BUG_ON(!cs->nr_segs);  			cs->currbuf = buf; -			cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); +			cs->pg = buf->page; +			cs->offset = buf->offset;  			cs->len = buf->len; -			cs->buf = cs->mapaddr + buf->offset;  			cs->pipebufs++;  			cs->nr_segs--;  		} else { -			struct page *page; -  			if (cs->nr_segs == cs->pipe->buffers)  				return -EIO; @@ -610,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)  			buf->len = 0;  			cs->currbuf = buf; -			cs->mapaddr = kmap(page); -			cs->buf = cs->mapaddr; +			cs->pg = page; +			cs->offset = 0;  			cs->len = PAGE_SIZE;  			cs->pipebufs++;  			cs->nr_segs++; @@ -624,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)  			cs->iov++;  			cs->nr_segs--;  		} -		err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); +		err = get_user_pages_fast(cs->addr, 1, cs->write, &page);  		if (err < 0)  			return err;  		BUG_ON(err != 1); -		offset = cs->addr % PAGE_SIZE; -		cs->mapaddr = kmap(cs->pg); -		cs->buf = cs->mapaddr + offset; -		cs->len = min(PAGE_SIZE - offset, cs->seglen); +		cs->pg = page; +		cs->offset = cs->addr % PAGE_SIZE; +		cs->len = min(PAGE_SIZE - cs->offset, cs->seglen);  		cs->seglen -= cs->len;  		cs->addr += cs->len;  	} @@ -644,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)  {  	unsigned ncpy = min(*size, cs->len);  	if (val) { +		void *pgaddr = kmap_atomic(cs->pg); +		void *buf = pgaddr + cs->offset; +  		if (cs->write) -			memcpy(cs->buf, *val, ncpy); +			memcpy(buf, *val, ncpy);  		else -			memcpy(*val, cs->buf, ncpy); +			memcpy(*val, buf, ncpy); + +		kunmap_atomic(pgaddr);  		*val += ncpy;  	}  	*size -= ncpy;  	cs->len -= ncpy; -	cs->buf += ncpy; +	cs->offset += ncpy;  	return ncpy;  } @@ -681,8 +792,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)  	struct page *oldpage = *pagep;  	struct page *newpage;  	struct pipe_buffer *buf = cs->pipebufs; -	struct address_space *mapping; -	pgoff_t index;  	unlock_request(cs->fc, cs->req);  	fuse_copy_finish(cs); @@ -713,9 +822,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)  	if (fuse_check_page(newpage) != 0)  		goto out_fallback_unlock; -	mapping = oldpage->mapping; -	index = oldpage->index; -  	/*  	 * This is a new and locked page, it shouldn't be mapped or  	 * have any special flags on it @@ -729,14 +835,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)  	if (WARN_ON(PageMlocked(oldpage)))  		goto out_fallback_unlock; -	remove_from_page_cache(oldpage); -	page_cache_release(oldpage); - -	err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL); +	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);  	if (err) { -		printk(KERN_WARNING "fuse_try_move_page: failed to add page"); -		goto out_fallback_unlock; +		unlock_page(newpage); +		return err;  	} +  	page_cache_get(newpage);  	if (!(buf->flags & PIPE_BUF_FLAG_LRU)) @@ -765,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)  out_fallback_unlock:  	unlock_page(newpage);  out_fallback: -	cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); -	cs->buf = cs->mapaddr + buf->offset; +	cs->pg = buf->page; +	cs->offset = buf->offset;  	err = lock_request(cs->fc, cs->req);  	if (err) @@ -828,10 +932,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,  			}  		}  		if (page) { -			void *mapaddr = kmap_atomic(page, KM_USER0); +			void *mapaddr = kmap_atomic(page);  			void *buf = mapaddr + offset;  			offset += fuse_copy_do(cs, &buf, &count); -			kunmap_atomic(mapaddr, KM_USER0); +			kunmap_atomic(mapaddr);  		} else  			offset += fuse_copy_do(cs, NULL, &count);  	} @@ -846,11 +950,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,  {  	unsigned i;  	struct fuse_req *req = cs->req; -	unsigned offset = req->page_offset; -	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);  	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {  		int err; +		unsigned offset = req->page_descs[i].offset; +		unsigned count = min(nbytes, req->page_descs[i].length);  		err = fuse_copy_page(cs, &req->pages[i], offset, count,  				     zeroing); @@ -858,8 +962,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,  			return err;  		nbytes -= count; -		count = min(nbytes, (unsigned) PAGE_SIZE); -		offset = 0;  	}  	return 0;  } @@ -896,9 +998,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,  	return err;  } +static int forget_pending(struct fuse_conn *fc) +{ +	return fc->forget_list_head.next != NULL; +} +  static int request_pending(struct fuse_conn *fc)  { -	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); +	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || +		forget_pending(fc);  }  /* Wait until a request is available on the pending list */ @@ -960,6 +1068,120 @@ __releases(fc->lock)  	return err ? err : reqsize;  } +static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, +					       unsigned max, +					       unsigned *countp) +{ +	struct fuse_forget_link *head = fc->forget_list_head.next; +	struct fuse_forget_link **newhead = &head; +	unsigned count; + +	for (count = 0; *newhead != NULL && count < max; count++) +		newhead = &(*newhead)->next; + +	fc->forget_list_head.next = *newhead; +	*newhead = NULL; +	if (fc->forget_list_head.next == NULL) +		fc->forget_list_tail = &fc->forget_list_head; + +	if (countp != NULL) +		*countp = count; + +	return head; +} + +static int fuse_read_single_forget(struct fuse_conn *fc, +				   struct fuse_copy_state *cs, +				   size_t nbytes) +__releases(fc->lock) +{ +	int err; +	struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); +	struct fuse_forget_in arg = { +		.nlookup = forget->forget_one.nlookup, +	}; +	struct fuse_in_header ih = { +		.opcode = FUSE_FORGET, +		.nodeid = forget->forget_one.nodeid, +		.unique = fuse_get_unique(fc), +		.len = sizeof(ih) + sizeof(arg), +	}; + +	spin_unlock(&fc->lock); +	kfree(forget); +	if (nbytes < ih.len) +		return -EINVAL; + +	err = fuse_copy_one(cs, &ih, sizeof(ih)); +	if (!err) +		err = fuse_copy_one(cs, &arg, sizeof(arg)); +	fuse_copy_finish(cs); + +	if (err) +		return err; + +	return ih.len; +} + +static int fuse_read_batch_forget(struct fuse_conn *fc, +				   struct fuse_copy_state *cs, size_t nbytes) +__releases(fc->lock) +{ +	int err; +	unsigned max_forgets; +	unsigned count; +	struct fuse_forget_link *head; +	struct fuse_batch_forget_in arg = { .count = 0 }; +	struct fuse_in_header ih = { +		.opcode = FUSE_BATCH_FORGET, +		.unique = fuse_get_unique(fc), +		.len = sizeof(ih) + sizeof(arg), +	}; + +	if (nbytes < ih.len) { +		spin_unlock(&fc->lock); +		return -EINVAL; +	} + +	max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); +	head = dequeue_forget(fc, max_forgets, &count); +	spin_unlock(&fc->lock); + +	arg.count = count; +	ih.len += count * sizeof(struct fuse_forget_one); +	err = fuse_copy_one(cs, &ih, sizeof(ih)); +	if (!err) +		err = fuse_copy_one(cs, &arg, sizeof(arg)); + +	while (head) { +		struct fuse_forget_link *forget = head; + +		if (!err) { +			err = fuse_copy_one(cs, &forget->forget_one, +					    sizeof(forget->forget_one)); +		} +		head = forget->next; +		kfree(forget); +	} + +	fuse_copy_finish(cs); + +	if (err) +		return err; + +	return ih.len; +} + +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, +			    size_t nbytes) +__releases(fc->lock) +{ +	if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) +		return fuse_read_single_forget(fc, cs, nbytes); +	else +		return fuse_read_batch_forget(fc, cs, nbytes); +} +  /*   * Read a single request into the userspace filesystem's buffer.  This   * function waits until a request is available, then removes it from @@ -998,6 +1220,14 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,  		return fuse_read_interrupt(fc, cs, nbytes, req);  	} +	if (forget_pending(fc)) { +		if (list_empty(&fc->pending) || fc->forget_batch-- > 0) +			return fuse_read_forget(fc, cs, nbytes); + +		if (fc->forget_batch <= -8) +			fc->forget_batch = 16; +	} +  	req = list_entry(fc->pending.next, struct fuse_req, list);  	req->state = FUSE_REQ_READING;  	list_move(&req->list, &fc->io); @@ -1061,22 +1291,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,  	return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));  } -static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, -				   struct pipe_buffer *buf) -{ -	return 1; -} - -static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { -	.can_merge = 0, -	.map = generic_pipe_buf_map, -	.unmap = generic_pipe_buf_unmap, -	.confirm = generic_pipe_buf_confirm, -	.release = generic_pipe_buf_release, -	.steal = fuse_dev_pipe_buf_steal, -	.get = generic_pipe_buf_get, -}; -  static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,  				    struct pipe_inode_info *pipe,  				    size_t len, unsigned int flags) @@ -1090,7 +1304,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,  	if (!fc)  		return -EPERM; -	bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); +	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);  	if (!bufs)  		return -ENOMEM; @@ -1123,13 +1337,17 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,  		buf->page = bufs[page_nr].page;  		buf->offset = bufs[page_nr].offset;  		buf->len = bufs[page_nr].len; -		buf->ops = &fuse_dev_pipe_buf_ops; +		/* +		 * Need to be careful about this.  Having buf->ops in module +		 * code can Oops if the buffer persists after module unload. +		 */ +		buf->ops = &nosteal_pipe_buf_ops;  		pipe->nrbufs++;  		page_nr++;  		ret += buf->len; -		if (pipe->inode) +		if (pipe->files)  			do_wakeup = 1;  	} @@ -1224,6 +1442,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,  	if (outarg.namelen > FUSE_NAME_MAX)  		goto err; +	err = -EINVAL; +	if (size != sizeof(outarg) + outarg.namelen + 1) +		goto err; +  	name.name = buf;  	name.len = outarg.namelen;  	err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -1236,7 +1458,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,  	down_read(&fc->killsb);  	err = -ENOENT;  	if (fc->sb) -		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); +		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); +	up_read(&fc->killsb); +	kfree(buf); +	return err; + +err: +	kfree(buf); +	fuse_copy_finish(cs); +	return err; +} + +static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, +			      struct fuse_copy_state *cs) +{ +	struct fuse_notify_delete_out outarg; +	int err = -ENOMEM; +	char *buf; +	struct qstr name; + +	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); +	if (!buf) +		goto err; + +	err = -EINVAL; +	if (size < sizeof(outarg)) +		goto err; + +	err = fuse_copy_one(cs, &outarg, sizeof(outarg)); +	if (err) +		goto err; + +	err = -ENAMETOOLONG; +	if (outarg.namelen > FUSE_NAME_MAX) +		goto err; + +	err = -EINVAL; +	if (size != sizeof(outarg) + outarg.namelen + 1) +		goto err; + +	name.name = buf; +	name.len = outarg.namelen; +	err = fuse_copy_one(cs, buf, outarg.namelen + 1); +	if (err) +		goto err; +	fuse_copy_finish(cs); +	buf[outarg.namelen] = 0; +	name.hash = full_name_hash(name.name, name.len); + +	down_read(&fc->killsb); +	err = -ENOENT; +	if (fc->sb) +		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, +					       outarg.child, &name);  	up_read(&fc->killsb);  	kfree(buf);  	return err; @@ -1308,7 +1582,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,  		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);  		err = fuse_copy_page(cs, &page, offset, this_num, 0); -		if (!err && offset == 0 && (num != 0 || file_size == end)) +		if (!err && offset == 0 && +		    (this_num == PAGE_CACHE_SIZE || file_size == end))  			SetPageUptodate(page);  		unlock_page(page);  		page_cache_release(page); @@ -1334,7 +1609,7 @@ out_finish:  static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)  { -	release_pages(req->pages, req->num_pages, 0); +	release_pages(req->pages, req->num_pages, false);  }  static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, @@ -1348,29 +1623,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,  	unsigned int num;  	unsigned int offset;  	size_t total_len = 0; +	int num_pages; + +	offset = outarg->offset & ~PAGE_CACHE_MASK; +	file_size = i_size_read(inode); -	req = fuse_get_req(fc); +	num = outarg->size; +	if (outarg->offset > file_size) +		num = 0; +	else if (outarg->offset + num > file_size) +		num = file_size - outarg->offset; + +	num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; +	num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + +	req = fuse_get_req(fc, num_pages);  	if (IS_ERR(req))  		return PTR_ERR(req); -	offset = outarg->offset & ~PAGE_CACHE_MASK; -  	req->in.h.opcode = FUSE_NOTIFY_REPLY;  	req->in.h.nodeid = outarg->nodeid;  	req->in.numargs = 2;  	req->in.argpages = 1; -	req->page_offset = offset; +	req->page_descs[0].offset = offset;  	req->end = fuse_retrieve_end;  	index = outarg->offset >> PAGE_CACHE_SHIFT; -	file_size = i_size_read(inode); -	num = outarg->size; -	if (outarg->offset > file_size) -		num = 0; -	else if (outarg->offset + num > file_size) -		num = file_size - outarg->offset; -	while (num) { +	while (num && req->num_pages < num_pages) {  		struct page *page;  		unsigned int this_num; @@ -1380,10 +1660,13 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,  		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);  		req->pages[req->num_pages] = page; +		req->page_descs[req->num_pages].length = this_num;  		req->num_pages++; +		offset = 0;  		num -= this_num;  		total_len += this_num; +		index++;  	}  	req->misc.retrieve_in.offset = outarg->offset;  	req->misc.retrieve_in.size = total_len; @@ -1454,6 +1737,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,  	case FUSE_NOTIFY_RETRIEVE:  		return fuse_notify_retrieve(fc, size, cs); +	case FUSE_NOTIFY_DELETE: +		return fuse_notify_delete(fc, size, cs); +  	default:  		fuse_copy_finish(cs);  		return -EINVAL; @@ -1463,11 +1749,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,  /* Look up request on processing list by unique ID */  static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)  { -	struct list_head *entry; +	struct fuse_req *req; -	list_for_each(entry, &fc->processing) { -		struct fuse_req *req; -		req = list_entry(entry, struct fuse_req, list); +	list_for_each_entry(req, &fc->processing, list) {  		if (req->in.h.unique == unique || req->intr_unique == unique)  			return req;  	} @@ -1626,7 +1910,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,  	if (!fc)  		return -EPERM; -	bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); +	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);  	if (!bufs)  		return -ENOMEM; @@ -1770,6 +2054,23 @@ __acquires(fc->lock)  	flush_bg_queue(fc);  	end_requests(fc, &fc->pending);  	end_requests(fc, &fc->processing); +	while (forget_pending(fc)) +		kfree(dequeue_forget(fc, 1, NULL)); +} + +static void end_polls(struct fuse_conn *fc) +{ +	struct rb_node *p; + +	p = rb_first(&fc->polled_files); + +	while (p) { +		struct fuse_file *ff; +		ff = rb_entry(p, struct fuse_file, polled_node); +		wake_up_interruptible_all(&ff->poll_wait); + +		p = rb_next(p); +	}  }  /* @@ -1797,8 +2098,10 @@ void fuse_abort_conn(struct fuse_conn *fc)  	if (fc->connected) {  		fc->connected = 0;  		fc->blocked = 0; +		fc->initialized = 1;  		end_io_requests(fc);  		end_queued_requests(fc); +		end_polls(fc);  		wake_up_all(&fc->waitq);  		wake_up_all(&fc->blocked_waitq);  		kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -1814,7 +2117,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)  		spin_lock(&fc->lock);  		fc->connected = 0;  		fc->blocked = 0; +		fc->initialized = 1;  		end_queued_requests(fc); +		end_polls(fc);  		wake_up_all(&fc->blocked_waitq);  		spin_unlock(&fc->lock);  		fuse_conn_put(fc); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c9627c95482..0c6048247a3 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -10,9 +10,32 @@  #include <linux/pagemap.h>  #include <linux/file.h> -#include <linux/gfp.h>  #include <linux/sched.h>  #include <linux/namei.h> +#include <linux/slab.h> + +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ +	struct fuse_conn *fc = get_fuse_conn(dir); +	struct fuse_inode *fi = get_fuse_inode(dir); + +	if (!fc->do_readdirplus) +		return false; +	if (!fc->readdirplus_auto) +		return true; +	if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) +		return true; +	if (ctx->pos == 0) +		return true; +	return false; +} + +static void fuse_advise_use_readdirplus(struct inode *dir) +{ +	struct fuse_inode *fi = get_fuse_inode(dir); + +	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); +}  #if BITS_PER_LONG >= 64  static inline void fuse_dentry_settime(struct dentry *entry, u64 time) @@ -89,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode)  	get_fuse_inode(inode)->i_time = 0;  } +/** + * Mark the attributes as stale due to an atime change.  Avoid the invalidate if + * atime is not used. + */ +void fuse_invalidate_atime(struct inode *inode) +{ +	if (!IS_RDONLY(inode)) +		fuse_invalidate_attr(inode); +} +  /*   * Just mark the entry as stale, so that a next attempt to look it up   * will result in a new lookup call to userspace @@ -154,34 +187,44 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)   * the lookup once more.  If the lookup results in the same inode,   * then refresh the attributes, timeouts and mark the dentry valid.   */ -static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) +static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)  { -	struct inode *inode = entry->d_inode; +	struct inode *inode; +	struct dentry *parent; +	struct fuse_conn *fc; +	struct fuse_inode *fi; +	int ret; +	inode = ACCESS_ONCE(entry->d_inode);  	if (inode && is_bad_inode(inode)) -		return 0; -	else if (fuse_dentry_time(entry) < get_jiffies_64()) { +		goto invalid; +	else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || +		 (flags & LOOKUP_REVAL)) {  		int err;  		struct fuse_entry_out outarg; -		struct fuse_conn *fc;  		struct fuse_req *req; -		struct fuse_req *forget_req; -		struct dentry *parent; +		struct fuse_forget_link *forget;  		u64 attr_version;  		/* For negative dentries, always do a fresh lookup */  		if (!inode) -			return 0; +			goto invalid; + +		ret = -ECHILD; +		if (flags & LOOKUP_RCU) +			goto out;  		fc = get_fuse_conn(inode); -		req = fuse_get_req(fc); +		req = fuse_get_req_nopages(fc); +		ret = PTR_ERR(req);  		if (IS_ERR(req)) -			return 0; +			goto out; -		forget_req = fuse_get_req(fc); -		if (IS_ERR(forget_req)) { +		forget = fuse_alloc_forget(); +		if (!forget) {  			fuse_put_request(fc, req); -			return 0; +			ret = -ENOMEM; +			goto out;  		}  		attr_version = fuse_get_attr_version(fc); @@ -197,26 +240,44 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)  		if (!err && !outarg.nodeid)  			err = -ENOENT;  		if (!err) { -			struct fuse_inode *fi = get_fuse_inode(inode); +			fi = get_fuse_inode(inode);  			if (outarg.nodeid != get_node_id(inode)) { -				fuse_send_forget(fc, forget_req, -						 outarg.nodeid, 1); -				return 0; +				fuse_queue_forget(fc, forget, outarg.nodeid, 1); +				goto invalid;  			}  			spin_lock(&fc->lock);  			fi->nlookup++;  			spin_unlock(&fc->lock);  		} -		fuse_put_request(fc, forget_req); +		kfree(forget);  		if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) -			return 0; +			goto invalid;  		fuse_change_attributes(inode, &outarg.attr,  				       entry_attr_timeout(&outarg),  				       attr_version);  		fuse_change_entry_timeout(entry, &outarg); +	} else if (inode) { +		fi = get_fuse_inode(inode); +		if (flags & LOOKUP_RCU) { +			if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) +				return -ECHILD; +		} else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { +			parent = dget_parent(entry); +			fuse_advise_use_readdirplus(parent->d_inode); +			dput(parent); +		}  	} -	return 1; +	ret = 1; +out: +	return ret; + +invalid: +	ret = 0; + +	if (!(flags & LOOKUP_RCU) && check_submounts_and_drop(entry) != 0) +		ret = 1; +	goto out;  }  static int invalid_nodeid(u64 nodeid) @@ -234,32 +295,12 @@ int fuse_valid_type(int m)  		S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);  } -/* - * Add a directory inode to a dentry, ensuring that no other dentry - * refers to this inode.  Called with fc->inst_mutex. - */ -static struct dentry *fuse_d_add_directory(struct dentry *entry, -					   struct inode *inode) -{ -	struct dentry *alias = d_find_alias(inode); -	if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { -		/* This tries to shrink the subtree below alias */ -		fuse_invalidate_entry(alias); -		dput(alias); -		if (!list_empty(&inode->i_dentry)) -			return ERR_PTR(-EBUSY); -	} else { -		dput(alias); -	} -	return d_splice_alias(inode, entry); -} -  int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,  		     struct fuse_entry_out *outarg, struct inode **inode)  {  	struct fuse_conn *fc = get_fuse_conn_super(sb);  	struct fuse_req *req; -	struct fuse_req *forget_req; +	struct fuse_forget_link *forget;  	u64 attr_version;  	int err; @@ -268,14 +309,14 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,  	if (name->len > FUSE_NAME_MAX)  		goto out; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	err = PTR_ERR(req);  	if (IS_ERR(req))  		goto out; -	forget_req = fuse_get_req(fc); -	err = PTR_ERR(forget_req); -	if (IS_ERR(forget_req)) { +	forget = fuse_alloc_forget(); +	err = -ENOMEM; +	if (!forget) {  		fuse_put_request(fc, req);  		goto out;  	} @@ -301,25 +342,24 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,  			   attr_version);  	err = -ENOMEM;  	if (!*inode) { -		fuse_send_forget(fc, forget_req, outarg->nodeid, 1); +		fuse_queue_forget(fc, forget, outarg->nodeid, 1);  		goto out;  	}  	err = 0;   out_put_forget: -	fuse_put_request(fc, forget_req); +	kfree(forget);   out:  	return err;  }  static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, -				  struct nameidata *nd) +				  unsigned int flags)  {  	int err;  	struct fuse_entry_out outarg;  	struct inode *inode;  	struct dentry *newent; -	struct fuse_conn *fc = get_fuse_conn(dir);  	bool outarg_valid = true;  	err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, @@ -335,24 +375,18 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,  	if (inode && get_node_id(inode) == FUSE_ROOT_ID)  		goto out_iput; -	if (inode && S_ISDIR(inode->i_mode)) { -		mutex_lock(&fc->inst_mutex); -		newent = fuse_d_add_directory(entry, inode); -		mutex_unlock(&fc->inst_mutex); -		err = PTR_ERR(newent); -		if (IS_ERR(newent)) -			goto out_iput; -	} else { -		newent = d_splice_alias(inode, entry); -	} +	newent = d_materialise_unique(entry, inode); +	err = PTR_ERR(newent); +	if (IS_ERR(newent)) +		goto out_err;  	entry = newent ? newent : entry; -	entry->d_op = &fuse_dentry_operations;  	if (outarg_valid)  		fuse_change_entry_timeout(entry, &outarg);  	else  		fuse_invalidate_entry_cache(entry); +	fuse_advise_use_readdirplus(dir);  	return newent;   out_iput: @@ -367,32 +401,29 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,   * If the filesystem doesn't support this, then fall back to separate   * 'mknod' + 'open' requests.   */ -static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, -			    struct nameidata *nd) +static int fuse_create_open(struct inode *dir, struct dentry *entry, +			    struct file *file, unsigned flags, +			    umode_t mode, int *opened)  {  	int err;  	struct inode *inode;  	struct fuse_conn *fc = get_fuse_conn(dir);  	struct fuse_req *req; -	struct fuse_req *forget_req; +	struct fuse_forget_link *forget;  	struct fuse_create_in inarg;  	struct fuse_open_out outopen;  	struct fuse_entry_out outentry;  	struct fuse_file *ff; -	struct file *file; -	int flags = nd->intent.open.flags - 1; -	if (fc->no_create) -		return -ENOSYS; - -	if (flags & O_DIRECT) -		return -EINVAL; +	/* Userspace expects S_IFREG in create mode */ +	BUG_ON((mode & S_IFMT) != S_IFREG); -	forget_req = fuse_get_req(fc); -	if (IS_ERR(forget_req)) -		return PTR_ERR(forget_req); +	forget = fuse_alloc_forget(); +	err = -ENOMEM; +	if (!forget) +		goto out_err; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	err = PTR_ERR(req);  	if (IS_ERR(req))  		goto out_put_forget_req; @@ -429,11 +460,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,  	req->out.args[1].value = &outopen;  	fuse_request_send(fc, req);  	err = req->out.h.error; -	if (err) { -		if (err == -ENOSYS) -			fc->no_create = 1; +	if (err)  		goto out_free_ff; -	}  	err = -EIO;  	if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) @@ -448,47 +476,93 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,  	if (!inode) {  		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);  		fuse_sync_release(ff, flags); -		fuse_send_forget(fc, forget_req, outentry.nodeid, 1); -		return -ENOMEM; +		fuse_queue_forget(fc, forget, outentry.nodeid, 1); +		err = -ENOMEM; +		goto out_err;  	} -	fuse_put_request(fc, forget_req); +	kfree(forget);  	d_instantiate(entry, inode);  	fuse_change_entry_timeout(entry, &outentry);  	fuse_invalidate_attr(dir); -	file = lookup_instantiate_filp(nd, entry, generic_file_open); -	if (IS_ERR(file)) { +	err = finish_open(file, entry, generic_file_open, opened); +	if (err) {  		fuse_sync_release(ff, flags); -		return PTR_ERR(file); +	} else { +		file->private_data = fuse_file_get(ff); +		fuse_finish_open(inode, file);  	} -	file->private_data = fuse_file_get(ff); -	fuse_finish_open(inode, file); -	return 0; +	return err; - out_free_ff: +out_free_ff:  	fuse_file_free(ff); - out_put_request: +out_put_request:  	fuse_put_request(fc, req); - out_put_forget_req: -	fuse_put_request(fc, forget_req); +out_put_forget_req: +	kfree(forget); +out_err:  	return err;  } +static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t); +static int fuse_atomic_open(struct inode *dir, struct dentry *entry, +			    struct file *file, unsigned flags, +			    umode_t mode, int *opened) +{ +	int err; +	struct fuse_conn *fc = get_fuse_conn(dir); +	struct dentry *res = NULL; + +	if (d_unhashed(entry)) { +		res = fuse_lookup(dir, entry, 0); +		if (IS_ERR(res)) +			return PTR_ERR(res); + +		if (res) +			entry = res; +	} + +	if (!(flags & O_CREAT) || entry->d_inode) +		goto no_open; + +	/* Only creates */ +	*opened |= FILE_CREATED; + +	if (fc->no_create) +		goto mknod; + +	err = fuse_create_open(dir, entry, file, flags, mode, opened); +	if (err == -ENOSYS) { +		fc->no_create = 1; +		goto mknod; +	} +out_dput: +	dput(res); +	return err; + +mknod: +	err = fuse_mknod(dir, entry, mode, 0); +	if (err) +		goto out_dput; +no_open: +	return finish_no_open(file, res); +} +  /*   * Code shared between mknod, mkdir, symlink and link   */  static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,  			    struct inode *dir, struct dentry *entry, -			    int mode) +			    umode_t mode)  {  	struct fuse_entry_out outarg;  	struct inode *inode;  	int err; -	struct fuse_req *forget_req; +	struct fuse_forget_link *forget; -	forget_req = fuse_get_req(fc); -	if (IS_ERR(forget_req)) { +	forget = fuse_alloc_forget(); +	if (!forget) {  		fuse_put_request(fc, req); -		return PTR_ERR(forget_req); +		return -ENOMEM;  	}  	memset(&outarg, 0, sizeof(outarg)); @@ -515,42 +589,30 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,  	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,  			  &outarg.attr, entry_attr_timeout(&outarg), 0);  	if (!inode) { -		fuse_send_forget(fc, forget_req, outarg.nodeid, 1); +		fuse_queue_forget(fc, forget, outarg.nodeid, 1);  		return -ENOMEM;  	} -	fuse_put_request(fc, forget_req); - -	if (S_ISDIR(inode->i_mode)) { -		struct dentry *alias; -		mutex_lock(&fc->inst_mutex); -		alias = d_find_alias(inode); -		if (alias) { -			/* New directory must have moved since mkdir */ -			mutex_unlock(&fc->inst_mutex); -			dput(alias); -			iput(inode); -			return -EBUSY; -		} -		d_instantiate(entry, inode); -		mutex_unlock(&fc->inst_mutex); -	} else -		d_instantiate(entry, inode); +	kfree(forget); + +	err = d_instantiate_no_diralias(entry, inode); +	if (err) +		return err;  	fuse_change_entry_timeout(entry, &outarg);  	fuse_invalidate_attr(dir);  	return 0;   out_put_forget_req: -	fuse_put_request(fc, forget_req); +	kfree(forget);  	return err;  } -static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, +static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,  		      dev_t rdev)  {  	struct fuse_mknod_in inarg;  	struct fuse_conn *fc = get_fuse_conn(dir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -571,23 +633,17 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,  	return create_new_entry(fc, req, dir, entry, mode);  } -static int fuse_create(struct inode *dir, struct dentry *entry, int mode, -		       struct nameidata *nd) +static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, +		       bool excl)  { -	if (nd && (nd->flags & LOOKUP_OPEN)) { -		int err = fuse_create_open(dir, entry, mode, nd); -		if (err != -ENOSYS) -			return err; -		/* Fall back on mknod */ -	}  	return fuse_mknod(dir, entry, mode, 0);  } -static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) +static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)  {  	struct fuse_mkdir_in inarg;  	struct fuse_conn *fc = get_fuse_conn(dir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -611,7 +667,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,  {  	struct fuse_conn *fc = get_fuse_conn(dir);  	unsigned len = strlen(link) + 1; -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -624,11 +680,19 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,  	return create_new_entry(fc, req, dir, entry, S_IFLNK);  } +static inline void fuse_update_ctime(struct inode *inode) +{ +	if (!IS_NOCMTIME(inode)) { +		inode->i_ctime = current_fs_time(inode->i_sb); +		mark_inode_dirty_sync(inode); +	} +} +  static int fuse_unlink(struct inode *dir, struct dentry *entry)  {  	int err;  	struct fuse_conn *fc = get_fuse_conn(dir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -642,16 +706,23 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)  	fuse_put_request(fc, req);  	if (!err) {  		struct inode *inode = entry->d_inode; +		struct fuse_inode *fi = get_fuse_inode(inode); +		spin_lock(&fc->lock); +		fi->attr_version = ++fc->attr_version;  		/* -		 * Set nlink to zero so the inode can be cleared, if the inode -		 * does have more links this will be discovered at the next -		 * lookup/getattr. +		 * If i_nlink == 0 then unlink doesn't make sense, yet this can +		 * happen if userspace filesystem is careless.  It would be +		 * difficult to enforce correct nlink usage so just ignore this +		 * condition here  		 */ -		clear_nlink(inode); +		if (inode->i_nlink > 0) +			drop_nlink(inode); +		spin_unlock(&fc->lock);  		fuse_invalidate_attr(inode);  		fuse_invalidate_attr(dir);  		fuse_invalidate_entry_cache(entry); +		fuse_update_ctime(inode);  	} else if (err == -EINTR)  		fuse_invalidate_entry(entry);  	return err; @@ -661,7 +732,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)  {  	int err;  	struct fuse_conn *fc = get_fuse_conn(dir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -682,22 +753,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)  	return err;  } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, -		       struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, +			      struct inode *newdir, struct dentry *newent, +			      unsigned int flags, int opcode, size_t argsize)  {  	int err; -	struct fuse_rename_in inarg; +	struct fuse_rename2_in inarg;  	struct fuse_conn *fc = get_fuse_conn(olddir); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req; + +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); -	memset(&inarg, 0, sizeof(inarg)); +	memset(&inarg, 0, argsize);  	inarg.newdir = get_node_id(newdir); -	req->in.h.opcode = FUSE_RENAME; +	inarg.flags = flags; +	req->in.h.opcode = opcode;  	req->in.h.nodeid = get_node_id(olddir);  	req->in.numargs = 3; -	req->in.args[0].size = sizeof(inarg); +	req->in.args[0].size = argsize;  	req->in.args[0].value = &inarg;  	req->in.args[1].size = oldent->d_name.len + 1;  	req->in.args[1].value = oldent->d_name.name; @@ -709,15 +784,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,  	if (!err) {  		/* ctime changes */  		fuse_invalidate_attr(oldent->d_inode); +		fuse_update_ctime(oldent->d_inode); + +		if (flags & RENAME_EXCHANGE) { +			fuse_invalidate_attr(newent->d_inode); +			fuse_update_ctime(newent->d_inode); +		}  		fuse_invalidate_attr(olddir);  		if (olddir != newdir)  			fuse_invalidate_attr(newdir);  		/* newent will end up negative */ -		if (newent->d_inode) { +		if (!(flags & RENAME_EXCHANGE) && newent->d_inode) {  			fuse_invalidate_attr(newent->d_inode);  			fuse_invalidate_entry_cache(newent); +			fuse_update_ctime(newent->d_inode);  		}  	} else if (err == -EINTR) {  		/* If request was interrupted, DEITY only knows if the @@ -733,6 +815,42 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,  	return err;  } +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, +			struct inode *newdir, struct dentry *newent, +			unsigned int flags) +{ +	struct fuse_conn *fc = get_fuse_conn(olddir); +	int err; + +	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) +		return -EINVAL; + +	if (flags) { +		if (fc->no_rename2 || fc->minor < 23) +			return -EINVAL; + +		err = fuse_rename_common(olddir, oldent, newdir, newent, flags, +					 FUSE_RENAME2, +					 sizeof(struct fuse_rename2_in)); +		if (err == -ENOSYS) { +			fc->no_rename2 = 1; +			err = -EINVAL; +		} +	} else { +		err = fuse_rename_common(olddir, oldent, newdir, newent, 0, +					 FUSE_RENAME, +					 sizeof(struct fuse_rename_in)); +	} + +	return err; +} + +static int fuse_rename(struct inode *olddir, struct dentry *oldent, +		       struct inode *newdir, struct dentry *newent) +{ +	return fuse_rename2(olddir, oldent, newdir, newent, 0); +} +  static int fuse_link(struct dentry *entry, struct inode *newdir,  		     struct dentry *newent)  { @@ -740,7 +858,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,  	struct fuse_link_in inarg;  	struct inode *inode = entry->d_inode;  	struct fuse_conn *fc = get_fuse_conn(inode); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -759,20 +877,42 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,  	   will reflect changes in the backing inode (link count,  	   etc.)  	*/ -	if (!err || err == -EINTR) +	if (!err) { +		struct fuse_inode *fi = get_fuse_inode(inode); + +		spin_lock(&fc->lock); +		fi->attr_version = ++fc->attr_version; +		inc_nlink(inode); +		spin_unlock(&fc->lock);  		fuse_invalidate_attr(inode); +		fuse_update_ctime(inode); +	} else if (err == -EINTR) { +		fuse_invalidate_attr(inode); +	}  	return err;  }  static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,  			  struct kstat *stat)  { +	unsigned int blkbits; +	struct fuse_conn *fc = get_fuse_conn(inode); + +	/* see the comment in fuse_change_attributes() */ +	if (fc->writeback_cache && S_ISREG(inode->i_mode)) { +		attr->size = i_size_read(inode); +		attr->mtime = inode->i_mtime.tv_sec; +		attr->mtimensec = inode->i_mtime.tv_nsec; +		attr->ctime = inode->i_ctime.tv_sec; +		attr->ctimensec = inode->i_ctime.tv_nsec; +	} +  	stat->dev = inode->i_sb->s_dev;  	stat->ino = attr->ino;  	stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);  	stat->nlink = attr->nlink; -	stat->uid = attr->uid; -	stat->gid = attr->gid; +	stat->uid = make_kuid(&init_user_ns, attr->uid); +	stat->gid = make_kgid(&init_user_ns, attr->gid);  	stat->rdev = inode->i_rdev;  	stat->atime.tv_sec = attr->atime;  	stat->atime.tv_nsec = attr->atimensec; @@ -782,7 +922,13 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,  	stat->ctime.tv_nsec = attr->ctimensec;  	stat->size = attr->size;  	stat->blocks = attr->blocks; -	stat->blksize = (1 << inode->i_blkbits); + +	if (attr->blksize != 0) +		blkbits = ilog2(attr->blksize); +	else +		blkbits = inode->i_sb->s_blocksize_bits; + +	stat->blksize = 1 << blkbits;  }  static int fuse_do_getattr(struct inode *inode, struct kstat *stat, @@ -795,7 +941,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,  	struct fuse_req *req;  	u64 attr_version; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -846,7 +992,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,  	int err;  	bool r; -	if (fi->i_time < get_jiffies_64()) { +	if (time_before64(fi->i_time, get_jiffies_64())) {  		r = true;  		err = fuse_do_getattr(inode, stat, file);  	} else { @@ -855,6 +1001,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,  		if (stat) {  			generic_fillattr(inode, stat);  			stat->mode = fi->orig_i_mode; +			stat->ino = fi->orig_ino;  		}  	} @@ -865,7 +1012,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,  }  int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, -			     struct qstr *name) +			     u64 child_nodeid, struct qstr *name)  {  	int err = -ENOTDIR;  	struct inode *parent; @@ -892,8 +1039,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,  	fuse_invalidate_attr(parent);  	fuse_invalidate_entry(entry); + +	if (child_nodeid != 0 && entry->d_inode) { +		mutex_lock(&entry->d_inode->i_mutex); +		if (get_node_id(entry->d_inode) != child_nodeid) { +			err = -ENOENT; +			goto badentry; +		} +		if (d_mountpoint(entry)) { +			err = -EBUSY; +			goto badentry; +		} +		if (S_ISDIR(entry->d_inode->i_mode)) { +			shrink_dcache_parent(entry); +			if (!simple_empty(entry)) { +				err = -ENOTEMPTY; +				goto badentry; +			} +			entry->d_inode->i_flags |= S_DEAD; +		} +		dont_mount(entry); +		clear_nlink(entry->d_inode); +		err = 0; + badentry: +		mutex_unlock(&entry->d_inode->i_mutex); +		if (!err) +			d_delete(entry); +	} else { +		err = 0; +	}  	dput(entry); -	err = 0;   unlock:  	mutex_unlock(&parent->i_mutex); @@ -903,7 +1078,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,  /*   * Calling into a user-controlled filesystem gives the filesystem - * daemon ptrace-like capabilities over the requester process.  This + * daemon ptrace-like capabilities over the current process.  This   * means, that the filesystem daemon is able to record the exact   * filesystem operations performed, and can also control the behavior   * of the requester process in otherwise impossible ways.  For example @@ -914,27 +1089,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,   * for which the owner of the mount has ptrace privilege.  This   * excludes processes started by other users, suid or sgid processes.   */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +int fuse_allow_current_process(struct fuse_conn *fc)  {  	const struct cred *cred; -	int ret;  	if (fc->flags & FUSE_ALLOW_OTHER)  		return 1; -	rcu_read_lock(); -	ret = 0; -	cred = __task_cred(task); -	if (cred->euid == fc->user_id && -	    cred->suid == fc->user_id && -	    cred->uid  == fc->user_id && -	    cred->egid == fc->group_id && -	    cred->sgid == fc->group_id && -	    cred->gid  == fc->group_id) -		ret = 1; -	rcu_read_unlock(); +	cred = current_cred(); +	if (uid_eq(cred->euid, fc->user_id) && +	    uid_eq(cred->suid, fc->user_id) && +	    uid_eq(cred->uid,  fc->user_id) && +	    gid_eq(cred->egid, fc->group_id) && +	    gid_eq(cred->sgid, fc->group_id) && +	    gid_eq(cred->gid,  fc->group_id)) +		return 1; -	return ret; +	return 0;  }  static int fuse_access(struct inode *inode, int mask) @@ -944,10 +1115,12 @@ static int fuse_access(struct inode *inode, int mask)  	struct fuse_access_in inarg;  	int err; +	BUG_ON(mask & MAY_NOT_BLOCK); +  	if (fc->no_access)  		return 0; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -968,6 +1141,14 @@ static int fuse_access(struct inode *inode, int mask)  	return err;  } +static int fuse_perm_getattr(struct inode *inode, int mask) +{ +	if (mask & MAY_NOT_BLOCK) +		return -ECHILD; + +	return fuse_do_getattr(inode, NULL, NULL); +} +  /*   * Check permission.  The two basic access models of FUSE are:   * @@ -987,7 +1168,7 @@ static int fuse_permission(struct inode *inode, int mask)  	bool refreshed = false;  	int err = 0; -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	/* @@ -995,21 +1176,27 @@ static int fuse_permission(struct inode *inode, int mask)  	 */  	if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||  	    ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { -		err = fuse_update_attributes(inode, NULL, NULL, &refreshed); -		if (err) -			return err; +		struct fuse_inode *fi = get_fuse_inode(inode); + +		if (time_before64(fi->i_time, get_jiffies_64())) { +			refreshed = true; + +			err = fuse_perm_getattr(inode, mask); +			if (err) +				return err; +		}  	}  	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { -		err = generic_permission(inode, mask, NULL); +		err = generic_permission(inode, mask);  		/* If permission is denied, try to refresh file  		   attributes.  This is also needed, because the root  		   node will at first have no permissions */  		if (err == -EACCES && !refreshed) { -			err = fuse_do_getattr(inode, NULL, NULL); +			err = fuse_perm_getattr(inode, mask);  			if (!err) -				err = generic_permission(inode, mask, NULL); +				err = generic_permission(inode, mask);  		}  		/* Note: the opposite of the above test does not @@ -1023,7 +1210,7 @@ static int fuse_permission(struct inode *inode, int mask)  			if (refreshed)  				return -EACCES; -			err = fuse_do_getattr(inode, NULL, NULL); +			err = fuse_perm_getattr(inode, mask);  			if (!err && !(inode->i_mode & S_IXUGO))  				return -EACCES;  		} @@ -1032,43 +1219,197 @@ static int fuse_permission(struct inode *inode, int mask)  }  static int parse_dirfile(char *buf, size_t nbytes, struct file *file, -			 void *dstbuf, filldir_t filldir) +			 struct dir_context *ctx)  {  	while (nbytes >= FUSE_NAME_OFFSET) {  		struct fuse_dirent *dirent = (struct fuse_dirent *) buf;  		size_t reclen = FUSE_DIRENT_SIZE(dirent); -		int over;  		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)  			return -EIO;  		if (reclen > nbytes)  			break; +		if (memchr(dirent->name, '/', dirent->namelen) != NULL) +			return -EIO; -		over = filldir(dstbuf, dirent->name, dirent->namelen, -			       file->f_pos, dirent->ino, dirent->type); -		if (over) +		if (!dir_emit(ctx, dirent->name, dirent->namelen, +			       dirent->ino, dirent->type))  			break;  		buf += reclen;  		nbytes -= reclen; -		file->f_pos = dirent->off; +		ctx->pos = dirent->off;  	}  	return 0;  } -static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +static int fuse_direntplus_link(struct file *file, +				struct fuse_direntplus *direntplus, +				u64 attr_version)  {  	int err; +	struct fuse_entry_out *o = &direntplus->entry_out; +	struct fuse_dirent *dirent = &direntplus->dirent; +	struct dentry *parent = file->f_path.dentry; +	struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); +	struct dentry *dentry; +	struct dentry *alias; +	struct inode *dir = parent->d_inode; +	struct fuse_conn *fc; +	struct inode *inode; + +	if (!o->nodeid) { +		/* +		 * Unlike in the case of fuse_lookup, zero nodeid does not mean +		 * ENOENT. Instead, it only means the userspace filesystem did +		 * not want to return attributes/handle for this entry. +		 * +		 * So do nothing. +		 */ +		return 0; +	} + +	if (name.name[0] == '.') { +		/* +		 * We could potentially refresh the attributes of the directory +		 * and its parent? +		 */ +		if (name.len == 1) +			return 0; +		if (name.name[1] == '.' && name.len == 2) +			return 0; +	} + +	if (invalid_nodeid(o->nodeid)) +		return -EIO; +	if (!fuse_valid_type(o->attr.mode)) +		return -EIO; + +	fc = get_fuse_conn(dir); + +	name.hash = full_name_hash(name.name, name.len); +	dentry = d_lookup(parent, &name); +	if (dentry) { +		inode = dentry->d_inode; +		if (!inode) { +			d_drop(dentry); +		} else if (get_node_id(inode) != o->nodeid || +			   ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { +			err = d_invalidate(dentry); +			if (err) +				goto out; +		} else if (is_bad_inode(inode)) { +			err = -EIO; +			goto out; +		} else { +			struct fuse_inode *fi; +			fi = get_fuse_inode(inode); +			spin_lock(&fc->lock); +			fi->nlookup++; +			spin_unlock(&fc->lock); + +			fuse_change_attributes(inode, &o->attr, +					       entry_attr_timeout(o), +					       attr_version); + +			/* +			 * The other branch to 'found' comes via fuse_iget() +			 * which bumps nlookup inside +			 */ +			goto found; +		} +		dput(dentry); +	} + +	dentry = d_alloc(parent, &name); +	err = -ENOMEM; +	if (!dentry) +		goto out; + +	inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, +			  &o->attr, entry_attr_timeout(o), attr_version); +	if (!inode) +		goto out; + +	alias = d_materialise_unique(dentry, inode); +	err = PTR_ERR(alias); +	if (IS_ERR(alias)) +		goto out; + +	if (alias) { +		dput(dentry); +		dentry = alias; +	} + +found: +	if (fc->readdirplus_auto) +		set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); +	fuse_change_entry_timeout(dentry, o); + +	err = 0; +out: +	dput(dentry); +	return err; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, +			     struct dir_context *ctx, u64 attr_version) +{ +	struct fuse_direntplus *direntplus; +	struct fuse_dirent *dirent; +	size_t reclen; +	int over = 0; +	int ret; + +	while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { +		direntplus = (struct fuse_direntplus *) buf; +		dirent = &direntplus->dirent; +		reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + +		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) +			return -EIO; +		if (reclen > nbytes) +			break; +		if (memchr(dirent->name, '/', dirent->namelen) != NULL) +			return -EIO; + +		if (!over) { +			/* We fill entries into dstbuf only as much as +			   it can hold. But we still continue iterating +			   over remaining entries to link them. If not, +			   we need to send a FORGET for each of those +			   which we did not link. +			*/ +			over = !dir_emit(ctx, dirent->name, dirent->namelen, +				       dirent->ino, dirent->type); +			ctx->pos = dirent->off; +		} + +		buf += reclen; +		nbytes -= reclen; + +		ret = fuse_direntplus_link(file, direntplus, attr_version); +		if (ret) +			fuse_force_forget(file, direntplus->entry_out.nodeid); +	} + +	return 0; +} + +static int fuse_readdir(struct file *file, struct dir_context *ctx) +{ +	int plus, err;  	size_t nbytes;  	struct page *page; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_req *req; +	u64 attr_version = 0;  	if (is_bad_inode(inode))  		return -EIO; -	req = fuse_get_req(fc); +	req = fuse_get_req(fc, 1);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1077,20 +1418,37 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)  		fuse_put_request(fc, req);  		return -ENOMEM;  	} + +	plus = fuse_use_readdirplus(inode, ctx);  	req->out.argpages = 1;  	req->num_pages = 1;  	req->pages[0] = page; -	fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); +	req->page_descs[0].length = PAGE_SIZE; +	if (plus) { +		attr_version = fuse_get_attr_version(fc); +		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, +			       FUSE_READDIRPLUS); +	} else { +		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, +			       FUSE_READDIR); +	}  	fuse_request_send(fc, req);  	nbytes = req->out.args[0].size;  	err = req->out.h.error;  	fuse_put_request(fc, req); -	if (!err) -		err = parse_dirfile(page_address(page), nbytes, file, dstbuf, -				    filldir); +	if (!err) { +		if (plus) { +			err = parse_dirplusfile(page_address(page), nbytes, +						file, ctx, +						attr_version); +		} else { +			err = parse_dirfile(page_address(page), nbytes, file, +					    ctx); +		} +	}  	__free_page(page); -	fuse_invalidate_attr(inode); /* atime changed */ +	fuse_invalidate_atime(inode);  	return err;  } @@ -1098,7 +1456,7 @@ static char *read_link(struct dentry *dentry)  {  	struct inode *inode = dentry->d_inode;  	struct fuse_conn *fc = get_fuse_conn(inode); -	struct fuse_req *req = fuse_get_req(fc); +	struct fuse_req *req = fuse_get_req_nopages(fc);  	char *link;  	if (IS_ERR(req)) @@ -1123,7 +1481,7 @@ static char *read_link(struct dentry *dentry)  		link[req->out.args[0].size] = '\0';   out:  	fuse_put_request(fc, req); -	fuse_invalidate_attr(inode); /* atime changed */ +	fuse_invalidate_atime(inode);  	return link;  } @@ -1156,17 +1514,46 @@ static int fuse_dir_release(struct inode *inode, struct file *file)  	return 0;  } -static int fuse_dir_fsync(struct file *file, int datasync) +static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, +			  int datasync) +{ +	return fuse_fsync_common(file, start, end, datasync, 1); +} + +static long fuse_dir_ioctl(struct file *file, unsigned int cmd, +			    unsigned long arg) +{ +	struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + +	/* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ +	if (fc->minor < 18) +		return -ENOTTY; + +	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); +} + +static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, +				   unsigned long arg)  { -	return fuse_fsync_common(file, datasync, 1); +	struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); + +	if (fc->minor < 18) +		return -ENOTTY; + +	return fuse_ioctl_common(file, cmd, arg, +				 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);  } -static bool update_mtime(unsigned ivalid) +static bool update_mtime(unsigned ivalid, bool trust_local_mtime)  {  	/* Always update if mtime is explicitly set  */  	if (ivalid & ATTR_MTIME_SET)  		return true; +	/* Or if kernel i_mtime is the official one */ +	if (trust_local_mtime) +		return true; +  	/* If it's an open(O_TRUNC) or an ftruncate(), don't update */  	if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))  		return false; @@ -1175,16 +1562,17 @@ static bool update_mtime(unsigned ivalid)  	return true;  } -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, +			   bool trust_local_cmtime)  {  	unsigned ivalid = iattr->ia_valid;  	if (ivalid & ATTR_MODE)  		arg->valid |= FATTR_MODE,   arg->mode = iattr->ia_mode;  	if (ivalid & ATTR_UID) -		arg->valid |= FATTR_UID,    arg->uid = iattr->ia_uid; +		arg->valid |= FATTR_UID,    arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);  	if (ivalid & ATTR_GID) -		arg->valid |= FATTR_GID,    arg->gid = iattr->ia_gid; +		arg->valid |= FATTR_GID,    arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);  	if (ivalid & ATTR_SIZE)  		arg->valid |= FATTR_SIZE,   arg->size = iattr->ia_size;  	if (ivalid & ATTR_ATIME) { @@ -1194,13 +1582,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)  		if (!(ivalid & ATTR_ATIME_SET))  			arg->valid |= FATTR_ATIME_NOW;  	} -	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { +	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {  		arg->valid |= FATTR_MTIME;  		arg->mtime = iattr->ia_mtime.tv_sec;  		arg->mtimensec = iattr->ia_mtime.tv_nsec; -		if (!(ivalid & ATTR_MTIME_SET)) +		if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)  			arg->valid |= FATTR_MTIME_NOW;  	} +	if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { +		arg->valid |= FATTR_CTIME; +		arg->ctime = iattr->ia_ctime.tv_sec; +		arg->ctimensec = iattr->ia_ctime.tv_nsec; +	}  }  /* @@ -1247,6 +1640,62 @@ void fuse_release_nowrite(struct inode *inode)  	spin_unlock(&fc->lock);  } +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, +			      struct inode *inode, +			      struct fuse_setattr_in *inarg_p, +			      struct fuse_attr_out *outarg_p) +{ +	req->in.h.opcode = FUSE_SETATTR; +	req->in.h.nodeid = get_node_id(inode); +	req->in.numargs = 1; +	req->in.args[0].size = sizeof(*inarg_p); +	req->in.args[0].value = inarg_p; +	req->out.numargs = 1; +	if (fc->minor < 9) +		req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; +	else +		req->out.args[0].size = sizeof(*outarg_p); +	req->out.args[0].value = outarg_p; +} + +/* + * Flush inode->i_mtime to the server + */ +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) +{ +	struct fuse_conn *fc = get_fuse_conn(inode); +	struct fuse_req *req; +	struct fuse_setattr_in inarg; +	struct fuse_attr_out outarg; +	int err; + +	req = fuse_get_req_nopages(fc); +	if (IS_ERR(req)) +		return PTR_ERR(req); + +	memset(&inarg, 0, sizeof(inarg)); +	memset(&outarg, 0, sizeof(outarg)); + +	inarg.valid = FATTR_MTIME; +	inarg.mtime = inode->i_mtime.tv_sec; +	inarg.mtimensec = inode->i_mtime.tv_nsec; +	if (fc->minor >= 23) { +		inarg.valid |= FATTR_CTIME; +		inarg.ctime = inode->i_ctime.tv_sec; +		inarg.ctimensec = inode->i_ctime.tv_nsec; +	} +	if (ff) { +		inarg.valid |= FATTR_FH; +		inarg.fh = ff->fh; +	} +	fuse_setattr_fill(fc, req, inode, &inarg, &outarg); +	fuse_request_send(fc, req); +	err = req->out.h.error; +	fuse_put_request(fc, req); + +	return err; +} +  /*   * Set attributes, and at the same time refresh them.   * @@ -1255,20 +1704,19 @@ void fuse_release_nowrite(struct inode *inode)   * vmtruncate() doesn't allow for this case, so do the rlimit checking   * and the actual truncation by hand.   */ -static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, -			   struct file *file) +int fuse_do_setattr(struct inode *inode, struct iattr *attr, +		    struct file *file)  { -	struct inode *inode = entry->d_inode;  	struct fuse_conn *fc = get_fuse_conn(inode); +	struct fuse_inode *fi = get_fuse_inode(inode);  	struct fuse_req *req;  	struct fuse_setattr_in inarg;  	struct fuse_attr_out outarg;  	bool is_truncate = false; +	bool is_wb = fc->writeback_cache;  	loff_t oldsize;  	int err; - -	if (!fuse_allow_task(fc, current)) -		return -EACCES; +	bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);  	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))  		attr->ia_valid |= ATTR_FORCE; @@ -1277,22 +1725,29 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,  	if (err)  		return err; -	if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) -		return 0; +	if (attr->ia_valid & ATTR_OPEN) { +		if (fc->atomic_o_trunc) +			return 0; +		file = NULL; +	}  	if (attr->ia_valid & ATTR_SIZE)  		is_truncate = true; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); -	if (is_truncate) +	if (is_truncate) {  		fuse_set_nowrite(inode); +		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +		if (trust_local_cmtime && attr->ia_size != inode->i_size) +			attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; +	}  	memset(&inarg, 0, sizeof(inarg));  	memset(&outarg, 0, sizeof(outarg)); -	iattr_to_fattr(attr, &inarg); +	iattr_to_fattr(attr, &inarg, trust_local_cmtime);  	if (file) {  		struct fuse_file *ff = file->private_data;  		inarg.valid |= FATTR_FH; @@ -1303,17 +1758,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,  		inarg.valid |= FATTR_LOCKOWNER;  		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);  	} -	req->in.h.opcode = FUSE_SETATTR; -	req->in.h.nodeid = get_node_id(inode); -	req->in.numargs = 1; -	req->in.args[0].size = sizeof(inarg); -	req->in.args[0].value = &inarg; -	req->out.numargs = 1; -	if (fc->minor < 9) -		req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; -	else -		req->out.args[0].size = sizeof(outarg); -	req->out.args[0].value = &outarg; +	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);  	fuse_request_send(fc, req);  	err = req->out.h.error;  	fuse_put_request(fc, req); @@ -1330,10 +1775,21 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,  	}  	spin_lock(&fc->lock); +	/* the kernel maintains i_mtime locally */ +	if (trust_local_cmtime) { +		if (attr->ia_valid & ATTR_MTIME) +			inode->i_mtime = attr->ia_mtime; +		if (attr->ia_valid & ATTR_CTIME) +			inode->i_ctime = attr->ia_ctime; +		/* FIXME: clear I_DIRTY_SYNC? */ +	} +  	fuse_change_attributes_common(inode, &outarg.attr,  				      attr_timeout(&outarg));  	oldsize = inode->i_size; -	i_size_write(inode, outarg.attr.size); +	/* see the comment in fuse_change_attributes() */ +	if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) +		i_size_write(inode, outarg.attr.size);  	if (is_truncate) {  		/* NOTE: this may release/reacquire fc->lock */ @@ -1345,26 +1801,34 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,  	 * Only call invalidate_inode_pages2() after removing  	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.  	 */ -	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { -		truncate_pagecache(inode, oldsize, outarg.attr.size); +	if ((is_truncate || !is_wb) && +	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { +		truncate_pagecache(inode, outarg.attr.size);  		invalidate_inode_pages2(inode->i_mapping);  	} +	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);  	return 0;  error:  	if (is_truncate)  		fuse_release_nowrite(inode); +	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);  	return err;  }  static int fuse_setattr(struct dentry *entry, struct iattr *attr)  { +	struct inode *inode = entry->d_inode; + +	if (!fuse_allow_current_process(get_fuse_conn(inode))) +		return -EACCES; +  	if (attr->ia_valid & ATTR_FILE) -		return fuse_do_setattr(entry, attr, attr->ia_file); +		return fuse_do_setattr(inode, attr, attr->ia_file);  	else -		return fuse_do_setattr(entry, attr, NULL); +		return fuse_do_setattr(inode, attr, NULL);  }  static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, @@ -1373,7 +1837,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,  	struct inode *inode = entry->d_inode;  	struct fuse_conn *fc = get_fuse_conn(inode); -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	return fuse_update_attributes(inode, stat, NULL, NULL); @@ -1391,7 +1855,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,  	if (fc->no_setxattr)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1414,6 +1878,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name,  		fc->no_setxattr = 1;  		err = -EOPNOTSUPP;  	} +	if (!err) { +		fuse_invalidate_attr(inode); +		fuse_update_ctime(inode); +	}  	return err;  } @@ -1430,7 +1898,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,  	if (fc->no_getxattr)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1476,13 +1944,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)  	struct fuse_getxattr_out outarg;  	ssize_t ret; -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	if (fc->no_listxattr)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1527,7 +1995,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)  	if (fc->no_removexattr)  		return -EOPNOTSUPP; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1543,6 +2011,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name)  		fc->no_removexattr = 1;  		err = -EOPNOTSUPP;  	} +	if (!err) { +		fuse_invalidate_attr(inode); +		fuse_update_ctime(inode); +	}  	return err;  } @@ -1553,9 +2025,11 @@ static const struct inode_operations fuse_dir_inode_operations = {  	.unlink		= fuse_unlink,  	.rmdir		= fuse_rmdir,  	.rename		= fuse_rename, +	.rename2	= fuse_rename2,  	.link		= fuse_link,  	.setattr	= fuse_setattr,  	.create		= fuse_create, +	.atomic_open	= fuse_atomic_open,  	.mknod		= fuse_mknod,  	.permission	= fuse_permission,  	.getattr	= fuse_getattr, @@ -1568,10 +2042,12 @@ static const struct inode_operations fuse_dir_inode_operations = {  static const struct file_operations fuse_dir_operations = {  	.llseek		= generic_file_llseek,  	.read		= generic_read_dir, -	.readdir	= fuse_readdir, +	.iterate	= fuse_readdir,  	.open		= fuse_dir_open,  	.release	= fuse_dir_release,  	.fsync		= fuse_dir_fsync, +	.unlocked_ioctl	= fuse_dir_ioctl, +	.compat_ioctl	= fuse_dir_compat_ioctl,  };  static const struct inode_operations fuse_common_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c8224587123..40ac2628ddc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -13,6 +13,10 @@  #include <linux/kernel.h>  #include <linux/sched.h>  #include <linux/module.h> +#include <linux/compat.h> +#include <linux/swap.h> +#include <linux/aio.h> +#include <linux/falloc.h>  static const struct file_operations fuse_direct_io_file_operations; @@ -23,7 +27,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,  	struct fuse_req *req;  	int err; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -55,7 +59,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)  		return NULL;  	ff->fc = fc; -	ff->reserved_req = fuse_request_alloc(); +	ff->reserved_req = fuse_request_alloc(0);  	if (unlikely(!ff->reserved_req)) {  		kfree(ff);  		return NULL; @@ -85,18 +89,62 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)  	return ff;  } +static void fuse_release_async(struct work_struct *work) +{ +	struct fuse_req *req; +	struct fuse_conn *fc; +	struct path path; + +	req = container_of(work, struct fuse_req, misc.release.work); +	path = req->misc.release.path; +	fc = get_fuse_conn(path.dentry->d_inode); + +	fuse_put_request(fc, req); +	path_put(&path); +} +  static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)  { -	path_put(&req->misc.release.path); +	if (fc->destroy_req) { +		/* +		 * If this is a fuseblk mount, then it's possible that +		 * releasing the path will result in releasing the +		 * super block and sending the DESTROY request.  If +		 * the server is single threaded, this would hang. +		 * For this reason do the path_put() in a separate +		 * thread. +		 */ +		atomic_inc(&req->count); +		INIT_WORK(&req->misc.release.work, fuse_release_async); +		schedule_work(&req->misc.release.work); +	} else { +		path_put(&req->misc.release.path); +	}  } -static void fuse_file_put(struct fuse_file *ff) +static void fuse_file_put(struct fuse_file *ff, bool sync)  {  	if (atomic_dec_and_test(&ff->count)) {  		struct fuse_req *req = ff->reserved_req; -		req->end = fuse_release_end; -		fuse_request_send_background(ff->fc, req); +		if (ff->fc->no_open) { +			/* +			 * Drop the release request when client does not +			 * implement 'open' +			 */ +			req->background = 0; +			path_put(&req->misc.release.path); +			fuse_put_request(ff->fc, req); +		} else if (sync) { +			req->background = 0; +			fuse_request_send(ff->fc, req); +			path_put(&req->misc.release.path); +			fuse_put_request(ff->fc, req); +		} else { +			req->end = fuse_release_end; +			req->background = 1; +			fuse_request_send_background(ff->fc, req); +		}  		kfree(ff);  	}  } @@ -104,36 +152,62 @@ static void fuse_file_put(struct fuse_file *ff)  int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,  		 bool isdir)  { -	struct fuse_open_out outarg;  	struct fuse_file *ff; -	int err;  	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;  	ff = fuse_file_alloc(fc);  	if (!ff)  		return -ENOMEM; -	err = fuse_send_open(fc, nodeid, file, opcode, &outarg); -	if (err) { -		fuse_file_free(ff); -		return err; +	ff->fh = 0; +	ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ +	if (!fc->no_open || isdir) { +		struct fuse_open_out outarg; +		int err; + +		err = fuse_send_open(fc, nodeid, file, opcode, &outarg); +		if (!err) { +			ff->fh = outarg.fh; +			ff->open_flags = outarg.open_flags; + +		} else if (err != -ENOSYS || isdir) { +			fuse_file_free(ff); +			return err; +		} else { +			fc->no_open = 1; +		}  	}  	if (isdir) -		outarg.open_flags &= ~FOPEN_DIRECT_IO; +		ff->open_flags &= ~FOPEN_DIRECT_IO; -	ff->fh = outarg.fh;  	ff->nodeid = nodeid; -	ff->open_flags = outarg.open_flags;  	file->private_data = fuse_file_get(ff);  	return 0;  }  EXPORT_SYMBOL_GPL(fuse_do_open); +static void fuse_link_write_file(struct file *file) +{ +	struct inode *inode = file_inode(file); +	struct fuse_conn *fc = get_fuse_conn(inode); +	struct fuse_inode *fi = get_fuse_inode(inode); +	struct fuse_file *ff = file->private_data; +	/* +	 * file may be written through mmap, so chain it onto the +	 * inodes's write_file list +	 */ +	spin_lock(&fc->lock); +	if (list_empty(&ff->write_entry)) +		list_add(&ff->write_entry, &fi->write_files); +	spin_unlock(&fc->lock); +} +  void fuse_finish_open(struct inode *inode, struct file *file)  {  	struct fuse_file *ff = file->private_data; +	struct fuse_conn *fc = get_fuse_conn(inode);  	if (ff->open_flags & FOPEN_DIRECT_IO)  		file->f_op = &fuse_direct_io_file_operations; @@ -141,28 +215,45 @@ void fuse_finish_open(struct inode *inode, struct file *file)  		invalidate_inode_pages2(inode->i_mapping);  	if (ff->open_flags & FOPEN_NONSEEKABLE)  		nonseekable_open(inode, file); +	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { +		struct fuse_inode *fi = get_fuse_inode(inode); + +		spin_lock(&fc->lock); +		fi->attr_version = ++fc->attr_version; +		i_size_write(inode, 0); +		spin_unlock(&fc->lock); +		fuse_invalidate_attr(inode); +		if (fc->writeback_cache) +			file_update_time(file); +	} +	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) +		fuse_link_write_file(file);  }  int fuse_open_common(struct inode *inode, struct file *file, bool isdir)  {  	struct fuse_conn *fc = get_fuse_conn(inode);  	int err; - -	/* VFS checks this, but only _after_ ->open() */ -	if (file->f_flags & O_DIRECT) -		return -EINVAL; +	bool lock_inode = (file->f_flags & O_TRUNC) && +			  fc->atomic_o_trunc && +			  fc->writeback_cache;  	err = generic_file_open(inode, file);  	if (err)  		return err; +	if (lock_inode) +		mutex_lock(&inode->i_mutex); +  	err = fuse_do_open(fc, get_node_id(inode), file, isdir); -	if (err) -		return err; -	fuse_finish_open(inode, file); +	if (!err) +		fuse_finish_open(inode, file); + +	if (lock_inode) +		mutex_unlock(&inode->i_mutex); -	return 0; +	return err;  }  static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) @@ -177,7 +268,7 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)  		rb_erase(&ff->polled_node, &fc->polled_files);  	spin_unlock(&fc->lock); -	wake_up_interruptible_sync(&ff->poll_wait); +	wake_up_interruptible_all(&ff->poll_wait);  	inarg->fh = ff->fh;  	inarg->flags = flags; @@ -200,6 +291,12 @@ void fuse_release_common(struct file *file, int opcode)  	req = ff->reserved_req;  	fuse_prepare_release(ff, file->f_flags, opcode); +	if (ff->flock) { +		struct fuse_release_in *inarg = &req->misc.release.in; +		inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; +		inarg->lock_owner = fuse_lock_owner_id(ff->fc, +						       (fl_owner_t) file); +	}  	/* Hold vfsmount and dentry until release is finished */  	path_get(&file->f_path);  	req->misc.release.path = file->f_path; @@ -208,8 +305,12 @@ void fuse_release_common(struct file *file, int opcode)  	 * Normally this will send the RELEASE request, however if  	 * some asynchronous READ or WRITE requests are outstanding,  	 * the sending will be delayed. +	 * +	 * Make the release synchronous if this is a fuseblk mount, +	 * synchronous RELEASE is allowed (and desirable) in this case +	 * because the server can be trusted not to screw up.  	 */ -	fuse_file_put(ff); +	fuse_file_put(ff, ff->fc->destroy_req != NULL);  }  static int fuse_open(struct inode *inode, struct file *file) @@ -219,6 +320,12 @@ static int fuse_open(struct inode *inode, struct file *file)  static int fuse_release(struct inode *inode, struct file *file)  { +	struct fuse_conn *fc = get_fuse_conn(inode); + +	/* see fuse_vma_close() for !writeback_cache case */ +	if (fc->writeback_cache) +		write_inode_now(inode, 1); +  	fuse_release_common(file, FUSE_RELEASE);  	/* return value is ignored by VFS */ @@ -230,6 +337,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags)  	WARN_ON(atomic_read(&ff->count) > 1);  	fuse_prepare_release(ff, flags, FUSE_RELEASE);  	ff->reserved_req->force = 1; +	ff->reserved_req->background = 0;  	fuse_request_send(ff->fc, ff->reserved_req);  	fuse_put_request(ff->fc, ff->reserved_req);  	kfree(ff); @@ -259,12 +367,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)  }  /* - * Check if page is under writeback + * Check if any page in a range is under writeback   *   * This is currently done by walking the list of writepage requests   * for the inode, which can be pretty inefficient.   */ -static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, +				   pgoff_t idx_to)  {  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_inode *fi = get_fuse_inode(inode); @@ -277,7 +386,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)  		BUG_ON(req->inode != inode);  		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; -		if (curr_index == index) { +		if (idx_from < curr_index + req->num_pages && +		    curr_index <= idx_to) {  			found = true;  			break;  		} @@ -287,6 +397,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)  	return found;  } +static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ +	return fuse_range_is_writeback(inode, index, index); +} +  /*   * Wait for page writeback to be completed.   * @@ -301,9 +416,24 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)  	return 0;  } +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ +	fuse_set_nowrite(inode); +	fuse_release_nowrite(inode); +} +  static int fuse_flush(struct file *file, fl_owner_t id)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_file *ff = file->private_data;  	struct fuse_req *req; @@ -316,7 +446,15 @@ static int fuse_flush(struct file *file, fl_owner_t id)  	if (fc->no_flush)  		return 0; -	req = fuse_get_req_nofail(fc, file); +	err = write_inode_now(inode, 1); +	if (err) +		return err; + +	mutex_lock(&inode->i_mutex); +	fuse_sync_writes(inode); +	mutex_unlock(&inode->i_mutex); + +	req = fuse_get_req_nofail_nopages(fc, file);  	memset(&inarg, 0, sizeof(inarg));  	inarg.fh = ff->fh;  	inarg.lock_owner = fuse_lock_owner_id(fc, id); @@ -336,22 +474,8 @@ static int fuse_flush(struct file *file, fl_owner_t id)  	return err;  } -/* - * Wait for all pending writepages on the inode to finish. - * - * This is currently done by blocking further writes with FUSE_NOWRITE - * and waiting for all sent writes to complete. - * - * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage - * could conflict with truncation. - */ -static void fuse_sync_writes(struct inode *inode) -{ -	fuse_set_nowrite(inode); -	fuse_release_nowrite(inode); -} - -int fuse_fsync_common(struct file *file, int datasync, int isdir) +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, +		      int datasync, int isdir)  {  	struct inode *inode = file->f_mapping->host;  	struct fuse_conn *fc = get_fuse_conn(inode); @@ -363,23 +487,30 @@ int fuse_fsync_common(struct file *file, int datasync, int isdir)  	if (is_bad_inode(inode))  		return -EIO; -	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) -		return 0; +	mutex_lock(&inode->i_mutex);  	/*  	 * Start writeback against all dirty pages of the inode, then  	 * wait for all outstanding writes, before sending the FSYNC  	 * request.  	 */ -	err = write_inode_now(inode, 0); +	err = filemap_write_and_wait_range(inode->i_mapping, start, end);  	if (err) -		return err; +		goto out;  	fuse_sync_writes(inode); +	err = sync_inode_metadata(inode, 1); +	if (err) +		goto out; -	req = fuse_get_req(fc); -	if (IS_ERR(req)) -		return PTR_ERR(req); +	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) +		goto out; + +	req = fuse_get_req_nopages(fc); +	if (IS_ERR(req)) { +		err = PTR_ERR(req); +		goto out; +	}  	memset(&inarg, 0, sizeof(inarg));  	inarg.fh = ff->fh; @@ -399,12 +530,15 @@ int fuse_fsync_common(struct file *file, int datasync, int isdir)  			fc->no_fsync = 1;  		err = 0;  	} +out: +	mutex_unlock(&inode->i_mutex);  	return err;  } -static int fuse_fsync(struct file *file, int datasync) +static int fuse_fsync(struct file *file, loff_t start, loff_t end, +		      int datasync)  { -	return fuse_fsync_common(file, datasync, 0); +	return fuse_fsync_common(file, start, end, datasync, 0);  }  void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, @@ -427,9 +561,114 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,  	req->out.args[0].size = count;  } -static size_t fuse_send_read(struct fuse_req *req, struct file *file, +static void fuse_release_user_pages(struct fuse_req *req, int write) +{ +	unsigned i; + +	for (i = 0; i < req->num_pages; i++) { +		struct page *page = req->pages[i]; +		if (write) +			set_page_dirty_lock(page); +		put_page(page); +	} +} + +/** + * In case of short read, the caller sets 'pos' to the position of + * actual end of fuse request in IO request. Otherwise, if bytes_requested + * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. + * + * An example: + * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * both submitted asynchronously. The first of them was ACKed by userspace as + * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The + * second request was ACKed as short, e.g. only 1K was read, resulting in + * pos == 33K. + * + * Thus, when all fuse requests are completed, the minimal non-negative 'pos' + * will be equal to the length of the longest contiguous fragment of + * transferred data starting from the beginning of IO request. + */ +static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) +{ +	int left; + +	spin_lock(&io->lock); +	if (err) +		io->err = io->err ? : err; +	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) +		io->bytes = pos; + +	left = --io->reqs; +	spin_unlock(&io->lock); + +	if (!left) { +		long res; + +		if (io->err) +			res = io->err; +		else if (io->bytes >= 0 && io->write) +			res = -EIO; +		else { +			res = io->bytes < 0 ? io->size : io->bytes; + +			if (!is_sync_kiocb(io->iocb)) { +				struct inode *inode = file_inode(io->iocb->ki_filp); +				struct fuse_conn *fc = get_fuse_conn(inode); +				struct fuse_inode *fi = get_fuse_inode(inode); + +				spin_lock(&fc->lock); +				fi->attr_version = ++fc->attr_version; +				spin_unlock(&fc->lock); +			} +		} + +		aio_complete(io->iocb, res, 0); +		kfree(io); +	} +} + +static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +{ +	struct fuse_io_priv *io = req->io; +	ssize_t pos = -1; + +	fuse_release_user_pages(req, !io->write); + +	if (io->write) { +		if (req->misc.write.in.size != req->misc.write.out.size) +			pos = req->misc.write.in.offset - io->offset + +				req->misc.write.out.size; +	} else { +		if (req->misc.read.in.size != req->out.args[0].size) +			pos = req->misc.read.in.offset - io->offset + +				req->out.args[0].size; +	} + +	fuse_aio_complete(io, req->out.h.error, pos); +} + +static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, +		size_t num_bytes, struct fuse_io_priv *io) +{ +	spin_lock(&io->lock); +	io->size += num_bytes; +	io->reqs++; +	spin_unlock(&io->lock); + +	req->io = io; +	req->end = fuse_aio_complete_req; + +	__fuse_get_request(req); +	fuse_request_send_background(fc, req); + +	return num_bytes; +} + +static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,  			     loff_t pos, size_t count, fl_owner_t owner)  { +	struct file *file = io->file;  	struct fuse_file *ff = file->private_data;  	struct fuse_conn *fc = ff->fc; @@ -440,6 +679,10 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,  		inarg->read_flags |= FUSE_READ_LOCKOWNER;  		inarg->lock_owner = fuse_lock_owner_id(fc, owner);  	} + +	if (io->async) +		return fuse_async_req_send(fc, req, count, io); +  	fuse_request_send(fc, req);  	return req->out.args[0].size;  } @@ -451,15 +694,43 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,  	struct fuse_inode *fi = get_fuse_inode(inode);  	spin_lock(&fc->lock); -	if (attr_ver == fi->attr_version && size < inode->i_size) { +	if (attr_ver == fi->attr_version && size < inode->i_size && +	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {  		fi->attr_version = ++fc->attr_version;  		i_size_write(inode, size);  	}  	spin_unlock(&fc->lock);  } -static int fuse_readpage(struct file *file, struct page *page) +static void fuse_short_read(struct fuse_req *req, struct inode *inode, +			    u64 attr_ver)  { +	size_t num_read = req->out.args[0].size; +	struct fuse_conn *fc = get_fuse_conn(inode); + +	if (fc->writeback_cache) { +		/* +		 * A hole in a file. Some data after the hole are in page cache, +		 * but have not reached the client fs yet. So, the hole is not +		 * present there. +		 */ +		int i; +		int start_idx = num_read >> PAGE_CACHE_SHIFT; +		size_t off = num_read & (PAGE_CACHE_SIZE - 1); + +		for (i = start_idx; i < req->num_pages; i++) { +			zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE); +			off = 0; +		} +	} else { +		loff_t pos = page_offset(req->pages[0]) + num_read; +		fuse_read_update_size(inode, pos, attr_ver); +	} +} + +static int fuse_do_readpage(struct file *file, struct page *page) +{ +	struct fuse_io_priv io = { .async = 0, .file = file };  	struct inode *inode = page->mapping->host;  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_req *req; @@ -469,21 +740,16 @@ static int fuse_readpage(struct file *file, struct page *page)  	u64 attr_ver;  	int err; -	err = -EIO; -	if (is_bad_inode(inode)) -		goto out; -  	/* -	 * Page writeback can extend beyond the liftime of the +	 * Page writeback can extend beyond the lifetime of the  	 * page-cache page, so make sure we read a properly synced  	 * page.  	 */  	fuse_wait_on_page_writeback(inode, page->index); -	req = fuse_get_req(fc); -	err = PTR_ERR(req); +	req = fuse_get_req(fc, 1);  	if (IS_ERR(req)) -		goto out; +		return PTR_ERR(req);  	attr_ver = fuse_get_attr_version(fc); @@ -491,21 +757,36 @@ static int fuse_readpage(struct file *file, struct page *page)  	req->out.argpages = 1;  	req->num_pages = 1;  	req->pages[0] = page; -	num_read = fuse_send_read(req, file, pos, count, NULL); +	req->page_descs[0].length = count; +	num_read = fuse_send_read(req, &io, pos, count, NULL);  	err = req->out.h.error; -	fuse_put_request(fc, req);  	if (!err) {  		/*  		 * Short read means EOF.  If file size is larger, truncate it  		 */  		if (num_read < count) -			fuse_read_update_size(inode, pos + num_read, attr_ver); +			fuse_short_read(req, inode, attr_ver);  		SetPageUptodate(page);  	} -	fuse_invalidate_attr(inode); /* atime changed */ +	fuse_put_request(fc, req); + +	return err; +} + +static int fuse_readpage(struct file *file, struct page *page) +{ +	struct inode *inode = page->mapping->host; +	int err; + +	err = -EIO; +	if (is_bad_inode(inode)) +		goto out; + +	err = fuse_do_readpage(file, page); +	fuse_invalidate_atime(inode);   out:  	unlock_page(page);  	return err; @@ -527,14 +808,10 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)  		/*  		 * Short read means EOF. If file size is larger, truncate it  		 */ -		if (!req->out.h.error && num_read < count) { -			loff_t pos; +		if (!req->out.h.error && num_read < count) +			fuse_short_read(req, inode, req->misc.read.attr_ver); -			pos = page_offset(req->pages[0]) + num_read; -			fuse_read_update_size(inode, pos, -					      req->misc.read.attr_ver); -		} -		fuse_invalidate_attr(inode); /* atime changed */ +		fuse_invalidate_atime(inode);  	}  	for (i = 0; i < req->num_pages; i++) { @@ -547,7 +824,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)  		page_cache_release(page);  	}  	if (req->ff) -		fuse_file_put(req->ff); +		fuse_file_put(req->ff, false);  }  static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -577,6 +854,7 @@ struct fuse_fill_data {  	struct fuse_req *req;  	struct file *file;  	struct inode *inode; +	unsigned nr_pages;  };  static int fuse_readpages_fill(void *_data, struct page *page) @@ -592,16 +870,31 @@ static int fuse_readpages_fill(void *_data, struct page *page)  	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||  	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||  	     req->pages[req->num_pages - 1]->index + 1 != page->index)) { +		int nr_alloc = min_t(unsigned, data->nr_pages, +				     FUSE_MAX_PAGES_PER_REQ);  		fuse_send_readpages(req, data->file); -		data->req = req = fuse_get_req(fc); +		if (fc->async_read) +			req = fuse_get_req_for_background(fc, nr_alloc); +		else +			req = fuse_get_req(fc, nr_alloc); + +		data->req = req;  		if (IS_ERR(req)) {  			unlock_page(page);  			return PTR_ERR(req);  		}  	} + +	if (WARN_ON(req->num_pages >= req->max_pages)) { +		fuse_put_request(fc, req); +		return -EIO; +	} +  	page_cache_get(page);  	req->pages[req->num_pages] = page; +	req->page_descs[req->num_pages].length = PAGE_SIZE;  	req->num_pages++; +	data->nr_pages--;  	return 0;  } @@ -612,6 +905,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_fill_data data;  	int err; +	int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);  	err = -EIO;  	if (is_bad_inode(inode)) @@ -619,7 +913,11 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,  	data.file = file;  	data.inode = inode; -	data.req = fuse_get_req(fc); +	if (fc->async_read) +		data.req = fuse_get_req_for_background(fc, nr_alloc); +	else +		data.req = fuse_get_req(fc, nr_alloc); +	data.nr_pages = nr_pages;  	err = PTR_ERR(data.req);  	if (IS_ERR(data.req))  		goto out; @@ -635,23 +933,25 @@ out:  	return err;  } -static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, -				  unsigned long nr_segs, loff_t pos) +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)  {  	struct inode *inode = iocb->ki_filp->f_mapping->host; +	struct fuse_conn *fc = get_fuse_conn(inode); -	if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) { +	/* +	 * In auto invalidate mode, always update attributes on read. +	 * Otherwise, only update if we attempt to read past EOF (to ensure +	 * i_size is up to date). +	 */ +	if (fc->auto_inval_data || +	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {  		int err; -		/* -		 * If trying to read past EOF, make sure the i_size -		 * attribute is up-to-date. -		 */  		err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);  		if (err)  			return err;  	} -	return generic_file_aio_read(iocb, iov, nr_segs, pos); +	return generic_file_read_iter(iocb, to);  }  static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, @@ -677,9 +977,10 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,  	req->out.args[0].value = outarg;  } -static size_t fuse_send_write(struct fuse_req *req, struct file *file, +static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,  			      loff_t pos, size_t count, fl_owner_t owner)  { +	struct file *file = io->file;  	struct fuse_file *ff = file->private_data;  	struct fuse_conn *fc = ff->fc;  	struct fuse_write_in *inarg = &req->misc.write.in; @@ -690,88 +991,29 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,  		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;  		inarg->lock_owner = fuse_lock_owner_id(fc, owner);  	} -	fuse_request_send(fc, req); -	return req->misc.write.out.size; -} -static int fuse_write_begin(struct file *file, struct address_space *mapping, -			loff_t pos, unsigned len, unsigned flags, -			struct page **pagep, void **fsdata) -{ -	pgoff_t index = pos >> PAGE_CACHE_SHIFT; +	if (io->async) +		return fuse_async_req_send(fc, req, count, io); -	*pagep = grab_cache_page_write_begin(mapping, index, flags); -	if (!*pagep) -		return -ENOMEM; -	return 0; +	fuse_request_send(fc, req); +	return req->misc.write.out.size;  } -void fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_size(struct inode *inode, loff_t pos)  {  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_inode *fi = get_fuse_inode(inode); +	bool ret = false;  	spin_lock(&fc->lock);  	fi->attr_version = ++fc->attr_version; -	if (pos > inode->i_size) +	if (pos > inode->i_size) {  		i_size_write(inode, pos); -	spin_unlock(&fc->lock); -} - -static int fuse_buffered_write(struct file *file, struct inode *inode, -			       loff_t pos, unsigned count, struct page *page) -{ -	int err; -	size_t nres; -	struct fuse_conn *fc = get_fuse_conn(inode); -	unsigned offset = pos & (PAGE_CACHE_SIZE - 1); -	struct fuse_req *req; - -	if (is_bad_inode(inode)) -		return -EIO; - -	/* -	 * Make sure writepages on the same page are not mixed up with -	 * plain writes. -	 */ -	fuse_wait_on_page_writeback(inode, page->index); - -	req = fuse_get_req(fc); -	if (IS_ERR(req)) -		return PTR_ERR(req); - -	req->in.argpages = 1; -	req->num_pages = 1; -	req->pages[0] = page; -	req->page_offset = offset; -	nres = fuse_send_write(req, file, pos, count, NULL); -	err = req->out.h.error; -	fuse_put_request(fc, req); -	if (!err && !nres) -		err = -EIO; -	if (!err) { -		pos += nres; -		fuse_write_update_size(inode, pos); -		if (count == PAGE_CACHE_SIZE) -			SetPageUptodate(page); +		ret = true;  	} -	fuse_invalidate_attr(inode); -	return err ? err : nres; -} - -static int fuse_write_end(struct file *file, struct address_space *mapping, -			loff_t pos, unsigned len, unsigned copied, -			struct page *page, void *fsdata) -{ -	struct inode *inode = mapping->host; -	int res = 0; - -	if (copied) -		res = fuse_buffered_write(file, inode, pos, copied, page); +	spin_unlock(&fc->lock); -	unlock_page(page); -	page_cache_release(page); -	return res; +	return ret;  }  static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, @@ -781,13 +1023,14 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,  	size_t res;  	unsigned offset;  	unsigned i; +	struct fuse_io_priv io = { .async = 0, .file = file };  	for (i = 0; i < req->num_pages; i++)  		fuse_wait_on_page_writeback(inode, req->pages[i]->index); -	res = fuse_send_write(req, file, pos, count, NULL); +	res = fuse_send_write(req, &io, pos, count, NULL); -	offset = req->page_offset; +	offset = req->page_descs[0].offset;  	count = res;  	for (i = 0; i < req->num_pages; i++) {  		struct page *page = req->pages[i]; @@ -818,7 +1061,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,  	int err;  	req->in.argpages = 1; -	req->page_offset = offset; +	req->page_descs[0].offset = offset;  	do {  		size_t tmp; @@ -842,9 +1085,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,  		if (mapping_writably_mapped(mapping))  			flush_dcache_page(page); -		pagefault_disable();  		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); -		pagefault_enable();  		flush_dcache_page(page);  		if (!tmp) { @@ -856,6 +1097,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,  		err = 0;  		req->pages[req->num_pages] = page; +		req->page_descs[req->num_pages].length = tmp;  		req->num_pages++;  		iov_iter_advance(ii, tmp); @@ -868,28 +1110,41 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,  		if (!fc->big_writes)  			break;  	} while (iov_iter_count(ii) && count < fc->max_write && -		 req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); +		 req->num_pages < req->max_pages && offset == 0);  	return count > 0 ? count : err;  } +static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +{ +	return min_t(unsigned, +		     ((pos + len - 1) >> PAGE_CACHE_SHIFT) - +		     (pos >> PAGE_CACHE_SHIFT) + 1, +		     FUSE_MAX_PAGES_PER_REQ); +} +  static ssize_t fuse_perform_write(struct file *file,  				  struct address_space *mapping,  				  struct iov_iter *ii, loff_t pos)  {  	struct inode *inode = mapping->host;  	struct fuse_conn *fc = get_fuse_conn(inode); +	struct fuse_inode *fi = get_fuse_inode(inode);  	int err = 0;  	ssize_t res = 0;  	if (is_bad_inode(inode))  		return -EIO; +	if (inode->i_size < pos + iov_iter_count(ii)) +		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +  	do {  		struct fuse_req *req;  		ssize_t count; +		unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); -		req = fuse_get_req(fc); +		req = fuse_get_req(fc, nr_pages);  		if (IS_ERR(req)) {  			err = PTR_ERR(req);  			break; @@ -919,30 +1174,34 @@ static ssize_t fuse_perform_write(struct file *file,  	if (res > 0)  		fuse_write_update_size(inode, pos); +	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);  	fuse_invalidate_attr(inode);  	return res > 0 ? res : err;  } -static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, -				   unsigned long nr_segs, loff_t pos) +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)  {  	struct file *file = iocb->ki_filp;  	struct address_space *mapping = file->f_mapping; -	size_t count = 0; +	size_t count = iov_iter_count(from);  	ssize_t written = 0; +	ssize_t written_buffered = 0;  	struct inode *inode = mapping->host;  	ssize_t err; -	struct iov_iter i; +	loff_t endbyte = 0; +	loff_t pos = iocb->ki_pos; -	WARN_ON(iocb->ki_pos != pos); +	if (get_fuse_conn(inode)->writeback_cache) { +		/* Update size (EOF optimization) and mode (SUID clearing) */ +		err = fuse_update_attributes(mapping->host, NULL, file, NULL); +		if (err) +			return err; -	err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); -	if (err) -		return err; +		return generic_file_write_iter(iocb, from); +	}  	mutex_lock(&inode->i_mutex); -	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);  	/* We can write back this queue in page reclaim */  	current->backing_dev_info = mapping->backing_dev_info; @@ -954,17 +1213,45 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,  	if (count == 0)  		goto out; +	iov_iter_truncate(from, count);  	err = file_remove_suid(file);  	if (err)  		goto out; -	file_update_time(file); +	err = file_update_time(file); +	if (err) +		goto out; + +	if (file->f_flags & O_DIRECT) { +		written = generic_file_direct_write(iocb, from, pos); +		if (written < 0 || !iov_iter_count(from)) +			goto out; + +		pos += written; + +		written_buffered = fuse_perform_write(file, mapping, from, pos); +		if (written_buffered < 0) { +			err = written_buffered; +			goto out; +		} +		endbyte = pos + written_buffered - 1; + +		err = filemap_write_and_wait_range(file->f_mapping, pos, +						   endbyte); +		if (err) +			goto out; -	iov_iter_init(&i, iov, nr_segs, count, 0); -	written = fuse_perform_write(file, mapping, &i, pos); -	if (written >= 0) -		iocb->ki_pos = pos + written; +		invalidate_mapping_pages(file->f_mapping, +					 pos >> PAGE_CACHE_SHIFT, +					 endbyte >> PAGE_CACHE_SHIFT); +		written += written_buffered; +		iocb->ki_pos = pos + written_buffered; +	} else { +		written = fuse_perform_write(file, mapping, from, pos); +		if (written >= 0) +			iocb->ki_pos = pos + written; +	}  out:  	current->backing_dev_info = NULL;  	mutex_unlock(&inode->i_mutex); @@ -972,87 +1259,135 @@ out:  	return written ? written : err;  } -static void fuse_release_user_pages(struct fuse_req *req, int write) +static inline void fuse_page_descs_length_init(struct fuse_req *req, +		unsigned index, unsigned nr_pages)  { -	unsigned i; +	int i; -	for (i = 0; i < req->num_pages; i++) { -		struct page *page = req->pages[i]; -		if (write) -			set_page_dirty_lock(page); -		put_page(page); -	} +	for (i = index; i < index + nr_pages; i++) +		req->page_descs[i].length = PAGE_SIZE - +			req->page_descs[i].offset; +} + +static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) +{ +	return (unsigned long)ii->iov->iov_base + ii->iov_offset; +} + +static inline size_t fuse_get_frag_size(const struct iov_iter *ii, +					size_t max_size) +{ +	return min(iov_iter_single_seg_count(ii), max_size);  } -static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, +static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,  			       size_t *nbytesp, int write)  { -	size_t nbytes = *nbytesp; -	unsigned long user_addr = (unsigned long) buf; -	unsigned offset = user_addr & ~PAGE_MASK; -	int npages; +	size_t nbytes = 0;  /* # bytes already packed in req */  	/* Special case for kernel I/O: can copy directly into the buffer */ -	if (segment_eq(get_fs(), KERNEL_DS)) { +	if (ii->type & ITER_KVEC) { +		unsigned long user_addr = fuse_get_user_addr(ii); +		size_t frag_size = fuse_get_frag_size(ii, *nbytesp); +  		if (write)  			req->in.args[1].value = (void *) user_addr;  		else  			req->out.args[0].value = (void *) user_addr; +		iov_iter_advance(ii, frag_size); +		*nbytesp = frag_size;  		return 0;  	} -	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); -	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; -	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); -	npages = get_user_pages_fast(user_addr, npages, !write, req->pages); -	if (npages < 0) -		return npages; +	while (nbytes < *nbytesp && req->num_pages < req->max_pages) { +		unsigned npages; +		size_t start; +		unsigned n = req->max_pages - req->num_pages; +		ssize_t ret = iov_iter_get_pages(ii, +					&req->pages[req->num_pages], +					n * PAGE_SIZE, &start); +		if (ret < 0) +			return ret; + +		iov_iter_advance(ii, ret); +		nbytes += ret; + +		ret += start; +		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; -	req->num_pages = npages; -	req->page_offset = offset; +		req->page_descs[req->num_pages].offset = start; +		fuse_page_descs_length_init(req, req->num_pages, npages); + +		req->num_pages += npages; +		req->page_descs[req->num_pages - 1].length -= +			(PAGE_SIZE - ret) & (PAGE_SIZE - 1); +	}  	if (write)  		req->in.argpages = 1;  	else  		req->out.argpages = 1; -	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; -	*nbytesp = min(*nbytesp, nbytes); +	*nbytesp = nbytes;  	return 0;  } -ssize_t fuse_direct_io(struct file *file, const char __user *buf, -		       size_t count, loff_t *ppos, int write) +static inline int fuse_iter_npages(const struct iov_iter *ii_p)  { +	return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); +} + +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, +		       loff_t *ppos, int flags) +{ +	int write = flags & FUSE_DIO_WRITE; +	int cuse = flags & FUSE_DIO_CUSE; +	struct file *file = io->file; +	struct inode *inode = file->f_mapping->host;  	struct fuse_file *ff = file->private_data;  	struct fuse_conn *fc = ff->fc;  	size_t nmax = write ? fc->max_write : fc->max_read;  	loff_t pos = *ppos; +	size_t count = iov_iter_count(iter); +	pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT; +	pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;  	ssize_t res = 0;  	struct fuse_req *req; -	req = fuse_get_req(fc); +	if (io->async) +		req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); +	else +		req = fuse_get_req(fc, fuse_iter_npages(iter));  	if (IS_ERR(req))  		return PTR_ERR(req); +	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { +		if (!write) +			mutex_lock(&inode->i_mutex); +		fuse_sync_writes(inode); +		if (!write) +			mutex_unlock(&inode->i_mutex); +	} +  	while (count) {  		size_t nres;  		fl_owner_t owner = current->files;  		size_t nbytes = min(count, nmax); -		int err = fuse_get_user_pages(req, buf, &nbytes, write); +		int err = fuse_get_user_pages(req, iter, &nbytes, write);  		if (err) {  			res = err;  			break;  		}  		if (write) -			nres = fuse_send_write(req, file, pos, nbytes, owner); +			nres = fuse_send_write(req, io, pos, nbytes, owner);  		else -			nres = fuse_send_read(req, file, pos, nbytes, owner); +			nres = fuse_send_read(req, io, pos, nbytes, owner); -		fuse_release_user_pages(req, !write); +		if (!io->async) +			fuse_release_user_pages(req, !write);  		if (req->out.h.error) {  			if (!res)  				res = req->out.h.error; @@ -1064,12 +1399,15 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,  		count -= nres;  		res += nres;  		pos += nres; -		buf += nres;  		if (nres != nbytes)  			break;  		if (count) {  			fuse_put_request(fc, req); -			req = fuse_get_req(fc); +			if (io->async) +				req = fuse_get_req_for_background(fc, +					fuse_iter_npages(iter)); +			else +				req = fuse_get_req(fc, fuse_iter_npages(iter));  			if (IS_ERR(req))  				break;  		} @@ -1083,16 +1421,49 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,  }  EXPORT_SYMBOL_GPL(fuse_direct_io); -static ssize_t fuse_direct_read(struct file *file, char __user *buf, -				     size_t count, loff_t *ppos) +static ssize_t __fuse_direct_read(struct fuse_io_priv *io, +				  struct iov_iter *iter, +				  loff_t *ppos)  {  	ssize_t res; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct file *file = io->file; +	struct inode *inode = file_inode(file);  	if (is_bad_inode(inode))  		return -EIO; -	res = fuse_direct_io(file, buf, count, ppos, 0); +	res = fuse_direct_io(io, iter, ppos, 0); + +	fuse_invalidate_attr(inode); + +	return res; +} + +static ssize_t fuse_direct_read(struct file *file, char __user *buf, +				     size_t count, loff_t *ppos) +{ +	struct fuse_io_priv io = { .async = 0, .file = file }; +	struct iovec iov = { .iov_base = buf, .iov_len = count }; +	struct iov_iter ii; +	iov_iter_init(&ii, READ, &iov, 1, count); +	return __fuse_direct_read(&io, &ii, ppos); +} + +static ssize_t __fuse_direct_write(struct fuse_io_priv *io, +				   struct iov_iter *iter, +				   loff_t *ppos) +{ +	struct file *file = io->file; +	struct inode *inode = file_inode(file); +	size_t count = iov_iter_count(iter); +	ssize_t res; + + +	res = generic_write_checks(file, ppos, &count, 0); +	if (!res) { +		iov_iter_truncate(iter, count); +		res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE); +	}  	fuse_invalidate_attr(inode); @@ -1102,31 +1473,35 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf,  static ssize_t fuse_direct_write(struct file *file, const char __user *buf,  				 size_t count, loff_t *ppos)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; +	struct inode *inode = file_inode(file);  	ssize_t res; +	struct fuse_io_priv io = { .async = 0, .file = file }; +	struct iov_iter ii; +	iov_iter_init(&ii, WRITE, &iov, 1, count);  	if (is_bad_inode(inode))  		return -EIO;  	/* Don't allow parallel writes to the same file */  	mutex_lock(&inode->i_mutex); -	res = generic_write_checks(file, ppos, &count, 0); -	if (!res) { -		res = fuse_direct_io(file, buf, count, ppos, 1); -		if (res > 0) -			fuse_write_update_size(inode, *ppos); -	} +	res = __fuse_direct_write(&io, &ii, ppos); +	if (res > 0) +		fuse_write_update_size(inode, *ppos);  	mutex_unlock(&inode->i_mutex); -	fuse_invalidate_attr(inode); -  	return res;  }  static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)  { -	__free_page(req->pages[0]); -	fuse_file_put(req->ff); +	int i; + +	for (i = 0; i < req->num_pages; i++) +		__free_page(req->pages[i]); + +	if (req->ff) +		fuse_file_put(req->ff, false);  }  static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1134,30 +1509,34 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)  	struct inode *inode = req->inode;  	struct fuse_inode *fi = get_fuse_inode(inode);  	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; +	int i;  	list_del(&req->writepages_entry); -	dec_bdi_stat(bdi, BDI_WRITEBACK); -	dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); -	bdi_writeout_inc(bdi); +	for (i = 0; i < req->num_pages; i++) { +		dec_bdi_stat(bdi, BDI_WRITEBACK); +		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); +		bdi_writeout_inc(bdi); +	}  	wake_up(&fi->page_waitq);  }  /* Called under fc->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, +				loff_t size)  __releases(fc->lock)  __acquires(fc->lock)  {  	struct fuse_inode *fi = get_fuse_inode(req->inode); -	loff_t size = i_size_read(req->inode);  	struct fuse_write_in *inarg = &req->misc.write.in; +	__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;  	if (!fc->connected)  		goto out_free; -	if (inarg->offset + PAGE_CACHE_SIZE <= size) { -		inarg->size = PAGE_CACHE_SIZE; +	if (inarg->offset + data_size <= size) { +		inarg->size = data_size;  	} else if (inarg->offset < size) { -		inarg->size = size & (PAGE_CACHE_SIZE - 1); +		inarg->size = size - inarg->offset;  	} else {  		/* Got truncated off completely */  		goto out_free; @@ -1188,12 +1567,13 @@ __acquires(fc->lock)  {  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_inode *fi = get_fuse_inode(inode); +	size_t crop = i_size_read(inode);  	struct fuse_req *req;  	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {  		req = list_entry(fi->queued_writes.next, struct fuse_req, list);  		list_del_init(&req->list); -		fuse_send_writepage(fc, req); +		fuse_send_writepage(fc, req, crop);  	}  } @@ -1204,12 +1584,85 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)  	mapping_set_error(inode->i_mapping, req->out.h.error);  	spin_lock(&fc->lock); +	while (req->misc.write.next) { +		struct fuse_conn *fc = get_fuse_conn(inode); +		struct fuse_write_in *inarg = &req->misc.write.in; +		struct fuse_req *next = req->misc.write.next; +		req->misc.write.next = next->misc.write.next; +		next->misc.write.next = NULL; +		next->ff = fuse_file_get(req->ff); +		list_add(&next->writepages_entry, &fi->writepages); + +		/* +		 * Skip fuse_flush_writepages() to make it easy to crop requests +		 * based on primary request size. +		 * +		 * 1st case (trivial): there are no concurrent activities using +		 * fuse_set/release_nowrite.  Then we're on safe side because +		 * fuse_flush_writepages() would call fuse_send_writepage() +		 * anyway. +		 * +		 * 2nd case: someone called fuse_set_nowrite and it is waiting +		 * now for completion of all in-flight requests.  This happens +		 * rarely and no more than once per page, so this should be +		 * okay. +		 * +		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle +		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact +		 * that fuse_set_nowrite returned implies that all in-flight +		 * requests were completed along with all of their secondary +		 * requests.  Further primary requests are blocked by negative +		 * writectr.  Hence there cannot be any in-flight requests and +		 * no invocations of fuse_writepage_end() while we're in +		 * fuse_set_nowrite..fuse_release_nowrite section. +		 */ +		fuse_send_writepage(fc, next, inarg->offset + inarg->size); +	}  	fi->writectr--;  	fuse_writepage_finish(fc, req);  	spin_unlock(&fc->lock);  	fuse_writepage_free(fc, req);  } +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, +					       struct fuse_inode *fi) +{ +	struct fuse_file *ff = NULL; + +	spin_lock(&fc->lock); +	if (!list_empty(&fi->write_files)) { +		ff = list_entry(fi->write_files.next, struct fuse_file, +				write_entry); +		fuse_file_get(ff); +	} +	spin_unlock(&fc->lock); + +	return ff; +} + +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, +					     struct fuse_inode *fi) +{ +	struct fuse_file *ff = __fuse_write_file_get(fc, fi); +	WARN_ON(!ff); +	return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ +	struct fuse_conn *fc = get_fuse_conn(inode); +	struct fuse_inode *fi = get_fuse_inode(inode); +	struct fuse_file *ff; +	int err; + +	ff = __fuse_write_file_get(fc, fi); +	err = fuse_flush_times(inode, ff); +	if (ff) +		fuse_file_put(ff, 0); + +	return err; +} +  static int fuse_writepage_locked(struct page *page)  {  	struct address_space *mapping = page->mapping; @@ -1217,39 +1670,40 @@ static int fuse_writepage_locked(struct page *page)  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_inode *fi = get_fuse_inode(inode);  	struct fuse_req *req; -	struct fuse_file *ff;  	struct page *tmp_page; +	int error = -ENOMEM;  	set_page_writeback(page); -	req = fuse_request_alloc_nofs(); +	req = fuse_request_alloc_nofs(1);  	if (!req)  		goto err; +	req->background = 1; /* writeback always goes to bg_queue */  	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);  	if (!tmp_page)  		goto err_free; -	spin_lock(&fc->lock); -	BUG_ON(list_empty(&fi->write_files)); -	ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); -	req->ff = fuse_file_get(ff); -	spin_unlock(&fc->lock); +	error = -EIO; +	req->ff = fuse_write_file_get(fc, fi); +	if (!req->ff) +		goto err_nofile; -	fuse_write_fill(req, ff, page_offset(page), 0); +	fuse_write_fill(req, req->ff, page_offset(page), 0);  	copy_highpage(tmp_page, page);  	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; +	req->misc.write.next = NULL;  	req->in.argpages = 1;  	req->num_pages = 1;  	req->pages[0] = tmp_page; -	req->page_offset = 0; +	req->page_descs[0].offset = 0; +	req->page_descs[0].length = PAGE_SIZE;  	req->end = fuse_writepage_end;  	req->inode = inode;  	inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);  	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); -	end_page_writeback(page);  	spin_lock(&fc->lock);  	list_add(&req->writepages_entry, &fi->writepages); @@ -1257,25 +1711,344 @@ static int fuse_writepage_locked(struct page *page)  	fuse_flush_writepages(inode);  	spin_unlock(&fc->lock); +	end_page_writeback(page); +  	return 0; +err_nofile: +	__free_page(tmp_page);  err_free:  	fuse_request_free(req);  err:  	end_page_writeback(page); -	return -ENOMEM; +	return error;  }  static int fuse_writepage(struct page *page, struct writeback_control *wbc)  {  	int err; +	if (fuse_page_is_writeback(page->mapping->host, page->index)) { +		/* +		 * ->writepages() should be called for sync() and friends.  We +		 * should only get here on direct reclaim and then we are +		 * allowed to skip a page which is already in flight +		 */ +		WARN_ON(wbc->sync_mode == WB_SYNC_ALL); + +		redirty_page_for_writepage(wbc, page); +		return 0; +	} +  	err = fuse_writepage_locked(page);  	unlock_page(page);  	return err;  } +struct fuse_fill_wb_data { +	struct fuse_req *req; +	struct fuse_file *ff; +	struct inode *inode; +	struct page **orig_pages; +}; + +static void fuse_writepages_send(struct fuse_fill_wb_data *data) +{ +	struct fuse_req *req = data->req; +	struct inode *inode = data->inode; +	struct fuse_conn *fc = get_fuse_conn(inode); +	struct fuse_inode *fi = get_fuse_inode(inode); +	int num_pages = req->num_pages; +	int i; + +	req->ff = fuse_file_get(data->ff); +	spin_lock(&fc->lock); +	list_add_tail(&req->list, &fi->queued_writes); +	fuse_flush_writepages(inode); +	spin_unlock(&fc->lock); + +	for (i = 0; i < num_pages; i++) +		end_page_writeback(data->orig_pages[i]); +} + +static bool fuse_writepage_in_flight(struct fuse_req *new_req, +				     struct page *page) +{ +	struct fuse_conn *fc = get_fuse_conn(new_req->inode); +	struct fuse_inode *fi = get_fuse_inode(new_req->inode); +	struct fuse_req *tmp; +	struct fuse_req *old_req; +	bool found = false; +	pgoff_t curr_index; + +	BUG_ON(new_req->num_pages != 0); + +	spin_lock(&fc->lock); +	list_del(&new_req->writepages_entry); +	list_for_each_entry(old_req, &fi->writepages, writepages_entry) { +		BUG_ON(old_req->inode != new_req->inode); +		curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT; +		if (curr_index <= page->index && +		    page->index < curr_index + old_req->num_pages) { +			found = true; +			break; +		} +	} +	if (!found) { +		list_add(&new_req->writepages_entry, &fi->writepages); +		goto out_unlock; +	} + +	new_req->num_pages = 1; +	for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { +		BUG_ON(tmp->inode != new_req->inode); +		curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT; +		if (tmp->num_pages == 1 && +		    curr_index == page->index) { +			old_req = tmp; +		} +	} + +	if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || +					old_req->state == FUSE_REQ_PENDING)) { +		struct backing_dev_info *bdi = page->mapping->backing_dev_info; + +		copy_highpage(old_req->pages[0], page); +		spin_unlock(&fc->lock); + +		dec_bdi_stat(bdi, BDI_WRITEBACK); +		dec_zone_page_state(page, NR_WRITEBACK_TEMP); +		bdi_writeout_inc(bdi); +		fuse_writepage_free(fc, new_req); +		fuse_request_free(new_req); +		goto out; +	} else { +		new_req->misc.write.next = old_req->misc.write.next; +		old_req->misc.write.next = new_req; +	} +out_unlock: +	spin_unlock(&fc->lock); +out: +	return found; +} + +static int fuse_writepages_fill(struct page *page, +		struct writeback_control *wbc, void *_data) +{ +	struct fuse_fill_wb_data *data = _data; +	struct fuse_req *req = data->req; +	struct inode *inode = data->inode; +	struct fuse_conn *fc = get_fuse_conn(inode); +	struct page *tmp_page; +	bool is_writeback; +	int err; + +	if (!data->ff) { +		err = -EIO; +		data->ff = fuse_write_file_get(fc, get_fuse_inode(inode)); +		if (!data->ff) +			goto out_unlock; +	} + +	/* +	 * Being under writeback is unlikely but possible.  For example direct +	 * read to an mmaped fuse file will set the page dirty twice; once when +	 * the pages are faulted with get_user_pages(), and then after the read +	 * completed. +	 */ +	is_writeback = fuse_page_is_writeback(inode, page->index); + +	if (req && req->num_pages && +	    (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || +	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write || +	     data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { +		fuse_writepages_send(data); +		data->req = NULL; +	} +	err = -ENOMEM; +	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); +	if (!tmp_page) +		goto out_unlock; + +	/* +	 * The page must not be redirtied until the writeout is completed +	 * (i.e. userspace has sent a reply to the write request).  Otherwise +	 * there could be more than one temporary page instance for each real +	 * page. +	 * +	 * This is ensured by holding the page lock in page_mkwrite() while +	 * checking fuse_page_is_writeback().  We already hold the page lock +	 * since clear_page_dirty_for_io() and keep it held until we add the +	 * request to the fi->writepages list and increment req->num_pages. +	 * After this fuse_page_is_writeback() will indicate that the page is +	 * under writeback, so we can release the page lock. +	 */ +	if (data->req == NULL) { +		struct fuse_inode *fi = get_fuse_inode(inode); + +		err = -ENOMEM; +		req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); +		if (!req) { +			__free_page(tmp_page); +			goto out_unlock; +		} + +		fuse_write_fill(req, data->ff, page_offset(page), 0); +		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; +		req->misc.write.next = NULL; +		req->in.argpages = 1; +		req->background = 1; +		req->num_pages = 0; +		req->end = fuse_writepage_end; +		req->inode = inode; + +		spin_lock(&fc->lock); +		list_add(&req->writepages_entry, &fi->writepages); +		spin_unlock(&fc->lock); + +		data->req = req; +	} +	set_page_writeback(page); + +	copy_highpage(tmp_page, page); +	req->pages[req->num_pages] = tmp_page; +	req->page_descs[req->num_pages].offset = 0; +	req->page_descs[req->num_pages].length = PAGE_SIZE; + +	inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); +	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + +	err = 0; +	if (is_writeback && fuse_writepage_in_flight(req, page)) { +		end_page_writeback(page); +		data->req = NULL; +		goto out_unlock; +	} +	data->orig_pages[req->num_pages] = page; + +	/* +	 * Protected by fc->lock against concurrent access by +	 * fuse_page_is_writeback(). +	 */ +	spin_lock(&fc->lock); +	req->num_pages++; +	spin_unlock(&fc->lock); + +out_unlock: +	unlock_page(page); + +	return err; +} + +static int fuse_writepages(struct address_space *mapping, +			   struct writeback_control *wbc) +{ +	struct inode *inode = mapping->host; +	struct fuse_fill_wb_data data; +	int err; + +	err = -EIO; +	if (is_bad_inode(inode)) +		goto out; + +	data.inode = inode; +	data.req = NULL; +	data.ff = NULL; + +	err = -ENOMEM; +	data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, +				  sizeof(struct page *), +				  GFP_NOFS); +	if (!data.orig_pages) +		goto out; + +	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); +	if (data.req) { +		/* Ignore errors if we can write at least one page */ +		BUG_ON(!data.req->num_pages); +		fuse_writepages_send(&data); +		err = 0; +	} +	if (data.ff) +		fuse_file_put(data.ff, false); + +	kfree(data.orig_pages); +out: +	return err; +} + +/* + * It's worthy to make sure that space is reserved on disk for the write, + * but how to implement it without killing performance need more thinking. + */ +static int fuse_write_begin(struct file *file, struct address_space *mapping, +		loff_t pos, unsigned len, unsigned flags, +		struct page **pagep, void **fsdata) +{ +	pgoff_t index = pos >> PAGE_CACHE_SHIFT; +	struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode); +	struct page *page; +	loff_t fsize; +	int err = -ENOMEM; + +	WARN_ON(!fc->writeback_cache); + +	page = grab_cache_page_write_begin(mapping, index, flags); +	if (!page) +		goto error; + +	fuse_wait_on_page_writeback(mapping->host, page->index); + +	if (PageUptodate(page) || len == PAGE_CACHE_SIZE) +		goto success; +	/* +	 * Check if the start this page comes after the end of file, in which +	 * case the readpage can be optimized away. +	 */ +	fsize = i_size_read(mapping->host); +	if (fsize <= (pos & PAGE_CACHE_MASK)) { +		size_t off = pos & ~PAGE_CACHE_MASK; +		if (off) +			zero_user_segment(page, 0, off); +		goto success; +	} +	err = fuse_do_readpage(file, page); +	if (err) +		goto cleanup; +success: +	*pagep = page; +	return 0; + +cleanup: +	unlock_page(page); +	page_cache_release(page); +error: +	return err; +} + +static int fuse_write_end(struct file *file, struct address_space *mapping, +		loff_t pos, unsigned len, unsigned copied, +		struct page *page, void *fsdata) +{ +	struct inode *inode = page->mapping->host; + +	if (!PageUptodate(page)) { +		/* Zero any unwritten bytes at the end of the page */ +		size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK; +		if (endoff) +			zero_user_segment(page, endoff, PAGE_CACHE_SIZE); +		SetPageUptodate(page); +	} + +	fuse_write_update_size(inode, pos + copied); +	set_page_dirty(page); +	unlock_page(page); +	page_cache_release(page); + +	return copied; +} +  static int fuse_launder_page(struct page *page)  {  	int err = 0; @@ -1315,38 +2088,32 @@ static void fuse_vma_close(struct vm_area_struct *vma)  static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct page *page = vmf->page; -	/* -	 * Don't use page->mapping as it may become NULL from a -	 * concurrent truncate. -	 */ -	struct inode *inode = vma->vm_file->f_mapping->host; +	struct inode *inode = file_inode(vma->vm_file); + +	file_update_time(vma->vm_file); +	lock_page(page); +	if (page->mapping != inode->i_mapping) { +		unlock_page(page); +		return VM_FAULT_NOPAGE; +	}  	fuse_wait_on_page_writeback(inode, page->index); -	return 0; +	return VM_FAULT_LOCKED;  }  static const struct vm_operations_struct fuse_file_vm_ops = {  	.close		= fuse_vma_close,  	.fault		= filemap_fault, +	.map_pages	= filemap_map_pages,  	.page_mkwrite	= fuse_page_mkwrite, +	.remap_pages	= generic_file_remap_pages,  };  static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)  { -	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { -		struct inode *inode = file->f_dentry->d_inode; -		struct fuse_conn *fc = get_fuse_conn(inode); -		struct fuse_inode *fi = get_fuse_inode(inode); -		struct fuse_file *ff = file->private_data; -		/* -		 * file may be written through mmap, so chain it onto the -		 * inodes's write_file list -		 */ -		spin_lock(&fc->lock); -		if (list_empty(&ff->write_entry)) -			list_add(&ff->write_entry, &fi->write_files); -		spin_unlock(&fc->lock); -	} +	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) +		fuse_link_write_file(file); +  	file_accessed(file);  	vma->vm_ops = &fuse_file_vm_ops;  	return 0; @@ -1392,7 +2159,7 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,  			 const struct file_lock *fl, int opcode, pid_t pid,  			 int flock)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_file *ff = file->private_data;  	struct fuse_lk_in *arg = &req->misc.lk_in; @@ -1414,13 +2181,13 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,  static int fuse_getlk(struct file *file, struct file_lock *fl)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_req *req;  	struct fuse_lk_out outarg;  	int err; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1439,14 +2206,14 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)  static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_req *req;  	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;  	pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;  	int err; -	if (fl->fl_lmops && fl->fl_lmops->fl_grant) { +	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {  		/* NLM needs asynchronous locks, which we don't support yet */  		return -ENOLCK;  	} @@ -1455,7 +2222,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)  	if (fl->fl_flags & FL_CLOSE)  		return 0; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -1471,7 +2238,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)  static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	int err; @@ -1494,15 +2261,17 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)  static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)  { -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode);  	int err; -	if (fc->no_lock) { +	if (fc->no_flock) {  		err = flock_lock_file_wait(file, fl);  	} else { +		struct fuse_file *ff = file->private_data; +  		/* emulate flock with POSIX locks */ -		fl->fl_owner = (fl_owner_t) file; +		ff->flock = true;  		err = fuse_setlk(file, fl, 1);  	} @@ -1521,7 +2290,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)  	if (!inode->i_sb->s_bdev || fc->no_bmap)  		return 0; -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return 0; @@ -1545,32 +2314,21 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)  	return err ? 0 : outarg.block;  } -static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)  {  	loff_t retval; -	struct inode *inode = file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(file); + +	/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ +	if (whence == SEEK_CUR || whence == SEEK_SET) +		return generic_file_llseek(file, offset, whence);  	mutex_lock(&inode->i_mutex); -	switch (origin) { -	case SEEK_END: -		retval = fuse_update_attributes(inode, NULL, file, NULL); -		if (retval) -			goto exit; -		offset += i_size_read(inode); -		break; -	case SEEK_CUR: -		offset += file->f_pos; -	} -	retval = -EINVAL; -	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { -		if (offset != file->f_pos) { -			file->f_pos = offset; -			file->f_version = 0; -		} -		retval = offset; -	} -exit: +	retval = fuse_update_attributes(inode, NULL, file, NULL); +	if (!retval) +		retval = generic_file_llseek(file, offset, whence);  	mutex_unlock(&inode->i_mutex); +  	return retval;  } @@ -1583,7 +2341,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,  	if (!bytes)  		return 0; -	iov_iter_init(&ii, iov, nr_segs, bytes, 0); +	iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);  	while (iov_iter_count(&ii)) {  		struct page *page = pages[page_idx++]; @@ -1618,6 +2376,94 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,  }  /* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit.  Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, +				     size_t transferred, unsigned count, +				     bool is_compat) +{ +#ifdef CONFIG_COMPAT +	if (count * sizeof(struct compat_iovec) == transferred) { +		struct compat_iovec *ciov = src; +		unsigned i; + +		/* +		 * With this interface a 32bit server cannot support +		 * non-compat (i.e. ones coming from 64bit apps) ioctl +		 * requests +		 */ +		if (!is_compat) +			return -EINVAL; + +		for (i = 0; i < count; i++) { +			dst[i].iov_base = compat_ptr(ciov[i].iov_base); +			dst[i].iov_len = ciov[i].iov_len; +		} +		return 0; +	} +#endif + +	if (count * sizeof(struct iovec) != transferred) +		return -EIO; + +	memcpy(dst, src, transferred); +	return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ +	size_t n; +	u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + +	for (n = 0; n < count; n++, iov++) { +		if (iov->iov_len > (size_t) max) +			return -ENOMEM; +		max -= iov->iov_len; +	} +	return 0; +} + +static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, +				 void *src, size_t transferred, unsigned count, +				 bool is_compat) +{ +	unsigned i; +	struct fuse_ioctl_iovec *fiov = src; + +	if (fc->minor < 16) { +		return fuse_copy_ioctl_iovec_old(dst, src, transferred, +						 count, is_compat); +	} + +	if (count * sizeof(struct fuse_ioctl_iovec) != transferred) +		return -EIO; + +	for (i = 0; i < count; i++) { +		/* Did the server supply an inappropriate value? */ +		if (fiov[i].base != (unsigned long) fiov[i].base || +		    fiov[i].len != (unsigned long) fiov[i].len) +			return -EIO; + +		dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; +		dst[i].iov_len = (size_t) fiov[i].len; + +#ifdef CONFIG_COMPAT +		if (is_compat && +		    (ptr_to_compat(dst[i].iov_base) != fiov[i].base || +		     (compat_size_t) dst[i].iov_len != fiov[i].len)) +			return -EIO; +#endif +	} + +	return 0; +} + + +/*   * For ioctls, there is no generic way to determine how much memory   * needs to be read and/or written.  Furthermore, ioctls are allowed   * to dereference the passed pointer, so the parameter requires deep @@ -1677,18 +2523,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  	struct fuse_ioctl_out outarg;  	struct fuse_req *req = NULL;  	struct page **pages = NULL; -	struct page *iov_page = NULL; +	struct iovec *iov_page = NULL;  	struct iovec *in_iov = NULL, *out_iov = NULL;  	unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;  	size_t in_size, out_size, transferred;  	int err; +#if BITS_PER_LONG == 32 +	inarg.flags |= FUSE_IOCTL_32BIT; +#else +	if (flags & FUSE_IOCTL_COMPAT) +		inarg.flags |= FUSE_IOCTL_32BIT; +#endif +  	/* assume all the iovs returned by client always fits in a page */ -	BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); +	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);  	err = -ENOMEM; -	pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); -	iov_page = alloc_page(GFP_KERNEL); +	pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); +	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);  	if (!pages || !iov_page)  		goto out; @@ -1697,7 +2550,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  	 * RETRY from server is not allowed.  	 */  	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { -		struct iovec *iov = page_address(iov_page); +		struct iovec *iov = iov_page;  		iov->iov_base = (void __user *)arg;  		iov->iov_len = _IOC_SIZE(cmd); @@ -1735,7 +2588,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  		num_pages++;  	} -	req = fuse_get_req(fc); +	req = fuse_get_req(fc, num_pages);  	if (IS_ERR(req)) {  		err = PTR_ERR(req);  		req = NULL; @@ -1743,6 +2596,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  	}  	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);  	req->num_pages = num_pages; +	fuse_page_descs_length_init(req, 0, req->num_pages);  	/* okay, let's send it to the client */  	req->in.h.opcode = FUSE_IOCTL; @@ -1778,7 +2632,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  	/* did it ask for retry? */  	if (outarg.flags & FUSE_IOCTL_RETRY) { -		char *vaddr; +		void *vaddr;  		/* no retry if in restricted mode */  		err = -EIO; @@ -1798,18 +2652,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)  			goto out; -		err = -EIO; -		if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) +		vaddr = kmap_atomic(pages[0]); +		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, +					    transferred, in_iovs + out_iovs, +					    (flags & FUSE_IOCTL_COMPAT) != 0); +		kunmap_atomic(vaddr); +		if (err)  			goto out; -		/* okay, copy in iovs and retry */ -		vaddr = kmap_atomic(pages[0], KM_USER0); -		memcpy(page_address(iov_page), vaddr, transferred); -		kunmap_atomic(vaddr, KM_USER0); - -		in_iov = page_address(iov_page); +		in_iov = iov_page;  		out_iov = in_iov + in_iovs; +		err = fuse_verify_ioctl_iov(in_iov, in_iovs); +		if (err) +			goto out; + +		err = fuse_verify_ioctl_iov(out_iov, out_iovs); +		if (err) +			goto out; +  		goto retry;  	} @@ -1821,8 +2682,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,   out:  	if (req)  		fuse_put_request(fc, req); -	if (iov_page) -		__free_page(iov_page); +	free_page((unsigned long) iov_page);  	while (num_pages)  		__free_page(pages[--num_pages]);  	kfree(pages); @@ -1831,13 +2691,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  }  EXPORT_SYMBOL_GPL(fuse_do_ioctl); -static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, -				   unsigned long arg, unsigned int flags) +long fuse_ioctl_common(struct file *file, unsigned int cmd, +		       unsigned long arg, unsigned int flags)  { -	struct inode *inode = file->f_dentry->d_inode; +	struct inode *inode = file_inode(file);  	struct fuse_conn *fc = get_fuse_conn(inode); -	if (!fuse_allow_task(fc, current)) +	if (!fuse_allow_current_process(fc))  		return -EACCES;  	if (is_bad_inode(inode)) @@ -1849,13 +2709,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,  static long fuse_file_ioctl(struct file *file, unsigned int cmd,  			    unsigned long arg)  { -	return fuse_file_ioctl_common(file, cmd, arg, 0); +	return fuse_ioctl_common(file, cmd, arg, 0);  }  static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,  				   unsigned long arg)  { -	return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); +	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);  }  /* @@ -1899,7 +2759,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,  {  	spin_lock(&fc->lock);  	if (RB_EMPTY_NODE(&ff->polled_node)) { -		struct rb_node **link, *parent; +		struct rb_node **link, *uninitialized_var(parent);  		link = fuse_find_polled_node(fc, ff->kh, &parent);  		BUG_ON(*link); @@ -1922,6 +2782,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)  		return DEFAULT_POLLMASK;  	poll_wait(file, &ff->poll_wait, wait); +	inarg.events = (__u32)poll_requested_events(wait);  	/*  	 * Ask for notification iff there's someone waiting for it. @@ -1932,7 +2793,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)  		fuse_register_polled_file(fc, ff);  	} -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return POLLERR; @@ -1982,12 +2843,198 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,  	return 0;  } +static void fuse_do_truncate(struct file *file) +{ +	struct inode *inode = file->f_mapping->host; +	struct iattr attr; + +	attr.ia_valid = ATTR_SIZE; +	attr.ia_size = i_size_read(inode); + +	attr.ia_file = file; +	attr.ia_valid |= ATTR_FILE; + +	fuse_do_setattr(inode, &attr, file); +} + +static inline loff_t fuse_round_up(loff_t off) +{ +	return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); +} + +static ssize_t +fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, +			loff_t offset) +{ +	ssize_t ret = 0; +	struct file *file = iocb->ki_filp; +	struct fuse_file *ff = file->private_data; +	bool async_dio = ff->fc->async_dio; +	loff_t pos = 0; +	struct inode *inode; +	loff_t i_size; +	size_t count = iov_iter_count(iter); +	struct fuse_io_priv *io; + +	pos = offset; +	inode = file->f_mapping->host; +	i_size = i_size_read(inode); + +	if ((rw == READ) && (offset > i_size)) +		return 0; + +	/* optimization for short read */ +	if (async_dio && rw != WRITE && offset + count > i_size) { +		if (offset >= i_size) +			return 0; +		count = min_t(loff_t, count, fuse_round_up(i_size - offset)); +		iov_iter_truncate(iter, count); +	} + +	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); +	if (!io) +		return -ENOMEM; +	spin_lock_init(&io->lock); +	io->reqs = 1; +	io->bytes = -1; +	io->size = 0; +	io->offset = offset; +	io->write = (rw == WRITE); +	io->err = 0; +	io->file = file; +	/* +	 * By default, we want to optimize all I/Os with async request +	 * submission to the client filesystem if supported. +	 */ +	io->async = async_dio; +	io->iocb = iocb; + +	/* +	 * We cannot asynchronously extend the size of a file. We have no method +	 * to wait on real async I/O requests, so we must submit this request +	 * synchronously. +	 */ +	if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) +		io->async = false; + +	if (rw == WRITE) +		ret = __fuse_direct_write(io, iter, &pos); +	else +		ret = __fuse_direct_read(io, iter, &pos); + +	if (io->async) { +		fuse_aio_complete(io, ret < 0 ? ret : 0, -1); + +		/* we have a non-extending, async request, so return */ +		if (!is_sync_kiocb(iocb)) +			return -EIOCBQUEUED; + +		ret = wait_on_sync_kiocb(iocb); +	} else { +		kfree(io); +	} + +	if (rw == WRITE) { +		if (ret > 0) +			fuse_write_update_size(inode, pos); +		else if (ret < 0 && offset + count > i_size) +			fuse_do_truncate(file); +	} + +	return ret; +} + +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, +				loff_t length) +{ +	struct fuse_file *ff = file->private_data; +	struct inode *inode = file->f_inode; +	struct fuse_inode *fi = get_fuse_inode(inode); +	struct fuse_conn *fc = ff->fc; +	struct fuse_req *req; +	struct fuse_fallocate_in inarg = { +		.fh = ff->fh, +		.offset = offset, +		.length = length, +		.mode = mode +	}; +	int err; +	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || +			   (mode & FALLOC_FL_PUNCH_HOLE); + +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) +		return -EOPNOTSUPP; + +	if (fc->no_fallocate) +		return -EOPNOTSUPP; + +	if (lock_inode) { +		mutex_lock(&inode->i_mutex); +		if (mode & FALLOC_FL_PUNCH_HOLE) { +			loff_t endbyte = offset + length - 1; +			err = filemap_write_and_wait_range(inode->i_mapping, +							   offset, endbyte); +			if (err) +				goto out; + +			fuse_sync_writes(inode); +		} +	} + +	if (!(mode & FALLOC_FL_KEEP_SIZE)) +		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + +	req = fuse_get_req_nopages(fc); +	if (IS_ERR(req)) { +		err = PTR_ERR(req); +		goto out; +	} + +	req->in.h.opcode = FUSE_FALLOCATE; +	req->in.h.nodeid = ff->nodeid; +	req->in.numargs = 1; +	req->in.args[0].size = sizeof(inarg); +	req->in.args[0].value = &inarg; +	fuse_request_send(fc, req); +	err = req->out.h.error; +	if (err == -ENOSYS) { +		fc->no_fallocate = 1; +		err = -EOPNOTSUPP; +	} +	fuse_put_request(fc, req); + +	if (err) +		goto out; + +	/* we could have extended the file */ +	if (!(mode & FALLOC_FL_KEEP_SIZE)) { +		bool changed = fuse_write_update_size(inode, offset + length); + +		if (changed && fc->writeback_cache) +			file_update_time(file); +	} + +	if (mode & FALLOC_FL_PUNCH_HOLE) +		truncate_pagecache_range(inode, offset, offset + length - 1); + +	fuse_invalidate_attr(inode); + +out: +	if (!(mode & FALLOC_FL_KEEP_SIZE)) +		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + +	if (lock_inode) +		mutex_unlock(&inode->i_mutex); + +	return err; +} +  static const struct file_operations fuse_file_operations = {  	.llseek		= fuse_file_llseek, -	.read		= do_sync_read, -	.aio_read	= fuse_file_aio_read, -	.write		= do_sync_write, -	.aio_write	= fuse_file_aio_write, +	.read		= new_sync_read, +	.read_iter	= fuse_file_read_iter, +	.write		= new_sync_write, +	.write_iter	= fuse_file_write_iter,  	.mmap		= fuse_file_mmap,  	.open		= fuse_open,  	.flush		= fuse_flush, @@ -1999,6 +3046,7 @@ static const struct file_operations fuse_file_operations = {  	.unlocked_ioctl	= fuse_file_ioctl,  	.compat_ioctl	= fuse_file_compat_ioctl,  	.poll		= fuse_file_poll, +	.fallocate	= fuse_file_fallocate,  };  static const struct file_operations fuse_direct_io_file_operations = { @@ -2015,18 +3063,21 @@ static const struct file_operations fuse_direct_io_file_operations = {  	.unlocked_ioctl	= fuse_file_ioctl,  	.compat_ioctl	= fuse_file_compat_ioctl,  	.poll		= fuse_file_poll, +	.fallocate	= fuse_file_fallocate,  	/* no splice_read */  };  static const struct address_space_operations fuse_file_aops  = {  	.readpage	= fuse_readpage,  	.writepage	= fuse_writepage, +	.writepages	= fuse_writepages,  	.launder_page	= fuse_launder_page, -	.write_begin	= fuse_write_begin, -	.write_end	= fuse_write_end,  	.readpages	= fuse_readpages,  	.set_page_dirty	= __set_page_dirty_nobuffers,  	.bmap		= fuse_bmap, +	.direct_IO	= fuse_direct_IO, +	.write_begin	= fuse_write_begin, +	.write_end	= fuse_write_end,  };  void fuse_init_file_inode(struct inode *inode) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 57d4a3a0f10..e8e47a6ab51 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,6 +21,7 @@  #include <linux/rwsem.h>  #include <linux/rbtree.h>  #include <linux/poll.h> +#include <linux/workqueue.h>  /** Max number of pages that can be used in a single read request */  #define FUSE_MAX_PAGES_PER_REQ 32 @@ -43,6 +44,9 @@      doing the mount will be allowed to access the filesystem */  #define FUSE_ALLOW_OTHER         (1 << 1) +/** Number of page pointers embedded in fuse_req */ +#define FUSE_REQ_INLINE_PAGES 1 +  /** List of active connections */  extern struct list_head fuse_conn_list; @@ -53,6 +57,12 @@ extern struct mutex fuse_mutex;  extern unsigned max_user_bgreq;  extern unsigned max_user_congthresh; +/* One forget request */ +struct fuse_forget_link { +	struct fuse_forget_one forget_one; +	struct fuse_forget_link *next; +}; +  /** FUSE inode */  struct fuse_inode {  	/** Inode data */ @@ -66,14 +76,17 @@ struct fuse_inode {  	u64 nlookup;  	/** The request used for sending the FORGET message */ -	struct fuse_req *forget_req; +	struct fuse_forget_link *forget;  	/** Time in jiffies until the file attributes are valid */  	u64 i_time;  	/** The sticky bit in inode->i_mode may have been removed, so  	    preserve the original mode */ -	mode_t orig_i_mode; +	umode_t orig_i_mode; + +	/** 64 bit inode number */ +	u64 orig_ino;  	/** Version of last attribute change */  	u64 attr_version; @@ -93,6 +106,19 @@ struct fuse_inode {  	/** List of writepage requestst (pending or sent) */  	struct list_head writepages; + +	/** Miscellaneous bits describing inode state */ +	unsigned long state; +}; + +/** FUSE inode state bits */ +enum { +	/** Advise readdirplus  */ +	FUSE_I_ADVISE_RDPLUS, +	/** Initialized with readdirplus */ +	FUSE_I_INIT_RDPLUS, +	/** An operation changing file size is in progress  */ +	FUSE_I_SIZE_UNSTABLE,  };  struct fuse_conn; @@ -128,6 +154,9 @@ struct fuse_file {  	/** Wait queue head for poll */  	wait_queue_head_t poll_wait; + +	/** Has flock been performed on this file? */ +	bool flock:1;  };  /** One input argument of a request */ @@ -187,6 +216,12 @@ struct fuse_out {  	struct fuse_arg args[3];  }; +/** FUSE page descriptor */ +struct fuse_page_desc { +	unsigned int length; +	unsigned int offset; +}; +  /** The request state */  enum fuse_req_state {  	FUSE_REQ_INIT = 0, @@ -197,6 +232,20 @@ enum fuse_req_state {  	FUSE_REQ_FINISHED  }; +/** The request IO state (for asynchronous processing) */ +struct fuse_io_priv { +	int async; +	spinlock_t lock; +	unsigned reqs; +	ssize_t bytes; +	size_t size; +	__u64 offset; +	bool write; +	int err; +	struct kiocb *iocb; +	struct file *file; +}; +  /**   * A request to the client   */ @@ -255,15 +304,16 @@ struct fuse_req {  	/** Data for asynchronous requests */  	union { -		struct fuse_forget_in forget_in;  		struct { -			struct fuse_release_in in; +			union { +				struct fuse_release_in in; +				struct work_struct work; +			};  			struct path path;  		} release;  		struct fuse_init_in init_in;  		struct fuse_init_out init_out;  		struct cuse_init_in cuse_init_in; -		struct cuse_init_out cuse_init_out;  		struct {  			struct fuse_read_in in;  			u64 attr_ver; @@ -271,26 +321,39 @@ struct fuse_req {  		struct {  			struct fuse_write_in in;  			struct fuse_write_out out; +			struct fuse_req *next;  		} write;  		struct fuse_notify_retrieve_in retrieve_in;  		struct fuse_lk_in lk_in;  	} misc;  	/** page vector */ -	struct page *pages[FUSE_MAX_PAGES_PER_REQ]; +	struct page **pages; + +	/** page-descriptor vector */ +	struct fuse_page_desc *page_descs; + +	/** size of the 'pages' array */ +	unsigned max_pages; + +	/** inline page vector */ +	struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; + +	/** inline page-descriptor vector */ +	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];  	/** number of pages in vector */  	unsigned num_pages; -	/** offset of data on first page */ -	unsigned page_offset; -  	/** File used in the request (or NULL) */  	struct fuse_file *ff;  	/** Inode used in the request or NULL */  	struct inode *inode; +	/** AIO control block */ +	struct fuse_io_priv *io; +  	/** Link on fi->writepages */  	struct list_head writepages_entry; @@ -312,17 +375,16 @@ struct fuse_conn {  	/** Lock protecting accessess to  members of this structure */  	spinlock_t lock; -	/** Mutex protecting against directory alias creation */ -	struct mutex inst_mutex; -  	/** Refcount */  	atomic_t count; +	struct rcu_head rcu; +  	/** The user id for this mount */ -	uid_t user_id; +	kuid_t user_id;  	/** The group id for this mount */ -	gid_t group_id; +	kgid_t group_id;  	/** The fuse mount flags for this mount */  	unsigned flags; @@ -369,6 +431,17 @@ struct fuse_conn {  	/** Pending interrupts */  	struct list_head interrupts; +	/** Queue of pending forgets */ +	struct fuse_forget_link forget_list_head; +	struct fuse_forget_link *forget_list_tail; + +	/** Batching of FORGET requests (positive indicates FORGET batch) */ +	int forget_batch; + +	/** Flag indicating that INIT reply has been received. Allocating +	 * any fuse request will be suspended until the flag is set */ +	int initialized; +  	/** Flag indicating if connection is blocked.  This will be  	    the case before the INIT reply is received, and if there  	    are too many outstading backgrounds requests */ @@ -407,11 +480,17 @@ struct fuse_conn {  	/** Set if bdi is valid */  	unsigned bdi_initialized:1; +	/** write-back cache policy (default is write-through) */ +	unsigned writeback_cache:1; +  	/*  	 * The following bitfields are only for optimization purposes  	 * and hence races in setting them will not cause malfunction  	 */ +	/** Is open/release not implemented by fs? */ +	unsigned no_open:1; +  	/** Is fsync not implemented by fs? */  	unsigned no_fsync:1; @@ -433,7 +512,7 @@ struct fuse_conn {  	/** Is removexattr not implemented by fs? */  	unsigned no_removexattr:1; -	/** Are file locking primitives not implemented by fs? */ +	/** Are posix file locking primitives not implemented by fs? */  	unsigned no_lock:1;  	/** Is access not implemented by fs? */ @@ -457,6 +536,27 @@ struct fuse_conn {  	/** Don't apply umask to creation modes */  	unsigned dont_mask:1; +	/** Are BSD file locking primitives not implemented by fs? */ +	unsigned no_flock:1; + +	/** Is fallocate not implemented by fs? */ +	unsigned no_fallocate:1; + +	/** Is rename with flags implemented by fs? */ +	unsigned no_rename2:1; + +	/** Use enhanced/automatic page cache invalidation. */ +	unsigned auto_inval_data:1; + +	/** Does the filesystem support readdirplus? */ +	unsigned do_readdirplus:1; + +	/** Does the filesystem want adaptive readdirplus? */ +	unsigned readdirplus_auto:1; + +	/** Does the filesystem support asynchronous direct-IO submission? */ +	unsigned async_dio:1; +  	/** The number of requests waiting for completion */  	atomic_t num_waiting; @@ -543,8 +643,13 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,  /**   * Send FORGET command   */ -void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, -		      u64 nodeid, u64 nlookup); +void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, +		       u64 nodeid, u64 nlookup); + +struct fuse_forget_link *fuse_alloc_forget(void); + +/* Used by READDIRPLUS */ +void fuse_force_forget(struct file *file, u64 nodeid);  /**   * Initialize READ or READDIR request @@ -572,7 +677,8 @@ void fuse_release_common(struct file *file, int opcode);  /**   * Send FSYNC or FSYNCDIR request   */ -int fuse_fsync_common(struct file *file, int datasync, int isdir); +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, +		      int datasync, int isdir);  /**   * Notify poll wakeup @@ -620,14 +726,14 @@ int fuse_dev_init(void);  void fuse_dev_cleanup(void);  int fuse_ctl_init(void); -void fuse_ctl_cleanup(void); +void __exit fuse_ctl_cleanup(void);  /**   * Allocate a request   */ -struct fuse_req *fuse_request_alloc(void); +struct fuse_req *fuse_request_alloc(unsigned npages); -struct fuse_req *fuse_request_alloc_nofs(void); +struct fuse_req *fuse_request_alloc_nofs(unsigned npages);  /**   * Free a request @@ -635,14 +741,32 @@ struct fuse_req *fuse_request_alloc_nofs(void);  void fuse_request_free(struct fuse_req *req);  /** - * Get a request, may fail with -ENOMEM + * Get a request, may fail with -ENOMEM, + * caller should specify # elements in req->pages[] explicitly   */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); +struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, +					     unsigned npages); + +/* + * Increment reference count on request + */ +void __fuse_get_request(struct fuse_req *req); + +/** + * Get a request, may fail with -ENOMEM, + * useful for callers who doesn't use req->pages[] + */ +static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) +{ +	return fuse_get_req(fc, 0); +}  /**   * Gets a requests for a file operation, always succeeds   */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, +					     struct file *file);  /**   * Decrement reference count of a request.  If count goes to zero free @@ -656,11 +780,6 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);  void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);  /** - * Send a request with no reply - */ -void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); - -/**   * Send a request in the background   */  void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); @@ -678,6 +797,8 @@ void fuse_invalidate_attr(struct inode *inode);  void fuse_invalidate_entry_cache(struct dentry *entry); +void fuse_invalidate_atime(struct inode *inode); +  /**   * Acquire reference to fuse_conn   */ @@ -711,9 +832,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc);  int fuse_valid_type(int m);  /** - * Is task allowed to perform filesystem operation? + * Is current process allowed to perform filesystem operation?   */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); +int fuse_allow_current_process(struct fuse_conn *fc);  u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); @@ -736,19 +857,44 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,  /**   * File-system tells the kernel to invalidate parent attributes and   * the dentry matching parent/name. + * + * If the child_nodeid is non-zero and: + *    - matches the inode number for the dentry matching parent/name, + *    - is not a mount point + *    - is a file or oan empty directory + * then the dentry is unhashed (d_delete()).   */  int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, -			     struct qstr *name); +			     u64 child_nodeid, struct qstr *name);  int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,  		 bool isdir); -ssize_t fuse_direct_io(struct file *file, const char __user *buf, -		       size_t count, loff_t *ppos, int write); + +/** + * fuse_direct_io() flags + */ + +/** If set, it is WRITE; otherwise - READ */ +#define FUSE_DIO_WRITE (1 << 0) + +/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */ +#define FUSE_DIO_CUSE  (1 << 1) + +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, +		       loff_t *ppos, int flags);  long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  		   unsigned int flags); +long fuse_ioctl_common(struct file *file, unsigned int cmd, +		       unsigned long arg, unsigned int flags);  unsigned fuse_file_poll(struct file *file, poll_table *wait);  int fuse_dev_release(struct inode *inode, struct file *file); -void fuse_write_update_size(struct inode *inode, loff_t pos); +bool fuse_write_update_size(struct inode *inode, loff_t pos); + +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); + +int fuse_do_setattr(struct inode *inode, struct iattr *attr, +		    struct file *file);  #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cfce3ad86a9..03246cd9d47 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh,  struct fuse_mount_data {  	int fd;  	unsigned rootmode; -	unsigned user_id; -	unsigned group_id; +	kuid_t user_id; +	kgid_t group_id;  	unsigned fd_present:1;  	unsigned rootmode_present:1;  	unsigned user_id_present:1; @@ -71,6 +71,11 @@ struct fuse_mount_data {  	unsigned blksize;  }; +struct fuse_forget_link *fuse_alloc_forget(void) +{ +	return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); +} +  static struct inode *fuse_alloc_inode(struct super_block *sb)  {  	struct inode *inode; @@ -86,12 +91,14 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)  	fi->nlookup = 0;  	fi->attr_version = 0;  	fi->writectr = 0; +	fi->orig_ino = 0; +	fi->state = 0;  	INIT_LIST_HEAD(&fi->write_files);  	INIT_LIST_HEAD(&fi->queued_writes);  	INIT_LIST_HEAD(&fi->writepages);  	init_waitqueue_head(&fi->page_waitq); -	fi->forget_req = fuse_request_alloc(); -	if (!fi->forget_req) { +	fi->forget = fuse_alloc_forget(); +	if (!fi->forget) {  		kmem_cache_free(fuse_inode_cachep, inode);  		return NULL;  	} @@ -99,49 +106,54 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)  	return inode;  } -static void fuse_destroy_inode(struct inode *inode) +static void fuse_i_callback(struct rcu_head *head)  { -	struct fuse_inode *fi = get_fuse_inode(inode); -	BUG_ON(!list_empty(&fi->write_files)); -	BUG_ON(!list_empty(&fi->queued_writes)); -	if (fi->forget_req) -		fuse_request_free(fi->forget_req); +	struct inode *inode = container_of(head, struct inode, i_rcu);  	kmem_cache_free(fuse_inode_cachep, inode);  } -void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, -		      u64 nodeid, u64 nlookup) +static void fuse_destroy_inode(struct inode *inode)  { -	struct fuse_forget_in *inarg = &req->misc.forget_in; -	inarg->nlookup = nlookup; -	req->in.h.opcode = FUSE_FORGET; -	req->in.h.nodeid = nodeid; -	req->in.numargs = 1; -	req->in.args[0].size = sizeof(struct fuse_forget_in); -	req->in.args[0].value = inarg; -	fuse_request_send_noreply(fc, req); +	struct fuse_inode *fi = get_fuse_inode(inode); +	BUG_ON(!list_empty(&fi->write_files)); +	BUG_ON(!list_empty(&fi->queued_writes)); +	kfree(fi->forget); +	call_rcu(&inode->i_rcu, fuse_i_callback);  }  static void fuse_evict_inode(struct inode *inode)  { -	truncate_inode_pages(&inode->i_data, 0); -	end_writeback(inode); +	truncate_inode_pages_final(&inode->i_data); +	clear_inode(inode);  	if (inode->i_sb->s_flags & MS_ACTIVE) {  		struct fuse_conn *fc = get_fuse_conn(inode);  		struct fuse_inode *fi = get_fuse_inode(inode); -		fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); -		fi->forget_req = NULL; +		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); +		fi->forget = NULL;  	}  }  static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)  { +	sync_filesystem(sb);  	if (*flags & MS_MANDLOCK)  		return -EINVAL;  	return 0;  } +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. + */ +static ino_t fuse_squash_ino(u64 ino64) +{ +	ino_t ino = (ino_t) ino64; +	if (sizeof(ino_t) < sizeof(u64)) +		ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; +	return ino; +} +  void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,  				   u64 attr_valid)  { @@ -151,18 +163,21 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,  	fi->attr_version = ++fc->attr_version;  	fi->i_time = attr_valid; -	inode->i_ino     = attr->ino; +	inode->i_ino     = fuse_squash_ino(attr->ino);  	inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777); -	inode->i_nlink   = attr->nlink; -	inode->i_uid     = attr->uid; -	inode->i_gid     = attr->gid; +	set_nlink(inode, attr->nlink); +	inode->i_uid     = make_kuid(&init_user_ns, attr->uid); +	inode->i_gid     = make_kgid(&init_user_ns, attr->gid);  	inode->i_blocks  = attr->blocks;  	inode->i_atime.tv_sec   = attr->atime;  	inode->i_atime.tv_nsec  = attr->atimensec; -	inode->i_mtime.tv_sec   = attr->mtime; -	inode->i_mtime.tv_nsec  = attr->mtimensec; -	inode->i_ctime.tv_sec   = attr->ctime; -	inode->i_ctime.tv_nsec  = attr->ctimensec; +	/* mtime from server may be stale due to local buffered write */ +	if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { +		inode->i_mtime.tv_sec   = attr->mtime; +		inode->i_mtime.tv_nsec  = attr->mtimensec; +		inode->i_ctime.tv_sec   = attr->ctime; +		inode->i_ctime.tv_nsec  = attr->ctimensec; +	}  	if (attr->blksize != 0)  		inode->i_blkbits = ilog2(attr->blksize); @@ -177,6 +192,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,  	fi->orig_i_mode = inode->i_mode;  	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))  		inode->i_mode &= ~S_ISVTX; + +	fi->orig_ino = attr->ino;  }  void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, @@ -184,23 +201,52 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,  {  	struct fuse_conn *fc = get_fuse_conn(inode);  	struct fuse_inode *fi = get_fuse_inode(inode); +	bool is_wb = fc->writeback_cache;  	loff_t oldsize; +	struct timespec old_mtime;  	spin_lock(&fc->lock); -	if (attr_version != 0 && fi->attr_version > attr_version) { +	if ((attr_version != 0 && fi->attr_version > attr_version) || +	    test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {  		spin_unlock(&fc->lock);  		return;  	} +	old_mtime = inode->i_mtime;  	fuse_change_attributes_common(inode, attr, attr_valid);  	oldsize = inode->i_size; -	i_size_write(inode, attr->size); +	/* +	 * In case of writeback_cache enabled, the cached writes beyond EOF +	 * extend local i_size without keeping userspace server in sync. So, +	 * attr->size coming from server can be stale. We cannot trust it. +	 */ +	if (!is_wb || !S_ISREG(inode->i_mode)) +		i_size_write(inode, attr->size);  	spin_unlock(&fc->lock); -	if (S_ISREG(inode->i_mode) && oldsize != attr->size) { -		truncate_pagecache(inode, oldsize, attr->size); -		invalidate_inode_pages2(inode->i_mapping); +	if (!is_wb && S_ISREG(inode->i_mode)) { +		bool inval = false; + +		if (oldsize != attr->size) { +			truncate_pagecache(inode, attr->size); +			inval = true; +		} else if (fc->auto_inval_data) { +			struct timespec new_mtime = { +				.tv_sec = attr->mtime, +				.tv_nsec = attr->mtimensec, +			}; + +			/* +			 * Auto inval mode also checks and invalidates if mtime +			 * has changed. +			 */ +			if (!timespec_equal(&old_mtime, &new_mtime)) +				inval = true; +		} + +		if (inval) +			invalidate_inode_pages2(inode->i_mapping);  	}  } @@ -208,6 +254,10 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)  {  	inode->i_mode = attr->mode & S_IFMT;  	inode->i_size = attr->size; +	inode->i_mtime.tv_sec  = attr->mtime; +	inode->i_mtime.tv_nsec = attr->mtimensec; +	inode->i_ctime.tv_sec  = attr->ctime; +	inode->i_ctime.tv_nsec = attr->ctimensec;  	if (S_ISREG(inode->i_mode)) {  		fuse_init_common(inode);  		fuse_init_file_inode(inode); @@ -254,7 +304,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,  		return NULL;  	if ((inode->i_state & I_NEW)) { -		inode->i_flags |= S_NOATIME|S_NOCMTIME; +		inode->i_flags |= S_NOATIME; +		if (!fc->writeback_cache || !S_ISREG(attr->mode)) +			inode->i_flags |= S_NOCMTIME;  		inode->i_generation = generation;  		inode->i_data.backing_dev_info = &fc->bdi;  		fuse_init_inode(inode, attr); @@ -312,6 +364,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)  		fc->destroy_req = NULL;  		req->in.h.opcode = FUSE_DESTROY;  		req->force = 1; +		req->background = 0;  		fuse_request_send(fc, req);  		fuse_put_request(fc, req);  	} @@ -328,17 +381,13 @@ void fuse_conn_kill(struct fuse_conn *fc)  	spin_lock(&fc->lock);  	fc->connected = 0;  	fc->blocked = 0; +	fc->initialized = 1;  	spin_unlock(&fc->lock);  	/* Flush all readers on this fs */  	kill_fasync(&fc->fasync, SIGIO, POLL_IN);  	wake_up_all(&fc->waitq);  	wake_up_all(&fc->blocked_waitq);  	wake_up_all(&fc->reserved_req_waitq); -	mutex_lock(&fuse_mutex); -	list_del(&fc->entry); -	fuse_ctl_remove_conn(fc); -	mutex_unlock(&fuse_mutex); -	fuse_bdi_destroy(fc);  }  EXPORT_SYMBOL_GPL(fuse_conn_kill); @@ -347,7 +396,14 @@ static void fuse_put_super(struct super_block *sb)  	struct fuse_conn *fc = get_fuse_conn_super(sb);  	fuse_send_destroy(fc); +  	fuse_conn_kill(fc); +	mutex_lock(&fuse_mutex); +	list_del(&fc->entry); +	fuse_ctl_remove_conn(fc); +	mutex_unlock(&fuse_mutex); +	fuse_bdi_destroy(fc); +  	fuse_conn_put(fc);  } @@ -373,12 +429,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)  	struct fuse_statfs_out outarg;  	int err; -	if (!fuse_allow_task(fc, current)) { +	if (!fuse_allow_current_process(fc)) {  		buf->f_type = FUSE_SUPER_MAGIC;  		return 0;  	} -	req = fuse_get_req(fc); +	req = fuse_get_req_nopages(fc);  	if (IS_ERR(req))  		return PTR_ERR(req); @@ -422,6 +478,17 @@ static const match_table_t tokens = {  	{OPT_ERR,			NULL}  }; +static int fuse_match_uint(substring_t *s, unsigned int *res) +{ +	int err = -ENOMEM; +	char *buf = match_strdup(s); +	if (buf) { +		err = kstrtouint(buf, 10, res); +		kfree(buf); +	} +	return err; +} +  static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)  {  	char *p; @@ -432,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)  	while ((p = strsep(&opt, ",")) != NULL) {  		int token;  		int value; +		unsigned uv;  		substring_t args[MAX_OPT_ARGS];  		if (!*p)  			continue; @@ -455,16 +523,20 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)  			break;  		case OPT_USER_ID: -			if (match_int(&args[0], &value)) +			if (fuse_match_uint(&args[0], &uv)) +				return 0; +			d->user_id = make_kuid(current_user_ns(), uv); +			if (!uid_valid(d->user_id))  				return 0; -			d->user_id = value;  			d->user_id_present = 1;  			break;  		case OPT_GROUP_ID: -			if (match_int(&args[0], &value)) +			if (fuse_match_uint(&args[0], &uv)) +				return 0; +			d->group_id = make_kgid(current_user_ns(), uv); +			if (!gid_valid(d->group_id))  				return 0; -			d->group_id = value;  			d->group_id_present = 1;  			break; @@ -500,21 +572,21 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)  	return 1;  } -static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) +static int fuse_show_options(struct seq_file *m, struct dentry *root)  { -	struct fuse_conn *fc = get_fuse_conn_super(mnt->mnt_sb); +	struct super_block *sb = root->d_sb; +	struct fuse_conn *fc = get_fuse_conn_super(sb); -	seq_printf(m, ",user_id=%u", fc->user_id); -	seq_printf(m, ",group_id=%u", fc->group_id); +	seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); +	seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));  	if (fc->flags & FUSE_DEFAULT_PERMISSIONS)  		seq_puts(m, ",default_permissions");  	if (fc->flags & FUSE_ALLOW_OTHER)  		seq_puts(m, ",allow_other");  	if (fc->max_read != ~0)  		seq_printf(m, ",max_read=%u", fc->max_read); -	if (mnt->mnt_sb->s_bdev && -	    mnt->mnt_sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) -		seq_printf(m, ",blksize=%lu", mnt->mnt_sb->s_blocksize); +	if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) +		seq_printf(m, ",blksize=%lu", sb->s_blocksize);  	return 0;  } @@ -522,7 +594,6 @@ void fuse_conn_init(struct fuse_conn *fc)  {  	memset(fc, 0, sizeof(*fc));  	spin_lock_init(&fc->lock); -	mutex_init(&fc->inst_mutex);  	init_rwsem(&fc->killsb);  	atomic_set(&fc->count, 1);  	init_waitqueue_head(&fc->waitq); @@ -534,13 +605,15 @@ void fuse_conn_init(struct fuse_conn *fc)  	INIT_LIST_HEAD(&fc->interrupts);  	INIT_LIST_HEAD(&fc->bg_queue);  	INIT_LIST_HEAD(&fc->entry); +	fc->forget_list_tail = &fc->forget_list_head;  	atomic_set(&fc->num_waiting, 0);  	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;  	fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;  	fc->khctr = 0;  	fc->polled_files = RB_ROOT;  	fc->reqctr = 0; -	fc->blocked = 1; +	fc->blocked = 0; +	fc->initialized = 0;  	fc->attr_version = 1;  	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));  } @@ -551,7 +624,6 @@ void fuse_conn_put(struct fuse_conn *fc)  	if (atomic_dec_and_test(&fc->count)) {  		if (fc->destroy_req)  			fuse_request_free(fc->destroy_req); -		mutex_destroy(&fc->inst_mutex);  		fc->release(fc);  	}  } @@ -618,10 +690,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,  		goto out_iput;  	entry = d_obtain_alias(inode); -	if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) { -		entry->d_op = &fuse_dentry_operations; +	if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)  		fuse_invalidate_entry_cache(entry); -	}  	return entry; @@ -631,17 +701,17 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,  	return ERR_PTR(err);  } -static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, -			   int connectable) +static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, +			   struct inode *parent)  { -	struct inode *inode = dentry->d_inode; -	bool encode_parent = connectable && !S_ISDIR(inode->i_mode); -	int len = encode_parent ? 6 : 3; +	int len = parent ? 6 : 3;  	u64 nodeid;  	u32 generation; -	if (*max_len < len) -		return  255; +	if (*max_len < len) { +		*max_len = len; +		return  FILEID_INVALID; +	}  	nodeid = get_fuse_inode(inode)->nodeid;  	generation = inode->i_generation; @@ -650,14 +720,9 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,  	fh[1] = (u32)(nodeid & 0xffffffff);  	fh[2] = generation; -	if (encode_parent) { -		struct inode *parent; - -		spin_lock(&dentry->d_lock); -		parent = dentry->d_parent->d_inode; +	if (parent) {  		nodeid = get_fuse_inode(parent)->nodeid;  		generation = parent->i_generation; -		spin_unlock(&dentry->d_lock);  		fh[3] = (u32)(nodeid >> 32);  		fh[4] = (u32)(nodeid & 0xffffffff); @@ -665,7 +730,7 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,  	}  	*max_len = len; -	return encode_parent ? 0x82 : 0x81; +	return parent ? 0x82 : 0x81;  }  static struct dentry *fuse_fh_to_dentry(struct super_block *sb, @@ -720,10 +785,8 @@ static struct dentry *fuse_get_parent(struct dentry *child)  	}  	parent = d_obtain_alias(inode); -	if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) { -		parent->d_op = &fuse_dentry_operations; +	if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)  		fuse_invalidate_entry_cache(parent); -	}  	return parent;  } @@ -739,6 +802,7 @@ static const struct super_operations fuse_super_operations = {  	.alloc_inode    = fuse_alloc_inode,  	.destroy_inode  = fuse_destroy_inode,  	.evict_inode	= fuse_evict_inode, +	.write_inode	= fuse_write_inode,  	.drop_inode	= generic_delete_inode,  	.remount_fs	= fuse_remount_fs,  	.put_super	= fuse_put_super, @@ -750,7 +814,7 @@ static const struct super_operations fuse_super_operations = {  static void sanitize_global_limit(unsigned *limit)  {  	if (*limit == 0) -		*limit = ((num_physpages << PAGE_SHIFT) >> 13) / +		*limit = ((totalram_pages << PAGE_SHIFT) >> 13) /  			 sizeof(struct fuse_req);  	if (*limit >= 1 << 16) @@ -812,6 +876,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  				fc->async_read = 1;  			if (!(arg->flags & FUSE_POSIX_LOCKS))  				fc->no_lock = 1; +			if (arg->minor >= 17) { +				if (!(arg->flags & FUSE_FLOCK_LOCKS)) +					fc->no_flock = 1; +			} else { +				if (!(arg->flags & FUSE_POSIX_LOCKS)) +					fc->no_flock = 1; +			}  			if (arg->flags & FUSE_ATOMIC_O_TRUNC)  				fc->atomic_o_trunc = 1;  			if (arg->minor >= 9) { @@ -823,9 +894,23 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  				fc->big_writes = 1;  			if (arg->flags & FUSE_DONT_MASK)  				fc->dont_mask = 1; +			if (arg->flags & FUSE_AUTO_INVAL_DATA) +				fc->auto_inval_data = 1; +			if (arg->flags & FUSE_DO_READDIRPLUS) { +				fc->do_readdirplus = 1; +				if (arg->flags & FUSE_READDIRPLUS_AUTO) +					fc->readdirplus_auto = 1; +			} +			if (arg->flags & FUSE_ASYNC_DIO) +				fc->async_dio = 1; +			if (arg->flags & FUSE_WRITEBACK_CACHE) +				fc->writeback_cache = 1; +			if (arg->time_gran && arg->time_gran <= 1000000000) +				fc->sb->s_time_gran = arg->time_gran;  		} else {  			ra_pages = fc->max_read / PAGE_CACHE_SIZE;  			fc->no_lock = 1; +			fc->no_flock = 1;  		}  		fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); @@ -834,7 +919,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)  		fc->max_write = max_t(unsigned, 4096, fc->max_write);  		fc->conn_init = 1;  	} -	fc->blocked = 0; +	fc->initialized = 1;  	wake_up_all(&fc->blocked_waitq);  } @@ -846,7 +931,11 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)  	arg->minor = FUSE_KERNEL_MINOR_VERSION;  	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;  	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | -		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; +		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | +		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | +		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | +		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | +		FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT;  	req->in.h.opcode = FUSE_INIT;  	req->in.numargs = 1;  	req->in.args[0].size = sizeof(*arg); @@ -864,7 +953,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)  static void fuse_free_conn(struct fuse_conn *fc)  { -	kfree(fc); +	kfree_rcu(fc, rcu);  }  static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) @@ -873,9 +962,8 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)  	fc->bdi.name = "fuse";  	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; -	fc->bdi.unplug_io_fn = default_unplug_io_fn;  	/* fuse does it's own writeback accounting */ -	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; +	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;  	err = bdi_init(&fc->bdi);  	if (err) @@ -925,7 +1013,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)  	if (sb->s_flags & MS_MANDLOCK)  		goto err; -	if (!parse_fuse_opt((char *) data, &d, is_bdev)) +	sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); + +	if (!parse_fuse_opt(data, &d, is_bdev))  		goto err;  	if (is_bdev) { @@ -941,6 +1031,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)  	sb->s_magic = FUSE_SUPER_MAGIC;  	sb->s_op = &fuse_super_operations;  	sb->s_maxbytes = MAX_LFS_FILESIZE; +	sb->s_time_gran = 1;  	sb->s_export_op = &fuse_export_operations;  	file = fget(d.fd); @@ -948,7 +1039,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)  	if (!file)  		goto err; -	if (file->f_op != &fuse_dev_operations) +	if ((file->f_op != &fuse_dev_operations) || +	    (file->f_cred->user_ns != &init_user_ns))  		goto err_fput;  	fc = kmalloc(sizeof(*fc), GFP_KERNEL); @@ -982,21 +1074,19 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)  	err = -ENOMEM;  	root = fuse_get_root_inode(sb, d.rootmode); -	if (!root) +	root_dentry = d_make_root(root); +	if (!root_dentry)  		goto err_put_conn; +	/* only now - we want root dentry with NULL ->d_op */ +	sb->s_d_op = &fuse_dentry_operations; -	root_dentry = d_alloc_root(root); -	if (!root_dentry) { -		iput(root); -		goto err_put_conn; -	} - -	init_req = fuse_request_alloc(); +	init_req = fuse_request_alloc(0);  	if (!init_req)  		goto err_put_root; +	init_req->background = 1;  	if (is_bdev) { -		fc->destroy_req = fuse_request_alloc(); +		fc->destroy_req = fuse_request_alloc(0);  		if (!fc->destroy_req)  			goto err_free_init_req;  	} @@ -1068,6 +1158,7 @@ static struct file_system_type fuse_fs_type = {  	.mount		= fuse_mount,  	.kill_sb	= fuse_kill_sb_anon,  }; +MODULE_ALIAS_FS("fuse");  #ifdef CONFIG_BLOCK  static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, @@ -1097,6 +1188,7 @@ static struct file_system_type fuseblk_fs_type = {  	.kill_sb	= fuse_kill_sb_blk,  	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,  }; +MODULE_ALIAS_FS("fuseblk");  static inline int register_fuseblk(void)  { @@ -1129,28 +1221,28 @@ static int __init fuse_fs_init(void)  {  	int err; -	err = register_filesystem(&fuse_fs_type); -	if (err) -		goto out; - -	err = register_fuseblk(); -	if (err) -		goto out_unreg; -  	fuse_inode_cachep = kmem_cache_create("fuse_inode",  					      sizeof(struct fuse_inode),  					      0, SLAB_HWCACHE_ALIGN,  					      fuse_inode_init_once);  	err = -ENOMEM;  	if (!fuse_inode_cachep) -		goto out_unreg2; +		goto out; + +	err = register_fuseblk(); +	if (err) +		goto out2; + +	err = register_filesystem(&fuse_fs_type); +	if (err) +		goto out3;  	return 0; - out_unreg2: + out3:  	unregister_fuseblk(); - out_unreg: -	unregister_filesystem(&fuse_fs_type); + out2: +	kmem_cache_destroy(fuse_inode_cachep);   out:  	return err;  } @@ -1159,6 +1251,12 @@ static void fuse_fs_cleanup(void)  {  	unregister_filesystem(&fuse_fs_type);  	unregister_fuseblk(); + +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. +	 */ +	rcu_barrier();  	kmem_cache_destroy(fuse_inode_cachep);  }  | 
