diff options
Diffstat (limited to 'fs/nfs')
53 files changed, 4283 insertions, 2999 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b5e80b0af31..3dece03f2fc 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -116,17 +116,17 @@ config NFS_V4_2  config PNFS_FILE_LAYOUT  	tristate  	depends on NFS_V4_1 -	default m +	default NFS_V4  config PNFS_BLOCK  	tristate  	depends on NFS_V4_1 && BLK_DEV_DM -	default m +	default NFS_V4  config PNFS_OBJLAYOUT  	tristate  	depends on NFS_V4_1 && SCSI_OSD_ULD -	default m +	default NFS_V4  config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN  	string "NFSv4.1 Implementation ID Domain" @@ -140,6 +140,17 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN  	  If the NFS client is unchanged from the upstream kernel, this  	  option should be set to the default "kernel.org". +config NFS_V4_1_MIGRATION +	bool "NFSv4.1 client support for migration" +	depends on NFS_V4_1 +	default n +	help +	  This option makes the NFS client advertise to NFSv4.1 servers that +          it can support NFSv4 migration. + +          The NFSv4.1 pieces of the Linux NFSv4 migration implementation are +          still experimental.  If you are not an NFSv4 developer, say N here. 
+  config NFS_V4_SECURITY_LABEL  	bool  	depends on NFS_V4_2 && SECURITY diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 03192a66c14..4782e0840dc 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -29,8 +29,6 @@ nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o  nfsv4-$(CONFIG_SYSCTL)	+= nfs4sysctl.o  nfsv4-$(CONFIG_NFS_V4_1)	+= pnfs.o pnfs_dev.o -obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o -nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o - +obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/  obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/  obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index e242bbf7297..9b431f44fad 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -134,8 +134,8 @@ bl_submit_bio(int rw, struct bio *bio)  	if (bio) {  		get_parallel(bio->bi_private);  		dprintk("%s submitting %s bio %u@%llu\n", __func__, -			rw == READ ? "read" : "write", -			bio->bi_size, (unsigned long long)bio->bi_sector); +			rw == READ ? 
"read" : "write", bio->bi_iter.bi_size, +			(unsigned long long)bio->bi_iter.bi_sector);  		submit_bio(rw, bio);  	}  	return NULL; @@ -156,7 +156,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,  	}  	if (bio) { -		bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; +		bio->bi_iter.bi_sector = isect - be->be_f_offset + +			be->be_v_offset;  		bio->bi_bdev = be->be_mdev;  		bio->bi_end_io = end_io;  		bio->bi_private = par; @@ -201,19 +202,15 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,  static void bl_end_io_read(struct bio *bio, int err)  {  	struct parallel_io *par = bio->bi_private; -	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; +	struct bio_vec *bvec; +	int i; -	do { -		struct page *page = bvec->bv_page; +	if (!err) +		bio_for_each_segment_all(bvec, bio, i) +			SetPageUptodate(bvec->bv_page); -		if (--bvec >= bio->bi_io_vec) -			prefetchw(&bvec->bv_page->flags); -		if (uptodate) -			SetPageUptodate(page); -	} while (bvec >= bio->bi_io_vec); -	if (!uptodate) { -		struct nfs_read_data *rdata = par->data; +	if (err) { +		struct nfs_pgio_data *rdata = par->data;  		struct nfs_pgio_header *header = rdata->header;  		if (!header->pnfs_error) @@ -227,17 +224,17 @@ static void bl_end_io_read(struct bio *bio, int err)  static void bl_read_cleanup(struct work_struct *work)  {  	struct rpc_task *task; -	struct nfs_read_data *rdata; +	struct nfs_pgio_data *rdata;  	dprintk("%s enter\n", __func__);  	task = container_of(work, struct rpc_task, u.tk_work); -	rdata = container_of(task, struct nfs_read_data, task); +	rdata = container_of(task, struct nfs_pgio_data, task);  	pnfs_ld_read_done(rdata);  }  static void  bl_end_par_io_read(void *data, int unused)  { -	struct nfs_read_data *rdata = data; +	struct nfs_pgio_data *rdata = data;  	rdata->task.tk_status = rdata->header->pnfs_error;  	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); @@ 
-245,7 +242,7 @@ bl_end_par_io_read(void *data, int unused)  }  static enum pnfs_try_status -bl_read_pagelist(struct nfs_read_data *rdata) +bl_read_pagelist(struct nfs_pgio_data *rdata)  {  	struct nfs_pgio_header *header = rdata->header;  	int i, hole; @@ -383,21 +380,17 @@ static void mark_extents_written(struct pnfs_block_layout *bl,  static void bl_end_io_write_zero(struct bio *bio, int err)  {  	struct parallel_io *par = bio->bi_private; -	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - -	do { -		struct page *page = bvec->bv_page; +	struct bio_vec *bvec; +	int i; -		if (--bvec >= bio->bi_io_vec) -			prefetchw(&bvec->bv_page->flags); +	bio_for_each_segment_all(bvec, bio, i) {  		/* This is the zeroing page we added */ -		end_page_writeback(page); -		page_cache_release(page); -	} while (bvec >= bio->bi_io_vec); +		end_page_writeback(bvec->bv_page); +		page_cache_release(bvec->bv_page); +	} -	if (unlikely(!uptodate)) { -		struct nfs_write_data *data = par->data; +	if (unlikely(err)) { +		struct nfs_pgio_data *data = par->data;  		struct nfs_pgio_header *header = data->header;  		if (!header->pnfs_error) @@ -412,7 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err)  {  	struct parallel_io *par = bio->bi_private;  	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -	struct nfs_write_data *data = par->data; +	struct nfs_pgio_data *data = par->data;  	struct nfs_pgio_header *header = data->header;  	if (!uptodate) { @@ -430,10 +423,10 @@ static void bl_end_io_write(struct bio *bio, int err)  static void bl_write_cleanup(struct work_struct *work)  {  	struct rpc_task *task; -	struct nfs_write_data *wdata; +	struct nfs_pgio_data *wdata;  	dprintk("%s enter\n", __func__);  	task = container_of(work, struct rpc_task, u.tk_work); -	wdata = container_of(task, struct nfs_write_data, task); +	wdata = container_of(task, struct nfs_pgio_data, task);  	if 
(likely(!wdata->header->pnfs_error)) {  		/* Marks for LAYOUTCOMMIT */  		mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), @@ -445,7 +438,7 @@ static void bl_write_cleanup(struct work_struct *work)  /* Called when last of bios associated with a bl_write_pagelist call finishes */  static void bl_end_par_io_write(void *data, int num_se)  { -	struct nfs_write_data *wdata = data; +	struct nfs_pgio_data *wdata = data;  	if (unlikely(wdata->header->pnfs_error)) {  		bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, @@ -519,7 +512,7 @@ bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,  	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +  		(offset / SECTOR_SIZE); -	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; +	bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;  	bio->bi_bdev = be->be_mdev;  	bio->bi_end_io = bl_read_single_end_io; @@ -680,7 +673,7 @@ check_page:  }  static enum pnfs_try_status -bl_write_pagelist(struct nfs_write_data *wdata, int sync) +bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)  {  	struct nfs_pgio_header *header = wdata->header;  	int i, ret, npg_zero, pg_index, last = 0; @@ -1196,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)  		pnfs_generic_pg_init_read(pgio, req);  } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
+ */ +static size_t  bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,  		struct nfs_page *req)  {  	if (pgio->pg_dreq != NULL &&  	    !is_aligned_req(req, SECTOR_SIZE)) -		return false; +		return 0;  	return pnfs_generic_pg_test(pgio, prev, req);  } @@ -1220,7 +1217,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)  	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);  	if (end != NFS_I(inode)->npages) {  		rcu_read_lock(); -		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); +		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);  		rcu_read_unlock();  	} @@ -1248,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)  	}  } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t  bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,  		 struct nfs_page *req)  {  	if (pgio->pg_dreq != NULL &&  	    !is_aligned_req(req, PAGE_CACHE_SIZE)) -		return false; +		return 0;  	return pnfs_generic_pg_test(pgio, prev, req);  } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 8485978993e..9838fb02047 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -36,6 +36,7 @@  #include <linux/nfs_fs.h>  #include <linux/sunrpc/rpc_pipe_fs.h> +#include "../nfs4_fs.h"  #include "../pnfs.h"  #include "../netns.h" diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 9c3e117c3ed..4d016144256 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -44,7 +44,7 @@  static inline sector_t normalize(sector_t s, int base)  {  	sector_t tmp = s; /* Since do_div modifies its argument */ -	return s - do_div(tmp, base); +	return s - sector_div(tmp, base);  }  static inline sector_t normalize_up(sector_t s, int base) diff --git 
a/fs/nfs/callback.c b/fs/nfs/callback.c index 67cd7321316..073b4cf67ed 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -164,8 +164,7 @@ nfs41_callback_up(struct svc_serv *serv)  		svc_xprt_put(serv->sv_bc_xprt);  		serv->sv_bc_xprt = NULL;  	} -	dprintk("--> %s return %ld\n", __func__, -		IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); +	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));  	return rqstp;  } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index ae2e87b9545..41db5258e7a 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -112,7 +112,8 @@ out:   * TODO: keep track of all layouts (and delegations) in a hash table   * hashed by filehandle.   */ -static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh) +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, +		struct nfs_fh *fh, nfs4_stateid *stateid)  {  	struct nfs_server *server;  	struct inode *ino; @@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,  	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {  		list_for_each_entry(lo, &server->layouts, plh_layouts) { +			if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) +				continue;  			if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))  				continue;  			ino = igrab(lo->plh_inode);  			if (!ino) -				continue; +				break;  			spin_lock(&ino->i_lock);  			/* Is this layout in the process of being freed? 
*/  			if (NFS_I(ino)->layout != lo) {  				spin_unlock(&ino->i_lock);  				iput(ino); -				continue; +				break;  			}  			pnfs_get_layout_hdr(lo);  			spin_unlock(&ino->i_lock); @@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,  	return NULL;  } -static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh) +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, +		struct nfs_fh *fh, nfs4_stateid *stateid)  {  	struct pnfs_layout_hdr *lo;  	spin_lock(&clp->cl_lock);  	rcu_read_lock(); -	lo = get_layout_by_fh_locked(clp, fh); +	lo = get_layout_by_fh_locked(clp, fh, stateid);  	rcu_read_unlock();  	spin_unlock(&clp->cl_lock); @@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp,  	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;  	LIST_HEAD(free_me_list); -	lo = get_layout_by_fh(clp, &args->cbl_fh); +	lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);  	if (!lo) -		return NFS4ERR_NOMATCHING_LAYOUT; +		goto out;  	ino = lo->plh_inode;  	spin_lock(&ino->i_lock); @@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,  	pnfs_free_lseg_list(&free_me_list);  	pnfs_put_layout_hdr(lo);  	iput(ino); +out:  	return rv;  } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2dceee4db07..1d09289c8f0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -590,6 +590,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,  	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))  		args.flags |= RPC_CLNT_CREATE_DISCRTRY; +	if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) +		args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;  	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))  		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;  	if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) @@ -784,8 +786,10 @@ static int nfs_init_server(struct nfs_server *server,  		goto error;  	server->port = data->nfs_server.port; +	server->auth_info = data->auth_info; 
-	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); +	error = nfs_init_server_rpcclient(server, &timeparms, +					  data->selected_flavor);  	if (error < 0)  		goto error; @@ -926,6 +930,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour  	target->acdirmax = source->acdirmax;  	target->caps = source->caps;  	target->options = source->options; +	target->auth_info = source->auth_info;  }  EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); @@ -943,7 +948,7 @@ void nfs_server_insert_lists(struct nfs_server *server)  }  EXPORT_SYMBOL_GPL(nfs_server_insert_lists); -static void nfs_server_remove_lists(struct nfs_server *server) +void nfs_server_remove_lists(struct nfs_server *server)  {  	struct nfs_client *clp = server->nfs_client;  	struct nfs_net *nn; @@ -960,6 +965,7 @@ static void nfs_server_remove_lists(struct nfs_server *server)  	synchronize_rcu();  } +EXPORT_SYMBOL_GPL(nfs_server_remove_lists);  /*   * Allocate and initialise a server record diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index ef792f29f83..5d8ccecf5f5 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -659,16 +659,19 @@ int nfs_async_inode_return_delegation(struct inode *inode,  	rcu_read_lock();  	delegation = rcu_dereference(NFS_I(inode)->delegation); +	if (delegation == NULL) +		goto out_enoent; -	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) { -		rcu_read_unlock(); -		return -ENOENT; -	} +	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) +		goto out_enoent;  	nfs_mark_return_delegation(server, delegation);  	rcu_read_unlock();  	nfs_delegation_run_state_manager(clp);  	return 0; +out_enoent: +	rcu_read_unlock(); +	return -ENOENT;  }  static struct inode * diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index de434f309af..4a3d4ef7612 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = {  static struct nfs_open_dir_context 
*alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)  { +	struct nfs_inode *nfsi = NFS_I(dir);  	struct nfs_open_dir_context *ctx;  	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);  	if (ctx != NULL) {  		ctx->duped = 0; -		ctx->attr_gencount = NFS_I(dir)->attr_gencount; +		ctx->attr_gencount = nfsi->attr_gencount;  		ctx->dir_cookie = 0;  		ctx->dup_cookie = 0;  		ctx->cred = get_rpccred(cred); +		spin_lock(&dir->i_lock); +		list_add(&ctx->list, &nfsi->open_files); +		spin_unlock(&dir->i_lock);  		return ctx;  	}  	return  ERR_PTR(-ENOMEM);  } -static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)  { +	spin_lock(&dir->i_lock); +	list_del(&ctx->list); +	spin_unlock(&dir->i_lock);  	put_rpccred(ctx->cred);  	kfree(ctx);  } @@ -98,9 +105,7 @@ nfs_opendir(struct inode *inode, struct file *filp)  	struct nfs_open_dir_context *ctx;  	struct rpc_cred *cred; -	dfprintk(FILE, "NFS: open dir(%s/%s)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name); +	dfprintk(FILE, "NFS: open dir(%pD2)\n", filp);  	nfs_inc_stats(inode, NFSIOS_VFSOPEN); @@ -128,7 +133,7 @@ out:  static int  nfs_closedir(struct inode *inode, struct file *filp)  { -	put_nfs_open_dir_context(filp->private_data); +	put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data);  	return 0;  } @@ -276,6 +281,15 @@ out_eof:  	return -EBADCOOKIE;  } +static bool +nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) +{ +	if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) +		return false; +	smp_rmb(); +	return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); +} +  static  int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)  { @@ -289,21 +303,19 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des  			struct nfs_open_dir_context *ctx = 
desc->file->private_data;  			new_pos = desc->current_index + i; -			if (ctx->attr_gencount != nfsi->attr_gencount -			    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { +			if (ctx->attr_gencount != nfsi->attr_gencount || +			    !nfs_readdir_inode_mapping_valid(nfsi)) {  				ctx->duped = 0;  				ctx->attr_gencount = nfsi->attr_gencount;  			} else if (new_pos < desc->ctx->pos) {  				if (ctx->duped > 0  				    && ctx->dup_cookie == *desc->dir_cookie) {  					if (printk_ratelimit()) { -						pr_notice("NFS: directory %s/%s contains a readdir loop." +						pr_notice("NFS: directory %pD2 contains a readdir loop."  								"Please contact your server vendor.  " -								"The file: %s has duplicate cookie %llu\n", -								desc->file->f_dentry->d_parent->d_name.name, -								desc->file->f_dentry->d_name.name, -								array->array[i].string.name, -								*desc->dir_cookie); +								"The file: %.*s has duplicate cookie %llu\n", +								desc->file, array->array[i].string.len, +								array->array[i].string.name, *desc->dir_cookie);  					}  					status = -ELOOP;  					goto out; @@ -431,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir)  	set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);  } +/* + * This function is mainly for use by nfs_getattr(). + * + * If this is an 'ls -l', we want to force use of readdirplus. + * Do this by checking if there is an active file descriptor + * and calling nfs_advise_use_readdirplus, then forcing a + * cache flush. 
+ */ +void nfs_force_use_readdirplus(struct inode *dir) +{ +	if (!list_empty(&NFS_I(dir)->open_files)) { +		nfs_advise_use_readdirplus(dir); +		nfs_zap_mapping(dir, dir->i_mapping); +	} +} +  static  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)  { @@ -809,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)  	goto out;  } +static bool nfs_dir_mapping_need_revalidate(struct inode *dir) +{ +	struct nfs_inode *nfsi = NFS_I(dir); + +	if (nfs_attribute_cache_expired(dir)) +		return true; +	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) +		return true; +	return false; +} +  /* The file offset position represents the dirent entry number.  A     last cookie cache takes care of the common case of reading the     whole directory. @@ -822,9 +861,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)  	struct nfs_open_dir_context *dir_ctx = file->private_data;  	int res = 0; -	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", -			dentry->d_parent->d_name.name, dentry->d_name.name, -			(long long)ctx->pos); +	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", +			file, (long long)ctx->pos);  	nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);  	/* @@ -842,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)  	desc->plus = nfs_use_readdirplus(inode, ctx) ? 
1 : 0;  	nfs_block_sillyrename(dentry); -	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) +	if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))  		res = nfs_revalidate_mapping(inode, file->f_mapping);  	if (res < 0)  		goto out; @@ -880,22 +918,17 @@ out:  	nfs_unblock_sillyrename(dentry);  	if (res > 0)  		res = 0; -	dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", -			dentry->d_parent->d_name.name, dentry->d_name.name, -			res); +	dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);  	return res;  }  static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)  { -	struct dentry *dentry = filp->f_path.dentry; -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file_inode(filp);  	struct nfs_open_dir_context *dir_ctx = filp->private_data; -	dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", -			dentry->d_parent->d_name.name, -			dentry->d_name.name, -			offset, whence); +	dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", +			filp, offset, whence);  	mutex_lock(&inode->i_mutex);  	switch (whence) { @@ -925,15 +958,12 @@ out:  static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,  			 int datasync)  { -	struct dentry *dentry = filp->f_path.dentry; -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file_inode(filp); -	dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", -			dentry->d_parent->d_name.name, dentry->d_name.name, -			datasync); +	dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);  	mutex_lock(&inode->i_mutex); -	nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); +	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);  	mutex_unlock(&inode->i_mutex);  	return 0;  } @@ -1073,9 +1103,8 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)  	}  	if (is_bad_inode(inode)) { -		dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", -				__func__, dentry->d_parent->d_name.name, -				dentry->d_name.name); +		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud 
inode\n", +				__func__, dentry);  		goto out_bad;  	} @@ -1125,9 +1154,8 @@ out_set_verifier:  	nfs_advise_use_readdirplus(dir);   out_valid_noent:  	dput(parent); -	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", -			__func__, dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", +			__func__, dentry);  	return 1;  out_zap_parent:  	nfs_zap_caches(dir); @@ -1139,7 +1167,13 @@ out_zap_parent:  	if (inode && S_ISDIR(inode->i_mode)) {  		/* Purge readdir caches. */  		nfs_zap_caches(inode); -		if (dentry->d_flags & DCACHE_DISCONNECTED) +		/* +		 * We can't d_drop the root of a disconnected tree: +		 * its d_hash is on the s_anon list and d_drop() would hide +		 * it from shrink_dcache_for_unmount(), leading to busy +		 * inodes on unmount and further oopses. +		 */ +		if (IS_ROOT(dentry))  			goto out_valid;  	}  	/* If we have submounts, don't unhash ! */ @@ -1147,18 +1181,16 @@ out_zap_parent:  		goto out_valid;  	dput(parent); -	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", -			__func__, dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", +			__func__, dentry);  	return 0;  out_error:  	nfs_free_fattr(fattr);  	nfs_free_fhandle(fhandle);  	nfs4_label_free(label);  	dput(parent); -	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", -			__func__, dentry->d_parent->d_name.name, -			dentry->d_name.name, error); +	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", +			__func__, dentry, error);  	return error;  } @@ -1182,16 +1214,14 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)  	 * eventually need to do something more here.  	 
*/  	if (!inode) { -		dfprintk(LOOKUPCACHE, "%s: %s/%s has negative inode\n", -				__func__, dentry->d_parent->d_name.name, -				dentry->d_name.name); +		dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n", +				__func__, dentry);  		return 1;  	}  	if (is_bad_inode(inode)) { -		dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", -				__func__, dentry->d_parent->d_name.name, -				dentry->d_name.name); +		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", +				__func__, dentry);  		return 0;  	} @@ -1206,9 +1236,8 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)   */  static int nfs_dentry_delete(const struct dentry *dentry)  { -	dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		dentry->d_flags); +	dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n", +		dentry, dentry->d_flags);  	/* Unhash any dentry with a stale inode */  	if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) @@ -1286,8 +1315,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in  	struct nfs4_label *label = NULL;  	int error; -	dfprintk(VFS, "NFS: lookup(%s/%s)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);  	nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);  	res = ERR_PTR(-ENAMETOOLONG); @@ -1381,7 +1409,7 @@ static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, i  static int do_open(struct inode *inode, struct file *filp)  { -	nfs_fscache_set_inode_cookie(inode, filp); +	nfs_fscache_open_file(inode, filp);  	return 0;  } @@ -1392,6 +1420,9 @@ static int nfs_finish_open(struct nfs_open_context *ctx,  {  	int err; +	if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) +		*opened |= FILE_CREATED; +  	err = finish_open(file, dentry, do_open, opened);  	if (err)  		goto out; @@ -1415,8 +1446,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,  	/* Expect a negative dentry */  	
BUG_ON(dentry->d_inode); -	dfprintk(VFS, "NFS: atomic_open(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry);  	err = nfs_check_flags(open_flags);  	if (err) @@ -1455,7 +1486,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,  	trace_nfs_atomic_open_enter(dir, ctx, open_flags);  	nfs_block_sillyrename(dentry->d_parent); -	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); +	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened);  	nfs_unblock_sillyrename(dentry->d_parent);  	if (IS_ERR(inode)) {  		err = PTR_ERR(inode); @@ -1605,8 +1636,8 @@ int nfs_create(struct inode *dir, struct dentry *dentry,  	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;  	int error; -	dfprintk(VFS, "NFS: create(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry);  	attr.ia_mode = mode;  	attr.ia_valid = ATTR_MODE; @@ -1632,8 +1663,8 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)  	struct iattr attr;  	int status; -	dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry);  	if (!new_valid_dev(rdev))  		return -EINVAL; @@ -1661,8 +1692,8 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	struct iattr attr;  	int error; -	dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry);  	attr.ia_valid = ATTR_MODE;  	attr.ia_mode = mode | S_IFDIR; @@ -1689,8 +1720,8 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)  {  	int error; -	dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, 
dentry->d_name.name); +	dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry);  	trace_nfs_rmdir_enter(dir, dentry);  	if (dentry->d_inode) { @@ -1725,8 +1756,7 @@ static int nfs_safe_remove(struct dentry *dentry)  	struct inode *inode = dentry->d_inode;  	int error = -EBUSY; -	dfprintk(VFS, "NFS: safe_remove(%s/%s)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry);  	/* If the dentry was sillyrenamed, we simply call d_delete() */  	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { @@ -1759,8 +1789,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)  	int error;  	int need_rehash = 0; -	dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, -		dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, +		dir->i_ino, dentry);  	trace_nfs_unlink_enter(dir, dentry);  	spin_lock(&dentry->d_lock); @@ -1810,8 +1840,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)  	unsigned int pathlen = strlen(symname);  	int error; -	dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, -		dir->i_ino, dentry->d_name.name, symname); +	dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, +		dir->i_ino, dentry, symname);  	if (pathlen > PAGE_SIZE)  		return -ENAMETOOLONG; @@ -1833,9 +1863,9 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)  	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);  	trace_nfs_symlink_exit(dir, dentry, error);  	if (error != 0) { -		dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", +		dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",  			dir->i_sb->s_id, dir->i_ino, -			dentry->d_name.name, symname, error); +			dentry, symname, error);  		d_drop(dentry);  		__free_page(page);  		return error; @@ -1849,6 +1879,11 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)  							
GFP_KERNEL)) {  		SetPageUptodate(page);  		unlock_page(page); +		/* +		 * add_to_page_cache_lru() grabs an extra page refcount. +		 * Drop it here to avoid leaking this page later. +		 */ +		page_cache_release(page);  	} else  		__free_page(page); @@ -1862,9 +1897,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)  	struct inode *inode = old_dentry->d_inode;  	int error; -	dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n", -		old_dentry->d_parent->d_name.name, old_dentry->d_name.name, -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", +		old_dentry, dentry);  	trace_nfs_link_enter(inode, dir, dentry);  	NFS_PROTO(inode)->return_delegation(inode); @@ -1910,11 +1944,11 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	struct inode *old_inode = old_dentry->d_inode;  	struct inode *new_inode = new_dentry->d_inode;  	struct dentry *dentry = NULL, *rehash = NULL; +	struct rpc_task *task;  	int error = -EBUSY; -	dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", -		 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, -		 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, +	dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", +		 old_dentry, new_dentry,  		 d_count(new_dentry));  	trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry); @@ -1958,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	if (new_inode != NULL)  		NFS_PROTO(new_inode)->return_delegation(new_inode); -	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, -					   new_dir, &new_dentry->d_name); +	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); +	if (IS_ERR(task)) { +		error = PTR_ERR(task); +		goto out; +	} + +	error = rpc_wait_for_completion_task(task); +	if (error == 0) +		error = task->tk_status; +	rpc_put_task(task);  	nfs_mark_for_revalidate(old_inode);  out:  	if (rehash) @@ -1990,9 +2032,9 @@ static void 
nfs_access_free_entry(struct nfs_access_entry *entry)  {  	put_rpccred(entry->cred);  	kfree(entry); -	smp_mb__before_atomic_dec(); +	smp_mb__before_atomic();  	atomic_long_dec(&nfs_access_nr_entries); -	smp_mb__after_atomic_dec(); +	smp_mb__after_atomic();  }  static void nfs_access_free_list(struct list_head *head) @@ -2040,9 +2082,9 @@ nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)  		else {  remove_lru_entry:  			list_del_init(&nfsi->access_cache_inode_lru); -			smp_mb__before_clear_bit(); +			smp_mb__before_atomic();  			clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); -			smp_mb__after_clear_bit(); +			smp_mb__after_atomic();  		}  		spin_unlock(&inode->i_lock);  	} @@ -2190,9 +2232,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)  	nfs_access_add_rbtree(inode, cache);  	/* Update accounting */ -	smp_mb__before_atomic_inc(); +	smp_mb__before_atomic();  	atomic_long_inc(&nfs_access_nr_entries); -	smp_mb__after_atomic_inc(); +	smp_mb__after_atomic();  	/* Add inode to global LRU list */  	if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { @@ -2318,7 +2360,7 @@ out:  	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))  		res = -EACCES; -	dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", +	dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",  		inode->i_sb->s_id, inode->i_ino, mask, res);  	return res;  out_notsup: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 91ff089d341..f11b9eed0de 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -108,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq)  	return atomic_dec_and_test(&dreq->io_count);  } +/* + * nfs_direct_select_verf - select the right verifier + * @dreq - direct request possibly spanning multiple servers + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs + * @ds_idx - index of data server in data server list, only valid if ds_clp set + * + * returns the correct verifier to use 
given the role of the server + */ +static struct nfs_writeverf * +nfs_direct_select_verf(struct nfs_direct_req *dreq, +		       struct nfs_client *ds_clp, +		       int ds_idx) +{ +	struct nfs_writeverf *verfp = &dreq->verf; + +#ifdef CONFIG_NFS_V4_1 +	if (ds_clp) { +		/* pNFS is in use, use the DS verf */ +		if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) +			verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; +		else +			WARN_ON_ONCE(1); +	} +#endif +	return verfp; +} + + +/* + * nfs_direct_set_hdr_verf - set the write/commit verifier + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verfs + * + * Set the server's (MDS or DS) "seen" verifier + */ +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, +				    struct nfs_pgio_header *hdr) +{ +	struct nfs_writeverf *verfp; + +	verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, +				      hdr->data->ds_idx); +	WARN_ON_ONCE(verfp->committed >= 0); +	memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +	WARN_ON_ONCE(verfp->committed < 0); +} + +/* + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verf + * + * set the server's "seen" verf if not initialized. 
+ * returns result of comparison between @hdr->verf and the "seen" + * verf of the server used by @hdr (DS or MDS) + */ +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, +					  struct nfs_pgio_header *hdr) +{ +	struct nfs_writeverf *verfp; + +	verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, +					 hdr->data->ds_idx); +	if (verfp->committed < 0) { +		nfs_direct_set_hdr_verf(dreq, hdr); +		return 0; +	} +	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +} + +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data + * @dreq - direct request possibly spanning multiple servers + * @data - commit data to validate against previously seen verf + * + * returns result of comparison between @data->verf and the verf of + * the server used by @data (DS or MDS) + */ +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, +					   struct nfs_commit_data *data) +{ +	struct nfs_writeverf *verfp; + +	verfp = nfs_direct_select_verf(dreq, data->ds_clp, +					 data->ds_commit_index); +	WARN_ON_ONCE(verfp->committed < 0); +	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); +} +#endif +  /**   * nfs_direct_IO - NFS address space operation for direct I/O   * @rw: direction (read or write) @@ -121,21 +212,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq)   * shunt off direct read and write requests before the VFS gets them,   * so this method is only ever called for swap.   
*/ -ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)  {  #ifndef CONFIG_NFS_SWAP -	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", -			iocb->ki_filp->f_path.dentry->d_name.name, -			(long long) pos, nr_segs); +	dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", +			iocb->ki_filp, (long long) pos, iter->nr_segs);  	return -EINVAL;  #else  	VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);  	if (rw == READ || rw == KERNEL_READ) -		return nfs_file_direct_read(iocb, iov, nr_segs, pos, +		return nfs_file_direct_read(iocb, iter, pos,  				rw == READ ? true : false); -	return nfs_file_direct_write(iocb, iov, nr_segs, pos, +	return nfs_file_direct_write(iocb, iter, pos,  				rw == WRITE ? true : false);  #endif /* CONFIG_NFS_SWAP */  } @@ -169,6 +259,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)  	kref_get(&dreq->kref);  	init_completion(&dreq->completion);  	INIT_LIST_HEAD(&dreq->mds_cinfo.list); +	dreq->verf.committed = NFS_INVALID_STABLE_HOW;	/* not set yet */  	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);  	spin_lock_init(&dreq->lock); @@ -223,14 +314,31 @@ out:   * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust   * the iocb is still valid here if this is a synchronous request.   
*/ -static void nfs_direct_complete(struct nfs_direct_req *dreq) +static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)  { +	struct inode *inode = dreq->inode; + +	if (dreq->iocb && write) { +		loff_t pos = dreq->iocb->ki_pos + dreq->count; + +		spin_lock(&inode->i_lock); +		if (i_size_read(inode) < pos) +			i_size_write(inode, pos); +		spin_unlock(&inode->i_lock); +	} + +	if (write) +		nfs_zap_mapping(inode, inode->i_mapping); + +	inode_dio_done(inode); +  	if (dreq->iocb) {  		long res = (long) dreq->error;  		if (!res)  			res = (long) dreq->count;  		aio_complete(dreq->iocb, res, 0);  	} +  	complete_all(&dreq->completion);  	nfs_direct_req_release(dreq); @@ -238,9 +346,9 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)  static void nfs_direct_readpage_release(struct nfs_page *req)  { -	dprintk("NFS: direct read done (%s/%lld %d@%lld)\n", +	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",  		req->wb_context->dentry->d_inode->i_sb->s_id, -		(long long)NFS_FILEID(req->wb_context->dentry->d_inode), +		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),  		req->wb_bytes,  		(long long)req_offset(req));  	nfs_release_request(req); @@ -273,7 +381,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)  	}  out_put:  	if (put_dreq(dreq)) -		nfs_direct_complete(dreq); +		nfs_direct_complete(dreq, false);  	hdr->release(hdr);  } @@ -306,66 +414,42 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {   * handled automatically by nfs_direct_read_result().  Otherwise, if   * no requests have been sent, just return an error.   
*/ -static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, -						const struct iovec *iov, -						loff_t pos, bool uio) -{ -	struct nfs_direct_req *dreq = desc->pg_dreq; -	struct nfs_open_context *ctx = dreq->ctx; -	struct inode *inode = ctx->dentry->d_inode; -	unsigned long user_addr = (unsigned long)iov->iov_base; -	size_t count = iov->iov_len; -	size_t rsize = NFS_SERVER(inode)->rsize; -	unsigned int pgbase; -	int result; -	ssize_t started = 0; -	struct page **pagevec = NULL; -	unsigned int npages; - -	do { -		size_t bytes; -		int i; -		pgbase = user_addr & ~PAGE_MASK; -		bytes = min(max_t(size_t, rsize, PAGE_SIZE), count); +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, +					      struct iov_iter *iter, +					      loff_t pos) +{ +	struct nfs_pageio_descriptor desc; +	struct inode *inode = dreq->inode; +	ssize_t result = -EINVAL; +	size_t requested_bytes = 0; +	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); -		result = -ENOMEM; -		npages = nfs_page_array_len(pgbase, bytes); -		if (!pagevec) -			pagevec = kmalloc(npages * sizeof(struct page *), -					  GFP_KERNEL); -		if (!pagevec) -			break; -		if (uio) { -			down_read(&current->mm->mmap_sem); -			result = get_user_pages(current, current->mm, user_addr, -					npages, 1, 0, pagevec, NULL); -			up_read(&current->mm->mmap_sem); -			if (result < 0) -				break; -		} else { -			WARN_ON(npages != 1); -			result = get_kernel_page(user_addr, 1, pagevec); -			if (WARN_ON(result != 1)) -				break; -		} +	nfs_pageio_init_read(&desc, dreq->inode, false, +			     &nfs_direct_read_completion_ops); +	get_dreq(dreq); +	desc.pg_dreq = dreq; +	atomic_inc(&inode->i_dio_count); -		if ((unsigned)result < npages) { -			bytes = result * PAGE_SIZE; -			if (bytes <= pgbase) { -				nfs_direct_release_pages(pagevec, result); -				break; -			} -			bytes -= pgbase; -			npages = result; -		} +	while (iov_iter_count(iter)) { +		struct page **pagevec; +		size_t bytes; +		size_t 
pgbase; +		unsigned npages, i; +		result = iov_iter_get_pages_alloc(iter, &pagevec,  +						  rsize, &pgbase); +		if (result < 0) +			break; +	 +		bytes = result; +		iov_iter_advance(iter, bytes); +		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;  		for (i = 0; i < npages; i++) {  			struct nfs_page *req;  			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);  			/* XXX do we need to do the eof zeroing found in async_filler? */ -			req = nfs_create_request(dreq->ctx, dreq->inode, -						 pagevec[i], +			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,  						 pgbase, req_len);  			if (IS_ERR(req)) {  				result = PTR_ERR(req); @@ -373,54 +457,21 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de  			}  			req->wb_index = pos >> PAGE_SHIFT;  			req->wb_offset = pos & ~PAGE_MASK; -			if (!nfs_pageio_add_request(desc, req)) { -				result = desc->pg_error; +			if (!nfs_pageio_add_request(&desc, req)) { +				result = desc.pg_error;  				nfs_release_request(req);  				break;  			}  			pgbase = 0;  			bytes -= req_len; -			started += req_len; -			user_addr += req_len; +			requested_bytes += req_len;  			pos += req_len; -			count -= req_len;  			dreq->bytes_left -= req_len;  		} -		/* The nfs_page now hold references to these pages */  		nfs_direct_release_pages(pagevec, npages); -	} while (count != 0 && result >= 0); - -	kfree(pagevec); - -	if (started) -		return started; -	return result < 0 ? 
(ssize_t) result : -EFAULT; -} - -static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, -					      const struct iovec *iov, -					      unsigned long nr_segs, -					      loff_t pos, bool uio) -{ -	struct nfs_pageio_descriptor desc; -	ssize_t result = -EINVAL; -	size_t requested_bytes = 0; -	unsigned long seg; - -	NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode, -			     &nfs_direct_read_completion_ops); -	get_dreq(dreq); -	desc.pg_dreq = dreq; - -	for (seg = 0; seg < nr_segs; seg++) { -		const struct iovec *vec = &iov[seg]; -		result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); +		kvfree(pagevec);  		if (result < 0)  			break; -		requested_bytes += result; -		if ((size_t)result < vec->iov_len) -			break; -		pos += vec->iov_len;  	}  	nfs_pageio_complete(&desc); @@ -430,29 +481,69 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,  	 * generic layer handle the completion.  	 */  	if (requested_bytes == 0) { +		inode_dio_done(inode);  		nfs_direct_req_release(dreq);  		return result < 0 ? result : -EIO;  	}  	if (put_dreq(dreq)) -		nfs_direct_complete(dreq); +		nfs_direct_complete(dreq, false);  	return 0;  } -static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, -			       unsigned long nr_segs, loff_t pos, bool uio) +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iter: vector of user buffers into which to read data + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file.  For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change.  
Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, +				loff_t pos, bool uio)  { -	ssize_t result = -ENOMEM; -	struct inode *inode = iocb->ki_filp->f_mapping->host; +	struct file *file = iocb->ki_filp; +	struct address_space *mapping = file->f_mapping; +	struct inode *inode = mapping->host;  	struct nfs_direct_req *dreq;  	struct nfs_lock_context *l_ctx; +	ssize_t result = -EINVAL; +	size_t count = iov_iter_count(iter); +	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + +	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", +		file, count, (long long) pos); + +	result = 0; +	if (!count) +		goto out; + +	mutex_lock(&inode->i_mutex); +	result = nfs_sync_mapping(mapping); +	if (result) +		goto out_unlock; +	task_io_account_read(count); + +	result = -ENOMEM;  	dreq = nfs_direct_req_alloc();  	if (dreq == NULL) -		goto out; +		goto out_unlock;  	dreq->inode = inode; -	dreq->bytes_left = iov_length(iov, nr_segs); +	dreq->bytes_left = count;  	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));  	l_ctx = nfs_get_lock_context(dreq->ctx);  	if (IS_ERR(l_ctx)) { @@ -463,22 +554,28 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,  	if (!is_sync_kiocb(iocb))  		dreq->iocb = iocb; -	NFS_I(inode)->read_io += iov_length(iov, nr_segs); -	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); -	if (!result) +	NFS_I(inode)->read_io += count; +	result = nfs_direct_read_schedule_iovec(dreq, iter, pos); + +	mutex_unlock(&inode->i_mutex); + +	if (!result) {  		result = nfs_direct_wait(dreq); +		if (result > 0) +			iocb->ki_pos = 
pos + result; +	} + +	nfs_direct_req_release(dreq); +	return result; +  out_release:  	nfs_direct_req_release(dreq); +out_unlock: +	mutex_unlock(&inode->i_mutex);  out:  	return result;  } -static void nfs_inode_dio_write_done(struct inode *inode) -{ -	nfs_zap_mapping(inode, inode->i_mapping); -	inode_dio_done(inode); -} -  #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)  static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)  { @@ -497,7 +594,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)  	dreq->count = 0;  	get_dreq(dreq); -	NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE, +	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,  			      &nfs_direct_write_completion_ops);  	desc.pg_dreq = dreq; @@ -536,7 +633,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)  		dprintk("NFS: %5u commit failed with error %d.\n",  			data->task.tk_pid, status);  		dreq->flags = NFS_ODIRECT_RESCHED_WRITES; -	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { +	} else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {  		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);  		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;  	} @@ -594,8 +691,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)  			nfs_direct_write_reschedule(dreq);  			break;  		default: -			nfs_inode_dio_write_done(dreq->inode); -			nfs_direct_complete(dreq); +			nfs_direct_complete(dreq, true);  	}  } @@ -611,114 +707,10 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)  static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)  { -	nfs_inode_dio_write_done(inode); -	nfs_direct_complete(dreq); +	nfs_direct_complete(dreq, true);  }  #endif -/* - * NB: Return the value of the first error return code.  Subsequent - *     errors after the first one are ignored. 
- */ -/* - * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE - * operation.  If nfs_writedata_alloc() or get_user_pages() fails, - * bail and stop sending more writes.  Write length accounting is - * handled automatically by nfs_direct_write_result().  Otherwise, if - * no requests have been sent, just return an error. - */ -static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, -						 const struct iovec *iov, -						 loff_t pos, bool uio) -{ -	struct nfs_direct_req *dreq = desc->pg_dreq; -	struct nfs_open_context *ctx = dreq->ctx; -	struct inode *inode = ctx->dentry->d_inode; -	unsigned long user_addr = (unsigned long)iov->iov_base; -	size_t count = iov->iov_len; -	size_t wsize = NFS_SERVER(inode)->wsize; -	unsigned int pgbase; -	int result; -	ssize_t started = 0; -	struct page **pagevec = NULL; -	unsigned int npages; - -	do { -		size_t bytes; -		int i; - -		pgbase = user_addr & ~PAGE_MASK; -		bytes = min(max_t(size_t, wsize, PAGE_SIZE), count); - -		result = -ENOMEM; -		npages = nfs_page_array_len(pgbase, bytes); -		if (!pagevec) -			pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL); -		if (!pagevec) -			break; - -		if (uio) { -			down_read(&current->mm->mmap_sem); -			result = get_user_pages(current, current->mm, user_addr, -						npages, 0, 0, pagevec, NULL); -			up_read(&current->mm->mmap_sem); -			if (result < 0) -				break; -		} else { -			WARN_ON(npages != 1); -			result = get_kernel_page(user_addr, 0, pagevec); -			if (WARN_ON(result != 1)) -				break; -		} - -		if ((unsigned)result < npages) { -			bytes = result * PAGE_SIZE; -			if (bytes <= pgbase) { -				nfs_direct_release_pages(pagevec, result); -				break; -			} -			bytes -= pgbase; -			npages = result; -		} - -		for (i = 0; i < npages; i++) { -			struct nfs_page *req; -			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - -			req = nfs_create_request(dreq->ctx, dreq->inode, -						 pagevec[i], -						 pgbase, req_len); -			if 
(IS_ERR(req)) { -				result = PTR_ERR(req); -				break; -			} -			nfs_lock_request(req); -			req->wb_index = pos >> PAGE_SHIFT; -			req->wb_offset = pos & ~PAGE_MASK; -			if (!nfs_pageio_add_request(desc, req)) { -				result = desc->pg_error; -				nfs_unlock_and_release_request(req); -				break; -			} -			pgbase = 0; -			bytes -= req_len; -			started += req_len; -			user_addr += req_len; -			pos += req_len; -			count -= req_len; -			dreq->bytes_left -= req_len; -		} -		/* The nfs_page now hold references to these pages */ -		nfs_direct_release_pages(pagevec, npages); -	} while (count != 0 && result >= 0); - -	kfree(pagevec); - -	if (started) -		return started; -	return result < 0 ? (ssize_t) result : -EFAULT; -} -  static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)  {  	struct nfs_direct_req *dreq = hdr->dreq; @@ -748,13 +740,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)  			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)  				bit = NFS_IOHDR_NEED_RESCHED;  			else if (dreq->flags == 0) { -				memcpy(&dreq->verf, hdr->verf, -				       sizeof(dreq->verf)); +				nfs_direct_set_hdr_verf(dreq, hdr);  				bit = NFS_IOHDR_NEED_COMMIT;  				dreq->flags = NFS_ODIRECT_DO_COMMIT;  			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { -				if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) { -					dreq->flags = NFS_ODIRECT_RESCHED_WRITES; +				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { +					dreq->flags = +						NFS_ODIRECT_RESCHED_WRITES;  					bit = NFS_IOHDR_NEED_RESCHED;  				} else  					bit = NFS_IOHDR_NEED_COMMIT; @@ -764,6 +756,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)  	spin_unlock(&dreq->lock);  	while (!list_empty(&hdr->pages)) { +  		req = nfs_list_entry(hdr->pages.next);  		nfs_list_remove_request(req);  		switch (bit) { @@ -798,33 +791,77 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {  	.completion = nfs_direct_write_completion,  }; + +/* 
+ * NB: Return the value of the first error return code.  Subsequent + *     errors after the first one are ignored. + */ +/* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation.  If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes.  Write length accounting is + * handled automatically by nfs_direct_write_result().  Otherwise, if + * no requests have been sent, just return an error. + */  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, -					       const struct iovec *iov, -					       unsigned long nr_segs, -					       loff_t pos, bool uio) +					       struct iov_iter *iter, +					       loff_t pos)  {  	struct nfs_pageio_descriptor desc;  	struct inode *inode = dreq->inode;  	ssize_t result = 0;  	size_t requested_bytes = 0; -	unsigned long seg; +	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); -	NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE, +	nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,  			      &nfs_direct_write_completion_ops);  	desc.pg_dreq = dreq;  	get_dreq(dreq);  	atomic_inc(&inode->i_dio_count); -	NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs); -	for (seg = 0; seg < nr_segs; seg++) { -		const struct iovec *vec = &iov[seg]; -		result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); +	NFS_I(inode)->write_io += iov_iter_count(iter); +	while (iov_iter_count(iter)) { +		struct page **pagevec; +		size_t bytes; +		size_t pgbase; +		unsigned npages, i; + +		result = iov_iter_get_pages_alloc(iter, &pagevec,  +						  wsize, &pgbase);  		if (result < 0)  			break; -		requested_bytes += result; -		if ((size_t)result < vec->iov_len) + +		bytes = result; +		iov_iter_advance(iter, bytes); +		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; +		for (i = 0; i < npages; i++) { +			struct nfs_page *req; +			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + +			req = 
nfs_create_request(dreq->ctx, pagevec[i], NULL, +						 pgbase, req_len); +			if (IS_ERR(req)) { +				result = PTR_ERR(req); +				break; +			} +			nfs_lock_request(req); +			req->wb_index = pos >> PAGE_SHIFT; +			req->wb_offset = pos & ~PAGE_MASK; +			if (!nfs_pageio_add_request(&desc, req)) { +				result = desc.pg_error; +				nfs_unlock_and_release_request(req); +				break; +			} +			pgbase = 0; +			bytes -= req_len; +			requested_bytes += req_len; +			pos += req_len; +			dreq->bytes_left -= req_len; +		} +		nfs_direct_release_pages(pagevec, npages); +		kvfree(pagevec); +		if (result < 0)  			break; -		pos += vec->iov_len;  	}  	nfs_pageio_complete(&desc); @@ -843,100 +880,10 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,  	return 0;  } -static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, -				unsigned long nr_segs, loff_t pos, -				size_t count, bool uio) -{ -	ssize_t result = -ENOMEM; -	struct inode *inode = iocb->ki_filp->f_mapping->host; -	struct nfs_direct_req *dreq; -	struct nfs_lock_context *l_ctx; - -	dreq = nfs_direct_req_alloc(); -	if (!dreq) -		goto out; - -	dreq->inode = inode; -	dreq->bytes_left = count; -	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); -	l_ctx = nfs_get_lock_context(dreq->ctx); -	if (IS_ERR(l_ctx)) { -		result = PTR_ERR(l_ctx); -		goto out_release; -	} -	dreq->l_ctx = l_ctx; -	if (!is_sync_kiocb(iocb)) -		dreq->iocb = iocb; - -	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); -	if (!result) -		result = nfs_direct_wait(dreq); -out_release: -	nfs_direct_req_release(dreq); -out: -	return result; -} - -/** - * nfs_file_direct_read - file direct read operation for NFS files - * @iocb: target I/O control block - * @iov: vector of user buffers into which to read data - * @nr_segs: size of iov vector - * @pos: byte offset in file where reading starts - * - * We use this function for direct reads instead of calling - * 
generic_file_aio_read() in order to avoid gfar's check to see if - * the request starts before the end of the file.  For that check - * to work, we must generate a GETATTR before each direct read, and - * even then there is a window between the GETATTR and the subsequent - * READ where the file size could change.  Our preference is simply - * to do all reads the application wants, and the server will take - * care of managing the end of file boundary. - * - * This function also eliminates unnecessarily updating the file's - * atime locally, as the NFS server sets the file's atime, and this - * client must read the updated atime from the server back into its - * cache. - */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, -				unsigned long nr_segs, loff_t pos, bool uio) -{ -	ssize_t retval = -EINVAL; -	struct file *file = iocb->ki_filp; -	struct address_space *mapping = file->f_mapping; -	size_t count; - -	count = iov_length(iov, nr_segs); -	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - -	dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, -		count, (long long) pos); - -	retval = 0; -	if (!count) -		goto out; - -	retval = nfs_sync_mapping(mapping); -	if (retval) -		goto out; - -	task_io_account_read(count); - -	retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); -	if (retval > 0) -		iocb->ki_pos = pos + retval; - -out: -	return retval; -} -  /**   * nfs_file_direct_write - file direct write operation for NFS files   * @iocb: target I/O control block - * @iov: vector of user buffers from which to write data - * @nr_segs: size of iov vector + * @iter: vector of user buffers from which to write data   * @pos: byte offset in file where writing starts   *   * We use this function for direct writes instead of calling @@ -954,51 +901,97 @@ out:   * Note that O_APPEND is not supported for NFS direct writes, as there   * is no atomic O_APPEND 
write facility in the NFS protocol.   */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, -				unsigned long nr_segs, loff_t pos, bool uio) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, +				loff_t pos, bool uio)  { -	ssize_t retval = -EINVAL; +	ssize_t result = -EINVAL;  	struct file *file = iocb->ki_filp;  	struct address_space *mapping = file->f_mapping; -	size_t count; +	struct inode *inode = mapping->host; +	struct nfs_direct_req *dreq; +	struct nfs_lock_context *l_ctx; +	loff_t end; +	size_t count = iov_iter_count(iter); +	end = (pos + count - 1) >> PAGE_CACHE_SHIFT; -	count = iov_length(iov, nr_segs);  	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); -	dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, -		count, (long long) pos); +	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", +		file, count, (long long) pos); -	retval = generic_write_checks(file, &pos, &count, 0); -	if (retval) +	result = generic_write_checks(file, &pos, &count, 0); +	if (result)  		goto out; -	retval = -EINVAL; +	result = -EINVAL;  	if ((ssize_t) count < 0)  		goto out; -	retval = 0; +	result = 0;  	if (!count)  		goto out; -	retval = nfs_sync_mapping(mapping); -	if (retval) -		goto out; +	mutex_lock(&inode->i_mutex); + +	result = nfs_sync_mapping(mapping); +	if (result) +		goto out_unlock; + +	if (mapping->nrpages) { +		result = invalidate_inode_pages2_range(mapping, +					pos >> PAGE_CACHE_SHIFT, end); +		if (result) +			goto out_unlock; +	}  	task_io_account_write(count); -	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); -	if (retval > 0) { -		struct inode *inode = mapping->host; +	result = -ENOMEM; +	dreq = nfs_direct_req_alloc(); +	if (!dreq) +		goto out_unlock; -		iocb->ki_pos = pos + retval; -		spin_lock(&inode->i_lock); -		if (i_size_read(inode) < iocb->ki_pos) -			i_size_write(inode, iocb->ki_pos); -		
spin_unlock(&inode->i_lock); +	dreq->inode = inode; +	dreq->bytes_left = count; +	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); +	l_ctx = nfs_get_lock_context(dreq->ctx); +	if (IS_ERR(l_ctx)) { +		result = PTR_ERR(l_ctx); +		goto out_release; +	} +	dreq->l_ctx = l_ctx; +	if (!is_sync_kiocb(iocb)) +		dreq->iocb = iocb; + +	result = nfs_direct_write_schedule_iovec(dreq, iter, pos); + +	if (mapping->nrpages) { +		invalidate_inode_pages2_range(mapping, +					      pos >> PAGE_CACHE_SHIFT, end);  	} + +	mutex_unlock(&inode->i_mutex); + +	if (!result) { +		result = nfs_direct_wait(dreq); +		if (result > 0) { +			struct inode *inode = mapping->host; + +			iocb->ki_pos = pos + result; +			spin_lock(&inode->i_lock); +			if (i_size_read(inode) < iocb->ki_pos) +				i_size_write(inode, iocb->ki_pos); +			spin_unlock(&inode->i_lock); +		} +	} +	nfs_direct_req_release(dreq); +	return result; + +out_release: +	nfs_direct_req_release(dreq); +out_unlock: +	mutex_unlock(&inode->i_mutex);  out: -	return retval; +	return result;  }  /** diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index fc0f95ec735..d25f10fb492 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -46,7 +46,9 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,  #include <linux/sunrpc/cache.h>  #include <linux/sunrpc/svcauth.h>  #include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/nfs_fs.h> +#include "nfs4_fs.h"  #include "dns_resolve.h"  #include "cache_lib.h"  #include "netns.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1e6bfdbc1af..4042ff58fe3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -65,9 +65,7 @@ nfs_file_open(struct inode *inode, struct file *filp)  {  	int res; -	dprintk("NFS: open file(%s/%s)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name); +	dprintk("NFS: open file(%pD2)\n", filp);  	nfs_inc_stats(inode, NFSIOS_VFSOPEN);  	res = nfs_check_flags(filp->f_flags); @@ -81,9 +79,7 @@ 
nfs_file_open(struct inode *inode, struct file *filp)  int  nfs_file_release(struct inode *inode, struct file *filp)  { -	dprintk("NFS: release(%s/%s)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name); +	dprintk("NFS: release(%pD2)\n", filp);  	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);  	return nfs_release(inode, filp); @@ -123,10 +119,8 @@ force_reval:  loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)  { -	dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name, -			offset, whence); +	dprintk("NFS: llseek file(%pD2, %lld, %d)\n", +			filp, offset, whence);  	/*  	 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate @@ -150,12 +144,9 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);  int  nfs_file_flush(struct file *file, fl_owner_t id)  { -	struct dentry	*dentry = file->f_path.dentry; -	struct inode	*inode = dentry->d_inode; +	struct inode	*inode = file_inode(file); -	dprintk("NFS: flush(%s/%s)\n", -			dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dprintk("NFS: flush(%pD2)\n", file);  	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);  	if ((file->f_mode & FMODE_WRITE) == 0) @@ -174,23 +165,21 @@ nfs_file_flush(struct file *file, fl_owner_t id)  EXPORT_SYMBOL_GPL(nfs_file_flush);  ssize_t -nfs_file_read(struct kiocb *iocb, const struct iovec *iov, -		unsigned long nr_segs, loff_t pos) +nfs_file_read(struct kiocb *iocb, struct iov_iter *to)  { -	struct dentry * dentry = iocb->ki_filp->f_path.dentry; -	struct inode * inode = dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);  	ssize_t result;  	if (iocb->ki_filp->f_flags & O_DIRECT) -		return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); +		return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); -	dprintk("NFS: read(%s/%s, %lu@%lu)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		(unsigned long) iov_length(iov, nr_segs), (unsigned long) 
pos); +	dprintk("NFS: read(%pD2, %zu@%lu)\n", +		iocb->ki_filp, +		iov_iter_count(to), (unsigned long) iocb->ki_pos);  	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);  	if (!result) { -		result = generic_file_aio_read(iocb, iov, nr_segs, pos); +		result = generic_file_read_iter(iocb, to);  		if (result > 0)  			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);  	} @@ -203,13 +192,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,  		     struct pipe_inode_info *pipe, size_t count,  		     unsigned int flags)  { -	struct dentry *dentry = filp->f_path.dentry; -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file_inode(filp);  	ssize_t res; -	dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		(unsigned long) count, (unsigned long long) *ppos); +	dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", +		filp, (unsigned long) count, (unsigned long long) *ppos);  	res = nfs_revalidate_mapping(inode, filp->f_mapping);  	if (!res) { @@ -224,12 +211,10 @@ EXPORT_SYMBOL_GPL(nfs_file_splice_read);  int  nfs_file_mmap(struct file * file, struct vm_area_struct * vma)  { -	struct dentry *dentry = file->f_path.dentry; -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file_inode(file);  	int	status; -	dprintk("NFS: mmap(%s/%s)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dprintk("NFS: mmap(%pD2)\n", file);  	/* Note: generic_file_mmap() returns ENOSYS on nommu systems  	 *       so we call that before revalidating the mapping @@ -258,15 +243,12 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);  int  nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)  { -	struct dentry *dentry = file->f_path.dentry;  	struct nfs_open_context *ctx = nfs_file_open_context(file); -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file_inode(file);  	int have_error, do_resend, status;  	int ret = 0; -	dprintk("NFS: fsync file(%s/%s) datasync %d\n", -		
	dentry->d_parent->d_name.name, dentry->d_name.name, -			datasync); +	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); @@ -371,10 +353,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,  	struct page *page;  	int once_thru = 0; -	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, -		mapping->host->i_ino, len, (long long) pos); +	dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", +		file, mapping->host->i_ino, len, (long long) pos);  start:  	/* @@ -414,10 +394,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,  	struct nfs_open_context *ctx = nfs_file_open_context(file);  	int status; -	dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, -		mapping->host->i_ino, len, (long long) pos); +	dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", +		file, mapping->host->i_ino, len, (long long) pos);  	/*  	 * Zero any uninitialised parts of the page, and then mark the page @@ -601,22 +579,21 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct page *page = vmf->page;  	struct file *filp = vma->vm_file; -	struct dentry *dentry = filp->f_path.dentry; +	struct inode *inode = file_inode(filp);  	unsigned pagelen;  	int ret = VM_FAULT_NOPAGE;  	struct address_space *mapping; -	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		filp->f_mapping->host->i_ino, +	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", +		filp, filp->f_mapping->host->i_ino,  		(long long)page_offset(page));  	/* make sure the cache has finished storing the page */ -	
nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); +	nfs_fscache_wait_on_page_write(NFS_I(inode), page);  	lock_page(page);  	mapping = page_file_mapping(page); -	if (mapping != dentry->d_inode->i_mapping) +	if (mapping != inode->i_mapping)  		goto out_unlock;  	wait_on_page_writeback(page); @@ -639,6 +616,7 @@ out:  static const struct vm_operations_struct nfs_file_vm_ops = {  	.fault = filemap_fault, +	.map_pages = filemap_map_pages,  	.page_mkwrite = nfs_vm_page_mkwrite,  	.remap_pages = generic_file_remap_pages,  }; @@ -656,25 +634,24 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)  	return 0;  } -ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, -		       unsigned long nr_segs, loff_t pos) +ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)  { -	struct dentry * dentry = iocb->ki_filp->f_path.dentry; -	struct inode * inode = dentry->d_inode; +	struct file *file = iocb->ki_filp; +	struct inode *inode = file_inode(file);  	unsigned long written = 0;  	ssize_t result; -	size_t count = iov_length(iov, nr_segs); +	size_t count = iov_iter_count(from); +	loff_t pos = iocb->ki_pos; -	result = nfs_key_timeout_notify(iocb->ki_filp, inode); +	result = nfs_key_timeout_notify(file, inode);  	if (result)  		return result; -	if (iocb->ki_filp->f_flags & O_DIRECT) -		return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); +	if (file->f_flags & O_DIRECT) +		return nfs_file_direct_write(iocb, from, pos, true); -	dprintk("NFS: write(%s/%s, %lu@%Ld)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		(unsigned long) count, (long long) pos); +	dprintk("NFS: write(%pD2, %zu@%Ld)\n", +		file, count, (long long) pos);  	result = -EBUSY;  	if (IS_SWAPFILE(inode)) @@ -682,8 +659,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,  	/*  	 * O_APPEND implies that we must revalidate the file length.  	 
*/ -	if (iocb->ki_filp->f_flags & O_APPEND) { -		result = nfs_revalidate_file_size(inode, iocb->ki_filp); +	if (file->f_flags & O_APPEND) { +		result = nfs_revalidate_file_size(inode, file);  		if (result)  			goto out;  	} @@ -692,13 +669,13 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,  	if (!count)  		goto out; -	result = generic_file_aio_write(iocb, iov, nr_segs, pos); +	result = generic_file_write_iter(iocb, from);  	if (result > 0)  		written = result;  	/* Return error values for O_DSYNC and IS_SYNC() */ -	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { -		int err = vfs_fsync(iocb->ki_filp, 0); +	if (result >= 0 && nfs_need_sync_write(file, inode)) { +		int err = vfs_fsync(file, 0);  		if (err < 0)  			result = err;  	} @@ -713,38 +690,6 @@ out_swapfile:  }  EXPORT_SYMBOL_GPL(nfs_file_write); -ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, -			      struct file *filp, loff_t *ppos, -			      size_t count, unsigned int flags) -{ -	struct dentry *dentry = filp->f_path.dentry; -	struct inode *inode = dentry->d_inode; -	unsigned long written = 0; -	ssize_t ret; - -	dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		(unsigned long) count, (unsigned long long) *ppos); - -	/* -	 * The combination of splice and an O_APPEND destination is disallowed. 
-	 */ - -	ret = generic_file_splice_write(pipe, filp, ppos, count, flags); -	if (ret > 0) -		written = ret; - -	if (ret >= 0 && nfs_need_sync_write(filp, inode)) { -		int err = vfs_fsync(filp, 0); -		if (err < 0) -			ret = err; -	} -	if (ret > 0) -		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); -	return ret; -} -EXPORT_SYMBOL_GPL(nfs_file_splice_write); -  static int  do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)  { @@ -883,10 +828,8 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)  	int ret = -ENOLCK;  	int is_local = 0; -	dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name, -			fl->fl_type, fl->fl_flags, +	dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n", +			filp, fl->fl_type, fl->fl_flags,  			(long long)fl->fl_start, (long long)fl->fl_end);  	nfs_inc_stats(inode, NFSIOS_VFSLOCK); @@ -923,10 +866,8 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)  	struct inode *inode = filp->f_mapping->host;  	int is_local = 0; -	dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name, -			fl->fl_type, fl->fl_flags); +	dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n", +			filp, fl->fl_type, fl->fl_flags);  	if (!(fl->fl_flags & FL_FLOCK))  		return -ENOLCK; @@ -944,10 +885,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)  		is_local = 1;  	/* We're simulating flock() locks using posix locks on the server */ -	fl->fl_owner = (fl_owner_t)filp; -	fl->fl_start = 0; -	fl->fl_end = OFFSET_MAX; -  	if (fl->fl_type == F_UNLCK)  		return do_unlk(filp, cmd, fl, is_local);  	return do_setlk(filp, cmd, fl, is_local); @@ -960,19 +897,17 @@ EXPORT_SYMBOL_GPL(nfs_flock);   */  int nfs_setlease(struct file *file, long arg, struct file_lock **fl)  { -	dprintk("NFS: setlease(%s/%s, arg=%ld)\n", -			file->f_path.dentry->d_parent->d_name.name, -			
file->f_path.dentry->d_name.name, arg); +	dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg);  	return -EINVAL;  }  EXPORT_SYMBOL_GPL(nfs_setlease);  const struct file_operations nfs_file_operations = {  	.llseek		= nfs_file_llseek, -	.read		= do_sync_read, -	.write		= do_sync_write, -	.aio_read	= nfs_file_read, -	.aio_write	= nfs_file_write, +	.read		= new_sync_read, +	.write		= new_sync_write, +	.read_iter	= nfs_file_read, +	.write_iter	= nfs_file_write,  	.mmap		= nfs_file_mmap,  	.open		= nfs_file_open,  	.flush		= nfs_file_flush, @@ -981,7 +916,7 @@ const struct file_operations nfs_file_operations = {  	.lock		= nfs_lock,  	.flock		= nfs_flock,  	.splice_read	= nfs_file_splice_read, -	.splice_write	= nfs_file_splice_write, +	.splice_write	= iter_file_splice_write,  	.check_flags	= nfs_check_flags,  	.setlease	= nfs_setlease,  }; diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile new file mode 100644 index 00000000000..8516cdffb9e --- /dev/null +++ b/fs/nfs/filelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Files Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/filelayout/filelayout.c index b86464ba25e..d2eba1c13b7 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -35,11 +35,11 @@  #include <linux/sunrpc/metrics.h> -#include "nfs4session.h" -#include "internal.h" -#include "delegation.h" -#include "nfs4filelayout.h" -#include "nfs4trace.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "filelayout.h" +#include "../nfs4trace.h"  #define NFSDBG_FACILITY         NFSDBG_PNFS_LD @@ -84,17 +84,17 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)  	BUG();  } -static void filelayout_reset_write(struct nfs_write_data *data) +static void filelayout_reset_write(struct nfs_pgio_data *data)  { 
 	struct nfs_pgio_header *hdr = data->header;  	struct rpc_task *task = &data->task;  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {  		dprintk("%s Reset task %5u for i/o through MDS " -			"(req %s/%lld, %u bytes @ offset %llu)\n", __func__, +			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,  			data->task.tk_pid,  			hdr->inode->i_sb->s_id, -			(long long)NFS_FILEID(hdr->inode), +			(unsigned long long)NFS_FILEID(hdr->inode),  			data->args.count,  			(unsigned long long)data->args.offset); @@ -105,17 +105,17 @@ static void filelayout_reset_write(struct nfs_write_data *data)  	}  } -static void filelayout_reset_read(struct nfs_read_data *data) +static void filelayout_reset_read(struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header;  	struct rpc_task *task = &data->task;  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {  		dprintk("%s Reset task %5u for i/o through MDS " -			"(req %s/%lld, %u bytes @ offset %llu)\n", __func__, +			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,  			data->task.tk_pid,  			hdr->inode->i_sb->s_id, -			(long long)NFS_FILEID(hdr->inode), +			(unsigned long long)NFS_FILEID(hdr->inode),  			data->args.count,  			(unsigned long long)data->args.offset); @@ -243,7 +243,7 @@ wait_on_recovery:  /* NFS_PROTO call done callback routines */  static int filelayout_read_done_cb(struct rpc_task *task, -				struct nfs_read_data *data) +				struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header;  	int err; @@ -270,7 +270,7 @@ static int filelayout_read_done_cb(struct rpc_task *task,   * rfc5661 is not clear about which credential should be used.   
*/  static void -filelayout_set_layoutcommit(struct nfs_write_data *wdata) +filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)  {  	struct nfs_pgio_header *hdr = wdata->header; @@ -279,7 +279,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)  		return;  	pnfs_set_layoutcommit(wdata); -	dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, +	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,  		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);  } @@ -305,7 +305,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)   */  static void filelayout_read_prepare(struct rpc_task *task, void *data)  { -	struct nfs_read_data *rdata = data; +	struct nfs_pgio_data *rdata = data;  	if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {  		rpc_exit(task, -EIO); @@ -317,26 +317,29 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)  		rpc_exit(task, 0);  		return;  	} -	rdata->read_done_cb = filelayout_read_done_cb; +	rdata->pgio_done_cb = filelayout_read_done_cb;  	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,  			&rdata->args.seq_args,  			&rdata->res.seq_res,  			task))  		return; -	nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, -			rdata->args.lock_context, FMODE_READ); +	if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, +			rdata->args.lock_context, FMODE_READ) == -EIO) +		rpc_exit(task, -EIO); /* lost lock, terminate I/O */  }  static void filelayout_read_call_done(struct rpc_task *task, void *data)  { -	struct nfs_read_data *rdata = data; +	struct nfs_pgio_data *rdata = data;  	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);  	if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && -	    task->tk_status == 0) +	    task->tk_status == 0) { +		nfs41_sequence_done(task, &rdata->res.seq_res);  		return; +	}  	/* Note this may cause RPC to be resent */  	rdata->header->mds_ops->rpc_call_done(task, data); @@ -344,14 
+347,14 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)  static void filelayout_read_count_stats(struct rpc_task *task, void *data)  { -	struct nfs_read_data *rdata = data; +	struct nfs_pgio_data *rdata = data;  	rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);  }  static void filelayout_read_release(void *data)  { -	struct nfs_read_data *rdata = data; +	struct nfs_pgio_data *rdata = data;  	struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;  	filelayout_fenceme(lo->plh_inode, lo); @@ -360,7 +363,7 @@ static void filelayout_read_release(void *data)  }  static int filelayout_write_done_cb(struct rpc_task *task, -				struct nfs_write_data *data) +				struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header;  	int err; @@ -416,7 +419,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,  static void filelayout_write_prepare(struct rpc_task *task, void *data)  { -	struct nfs_write_data *wdata = data; +	struct nfs_pgio_data *wdata = data;  	if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {  		rpc_exit(task, -EIO); @@ -433,17 +436,20 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)  			&wdata->res.seq_res,  			task))  		return; -	nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, -			wdata->args.lock_context, FMODE_WRITE); +	if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, +			wdata->args.lock_context, FMODE_WRITE) == -EIO) +		rpc_exit(task, -EIO); /* lost lock, terminate I/O */  }  static void filelayout_write_call_done(struct rpc_task *task, void *data)  { -	struct nfs_write_data *wdata = data; +	struct nfs_pgio_data *wdata = data;  	if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && -	    task->tk_status == 0) +	    task->tk_status == 0) { +		nfs41_sequence_done(task, &wdata->res.seq_res);  		return; +	}  	/* Note this may cause RPC to be resent */  	
wdata->header->mds_ops->rpc_call_done(task, data); @@ -451,14 +457,14 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)  static void filelayout_write_count_stats(struct rpc_task *task, void *data)  { -	struct nfs_write_data *wdata = data; +	struct nfs_pgio_data *wdata = data;  	rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);  }  static void filelayout_write_release(void *data)  { -	struct nfs_write_data *wdata = data; +	struct nfs_pgio_data *wdata = data;  	struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;  	filelayout_fenceme(lo->plh_inode, lo); @@ -523,7 +529,7 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {  };  static enum pnfs_try_status -filelayout_read_pagelist(struct nfs_read_data *data) +filelayout_read_pagelist(struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header;  	struct pnfs_layout_segment *lseg = hdr->lseg; @@ -554,6 +560,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)  	/* No multipath support. Use first DS */  	atomic_inc(&ds->ds_clp->cl_count);  	data->ds_clp = ds->ds_clp; +	data->ds_idx = idx;  	fh = nfs4_fl_select_ds_fh(lseg, j);  	if (fh)  		data->args.fh = fh; @@ -562,14 +569,14 @@ filelayout_read_pagelist(struct nfs_read_data *data)  	data->mds_offset = offset;  	/* Perform an asynchronous read to ds */ -	nfs_initiate_read(ds_clnt, data, -				  &filelayout_read_call_ops, RPC_TASK_SOFTCONN); +	nfs_initiate_pgio(ds_clnt, data, +			    &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);  	return PNFS_ATTEMPTED;  }  /* Perform async writes. 
*/  static enum pnfs_try_status -filelayout_write_pagelist(struct nfs_write_data *data, int sync) +filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)  {  	struct nfs_pgio_header *hdr = data->header;  	struct pnfs_layout_segment *lseg = hdr->lseg; @@ -594,20 +601,18 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)  		__func__, hdr->inode->i_ino, sync, (size_t) data->args.count,  		offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); -	data->write_done_cb = filelayout_write_done_cb; +	data->pgio_done_cb = filelayout_write_done_cb;  	atomic_inc(&ds->ds_clp->cl_count);  	data->ds_clp = ds->ds_clp; +	data->ds_idx = idx;  	fh = nfs4_fl_select_ds_fh(lseg, j);  	if (fh)  		data->args.fh = fh; -	/* -	 * Get the file offset on the dserver. Set the write offset to -	 * this offset and save the original offset. -	 */ +  	data->args.offset = filelayout_get_dserver_offset(lseg, offset);  	/* Perform an asynchronous write */ -	nfs_initiate_write(ds_clnt, data, +	nfs_initiate_pgio(ds_clnt, data,  				    &filelayout_write_call_ops, sync,  				    RPC_TASK_SOFTCONN);  	return PNFS_ATTEMPTED; @@ -631,7 +636,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,  	struct nfs4_deviceid_node *d;  	struct nfs4_file_layout_dsaddr *dsaddr;  	int status = -EINVAL; -	struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);  	dprintk("--> %s\n", __func__); @@ -649,7 +653,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,  		goto out;  	} -	if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { +	if (!fl->stripe_unit) {  		dprintk("%s Invalid stripe unit (%u)\n",  			__func__, fl->stripe_unit);  		goto out; @@ -686,12 +690,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,  		goto out_put;  	} -	if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { -		dprintk("%s Stripe unit (%u) not aligned with rsize %u " -			"wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, -			nfss->wsize); -	} -  	status = 0;  out:  	dprintk("--> %s 
returns %d\n", __func__, status); @@ -844,11 +842,15 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,  {  	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);  	struct pnfs_commit_bucket *buckets; -	int size; +	int size, i;  	if (fl->commit_through_mds)  		return 0; -	if (cinfo->ds->nbuckets != 0) { + +	size = (fl->stripe_type == STRIPE_SPARSE) ? +		fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + +	if (cinfo->ds->nbuckets >= size) {  		/* This assumes there is only one IOMODE_RW lseg.  What  		 * we really want to do is have a layout_hdr level  		 * dictionary of <multipath_list4, fh> keys, each @@ -858,30 +860,36 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,  		return 0;  	} -	size = (fl->stripe_type == STRIPE_SPARSE) ? -		fl->dsaddr->ds_num : fl->dsaddr->stripe_count; -  	buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),  			  gfp_flags);  	if (!buckets)  		return -ENOMEM; -	else { -		int i; +	for (i = 0; i < size; i++) { +		INIT_LIST_HEAD(&buckets[i].written); +		INIT_LIST_HEAD(&buckets[i].committing); +		/* mark direct verifier as unset */ +		buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +	} -		spin_lock(cinfo->lock); -		if (cinfo->ds->nbuckets != 0) -			kfree(buckets); -		else { -			cinfo->ds->buckets = buckets; -			cinfo->ds->nbuckets = size; -			for (i = 0; i < size; i++) { -				INIT_LIST_HEAD(&buckets[i].written); -				INIT_LIST_HEAD(&buckets[i].committing); -			} -		} -		spin_unlock(cinfo->lock); -		return 0; +	spin_lock(cinfo->lock); +	if (cinfo->ds->nbuckets >= size) +		goto out; +	for (i = 0; i < cinfo->ds->nbuckets; i++) { +		list_splice(&cinfo->ds->buckets[i].written, +			    &buckets[i].written); +		list_splice(&cinfo->ds->buckets[i].committing, +			    &buckets[i].committing); +		buckets[i].direct_verf.committed = +			cinfo->ds->buckets[i].direct_verf.committed; +		buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; +		buckets[i].clseg = cinfo->ds->buckets[i].clseg;  	} +	
swap(cinfo->ds->buckets, buckets); +	cinfo->ds->nbuckets = size; +out: +	spin_unlock(cinfo->lock); +	kfree(buckets); +	return 0;  }  static struct pnfs_layout_segment * @@ -909,47 +917,51 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,  /*   * filelayout_pg_test(). Called by nfs_can_coalesce_requests()   * - * return true  : coalesce page - * return false : don't coalesce page + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced.   */ -static bool +static size_t  filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,  		   struct nfs_page *req)  { +	unsigned int size;  	u64 p_stripe, r_stripe; -	u32 stripe_unit; +	u32 stripe_offset; +	u64 segment_offset = pgio->pg_lseg->pls_range.offset; +	u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; -	if (!pnfs_generic_pg_test(pgio, prev, req) || -	    !nfs_generic_pg_test(pgio, prev, req)) -		return false; +	/* calls nfs_generic_pg_test */ +	size = pnfs_generic_pg_test(pgio, prev, req); +	if (!size) +		return 0; -	p_stripe = (u64)req_offset(prev); -	r_stripe = (u64)req_offset(req); -	stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; +	/* see if req and prev are in the same stripe */ +	if (prev) { +		p_stripe = (u64)req_offset(prev) - segment_offset; +		r_stripe = (u64)req_offset(req) - segment_offset; +		do_div(p_stripe, stripe_unit); +		do_div(r_stripe, stripe_unit); -	do_div(p_stripe, stripe_unit); -	do_div(r_stripe, stripe_unit); +		if (p_stripe != r_stripe) +			return 0; +	} -	return (p_stripe == r_stripe); +	/* calculate remaining bytes in the current stripe */ +	div_u64_rem((u64)req_offset(req) - segment_offset, +			stripe_unit, +			&stripe_offset); +	WARN_ON_ONCE(stripe_offset > stripe_unit); +	if (stripe_offset >= stripe_unit) +		return 0; +	return min(stripe_unit - (unsigned int)stripe_offset, size);  }  static void  filelayout_pg_init_read(struct nfs_pageio_descriptor 
*pgio,  			struct nfs_page *req)  { -	WARN_ON_ONCE(pgio->pg_lseg != NULL); - -	if (req->wb_offset != req->wb_pgbase) { -		/* -		 * Handling unaligned pages is difficult, because have to -		 * somehow split a req in two in certain cases in the -		 * pg.test code.  Avoid this by just not using pnfs -		 * in this case. -		 */ -		nfs_pageio_reset_read_mds(pgio); -		return; -	} -	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +	if (!pgio->pg_lseg) +		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,  					   req->wb_context,  					   0,  					   NFS4_MAX_UINT64, @@ -967,11 +979,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,  	struct nfs_commit_info cinfo;  	int status; -	WARN_ON_ONCE(pgio->pg_lseg != NULL); - -	if (req->wb_offset != req->wb_pgbase) -		goto out_mds; -	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +	if (!pgio->pg_lseg) +		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,  					   req->wb_context,  					   0,  					   NFS4_MAX_UINT64, @@ -1061,6 +1070,7 @@ filelayout_choose_commit_list(struct nfs_page *req,  	 */  	j = nfs4_fl_calc_j_index(lseg, req_offset(req));  	i = select_bucket_index(fl, j); +	spin_lock(cinfo->lock);  	buckets = cinfo->ds->buckets;  	list = &buckets[i].written;  	if (list_empty(list)) { @@ -1074,6 +1084,7 @@ filelayout_choose_commit_list(struct nfs_page *req,  	}  	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);  	cinfo->ds->nwritten++; +	spin_unlock(cinfo->lock);  	return list;  } @@ -1170,6 +1181,7 @@ transfer_commit_list(struct list_head *src, struct list_head *dst,  	return ret;  } +/* Note called with cinfo->lock held. 
*/  static int  filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,  			       struct nfs_commit_info *cinfo, @@ -1214,19 +1226,22 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,  					   struct nfs_commit_info *cinfo)  {  	struct pnfs_commit_bucket *b; +	struct pnfs_layout_segment *freeme;  	int i; -	/* NOTE cinfo->lock is NOT held, relying on fact that this is -	 * only called on single thread per dreq. -	 * Can't take the lock because need to do pnfs_put_lseg -	 */ +restart: +	spin_lock(cinfo->lock);  	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {  		if (transfer_commit_list(&b->written, dst, cinfo, 0)) { -			pnfs_put_lseg(b->wlseg); +			freeme = b->wlseg;  			b->wlseg = NULL; +			spin_unlock(cinfo->lock); +			pnfs_put_lseg(freeme); +			goto restart;  		}  	}  	cinfo->ds->nwritten = 0; +	spin_unlock(cinfo->lock);  }  static unsigned int @@ -1237,6 +1252,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)  	struct nfs_commit_data *data;  	int i, j;  	unsigned int nreq = 0; +	struct pnfs_layout_segment *freeme;  	fl_cinfo = cinfo->ds;  	bucket = fl_cinfo->buckets; @@ -1247,8 +1263,10 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)  		if (!data)  			break;  		data->ds_commit_index = i; +		spin_lock(cinfo->lock);  		data->lseg = bucket->clseg;  		bucket->clseg = NULL; +		spin_unlock(cinfo->lock);  		list_add(&data->pages, list);  		nreq++;  	} @@ -1258,8 +1276,11 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)  		if (list_empty(&bucket->committing))  			continue;  		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); -		pnfs_put_lseg(bucket->clseg); +		spin_lock(cinfo->lock); +		freeme = bucket->clseg;  		bucket->clseg = NULL; +		spin_unlock(cinfo->lock); +		pnfs_put_lseg(freeme);  	}  	/* Caller will clean up entries put on list */  	return nreq; @@ -1324,7 +1345,7 @@ filelayout_alloc_layout_hdr(struct inode *inode, 
gfp_t gfp_flags)  	struct nfs4_filelayout *flo;  	flo = kzalloc(sizeof(*flo), gfp_flags); -	return &flo->generic_hdr; +	return flo != NULL ? &flo->generic_hdr : NULL;  }  static void diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/filelayout/filelayout.h index cebd20e7e92..ffbddf2219e 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -30,7 +30,7 @@  #ifndef FS_NFS_NFS4FILELAYOUT_H  #define FS_NFS_NFS4FILELAYOUT_H -#include "pnfs.h" +#include "../pnfs.h"  /*   * Default data server connection timeout and retrans vaules. diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 95604f64cab..44bf0140a4c 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -33,9 +33,9 @@  #include <linux/module.h>  #include <linux/sunrpc/addr.h> -#include "internal.h" -#include "nfs4session.h" -#include "nfs4filelayout.h" +#include "../internal.h" +#include "../nfs4session.h" +#include "filelayout.h"  #define NFSDBG_FACILITY		NFSDBG_PNFS_LD @@ -95,7 +95,7 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)  		b6 = (struct sockaddr_in6 *)addr2;  		/* LINKLOCAL addresses must have matching scope_id */ -		if (ipv6_addr_scope(&a6->sin6_addr) == +		if (ipv6_addr_src_scope(&a6->sin6_addr) ==  		    IPV6_ADDR_SCOPE_LINKLOCAL &&  		    a6->sin6_scope_id != b6->sin6_scope_id)  			return false; @@ -185,6 +185,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)  	if (status)  		goto out_put; +	smp_wmb();  	ds->ds_clp = clp;  	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);  out: @@ -788,9 +789,9 @@ static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)  static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)  { -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(NFS4DS_CONNECTING, &ds->ds_state); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);  } @@ -801,34 +802,35 @@ nfs4_fl_prepare_ds(struct 
pnfs_layout_segment *lseg, u32 ds_idx)  	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;  	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];  	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); - -	if (filelayout_test_devid_unavailable(devid)) -		return NULL; +	struct nfs4_pnfs_ds *ret = ds;  	if (ds == NULL) {  		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",  			__func__, ds_idx);  		filelayout_mark_devid_invalid(devid); -		return NULL; +		goto out;  	} +	smp_rmb();  	if (ds->ds_clp) -		return ds; +		goto out_test_devid;  	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {  		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);  		int err;  		err = nfs4_ds_connect(s, ds); -		if (err) { +		if (err)  			nfs4_mark_deviceid_unavailable(devid); -			ds = NULL; -		}  		nfs4_clear_ds_conn_bit(ds);  	} else {  		/* Either ds is connected, or ds is NULL */  		nfs4_wait_ds_connect(ds);  	} -	return ds; +out_test_devid: +	if (filelayout_test_devid_unavailable(devid)) +		ret = NULL; +out: +	return ret;  }  module_param(dataserver_retrans, uint, 0644); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 24d1d1c5fca..3ef01f0ba0b 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)  	/* create a cache index for looking up filehandles */  	clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,  					      &nfs_fscache_server_index_def, -					      clp); +					      clp, true);  	dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",  		 clp, clp->fscache);  } @@ -139,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int  	/* create a cache index for looking up filehandles */  	nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,  					       &nfs_fscache_super_index_def, -					       nfss); +					       nfss, true);  	dfprintk(FSCACHE, "NFS: get 
superblock cookie (0x%p/0x%p)\n",  		 nfss, nfss->fscache);  	return; @@ -178,163 +178,79 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)  /*   * Initialise the per-inode cache cookie pointer for an NFS inode.   */ -void nfs_fscache_init_inode_cookie(struct inode *inode) +void nfs_fscache_init_inode(struct inode *inode)  { -	NFS_I(inode)->fscache = NULL; -	if (S_ISREG(inode->i_mode)) -		set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); -} - -/* - * Get the per-inode cache cookie for an NFS inode. - */ -static void nfs_fscache_enable_inode_cookie(struct inode *inode) -{ -	struct super_block *sb = inode->i_sb;  	struct nfs_inode *nfsi = NFS_I(inode); -	if (nfsi->fscache || !NFS_FSCACHE(inode)) +	nfsi->fscache = NULL; +	if (!S_ISREG(inode->i_mode))  		return; - -	if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { -		nfsi->fscache = fscache_acquire_cookie( -			NFS_SB(sb)->fscache, -			&nfs_fscache_inode_object_def, -			nfsi); - -		dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", -			 sb, nfsi, nfsi->fscache); -	} +	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, +					       &nfs_fscache_inode_object_def, +					       nfsi, false);  }  /*   * Release a per-inode cookie.   */ -void nfs_fscache_release_inode_cookie(struct inode *inode) +void nfs_fscache_clear_inode(struct inode *inode)  {  	struct nfs_inode *nfsi = NFS_I(inode); +	struct fscache_cookie *cookie = nfs_i_fscache(inode); -	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", -		 nfsi, nfsi->fscache); +	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); -	fscache_relinquish_cookie(nfsi->fscache, 0); +	fscache_relinquish_cookie(cookie, false);  	nfsi->fscache = NULL;  } -/* - * Retire a per-inode cookie, destroying the data attached to it. 
- */ -void nfs_fscache_zap_inode_cookie(struct inode *inode) +static bool nfs_fscache_can_enable(void *data)  { -	struct nfs_inode *nfsi = NFS_I(inode); +	struct inode *inode = data; -	dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", -		 nfsi, nfsi->fscache); - -	fscache_relinquish_cookie(nfsi->fscache, 1); -	nfsi->fscache = NULL; +	return !inode_is_open_for_write(inode);  }  /* - * Turn off the cache with regard to a per-inode cookie if opened for writing, - * invalidating all the pages in the page cache relating to the associated - * inode to clear the per-page caching. - */ -static void nfs_fscache_disable_inode_cookie(struct inode *inode) -{ -	clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); - -	if (NFS_I(inode)->fscache) { -		dfprintk(FSCACHE, -			 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); - -		/* Need to uncache any pages attached to this inode that -		 * fscache knows about before turning off the cache. -		 */ -		fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); -		nfs_fscache_zap_inode_cookie(inode); -	} -} - -/* - * wait_on_bit() sleep function for uninterruptible waiting - */ -static int nfs_fscache_wait_bit(void *flags) -{ -	schedule(); -	return 0; -} - -/* - * Lock against someone else trying to also acquire or relinquish a cookie - */ -static inline void nfs_fscache_inode_lock(struct inode *inode) -{ -	struct nfs_inode *nfsi = NFS_I(inode); - -	while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) -		wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, -			    nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); -} - -/* - * Unlock cookie management lock - */ -static inline void nfs_fscache_inode_unlock(struct inode *inode) -{ -	struct nfs_inode *nfsi = NFS_I(inode); - -	smp_mb__before_clear_bit(); -	clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); -	smp_mb__after_clear_bit(); -	wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); -} - -/* - * Decide if we should enable or disable local caching for this inode. 
- * - For now, with NFS, only regular files that are open read-only will be able - *   to use the cache. - * - May be invoked multiple times in parallel by parallel nfs_open() functions. - */ -void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) -{ -	if (NFS_FSCACHE(inode)) { -		nfs_fscache_inode_lock(inode); -		if ((filp->f_flags & O_ACCMODE) != O_RDONLY) -			nfs_fscache_disable_inode_cookie(inode); -		else -			nfs_fscache_enable_inode_cookie(inode); -		nfs_fscache_inode_unlock(inode); -	} -} -EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie); - -/* - * Replace a per-inode cookie due to revalidation detecting a file having - * changed on the server. + * Enable or disable caching for a file that is being opened as appropriate. + * The cookie is allocated when the inode is initialised, but is not enabled at + * that time.  Enablement is deferred to file-open time to avoid stat() and + * access() thrashing the cache. + * + * For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * + * We enable the cache for an inode if we open it read-only and it isn't + * currently open for writing.  We disable the cache if the inode is open + * write-only. + * + * The caller uses the file struct to pin i_writecount on the inode before + * calling us when a file is opened for writing, so we can make use of that. + * + * Note that this may be invoked multiple times in parallel by parallel + * nfs_open() functions.   
*/ -void nfs_fscache_reset_inode_cookie(struct inode *inode) +void nfs_fscache_open_file(struct inode *inode, struct file *filp)  {  	struct nfs_inode *nfsi = NFS_I(inode); -	struct nfs_server *nfss = NFS_SERVER(inode); -	NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache); +	struct fscache_cookie *cookie = nfs_i_fscache(inode); -	nfs_fscache_inode_lock(inode); -	if (nfsi->fscache) { -		/* retire the current fscache cache and get a new one */ -		fscache_relinquish_cookie(nfsi->fscache, 1); - -		nfsi->fscache = fscache_acquire_cookie( -			nfss->nfs_client->fscache, -			&nfs_fscache_inode_object_def, -			nfsi); +	if (!fscache_cookie_valid(cookie)) +		return; -		dfprintk(FSCACHE, -			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", -			 nfss, nfsi, old, nfsi->fscache); +	if (inode_is_open_for_write(inode)) { +		dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); +		clear_bit(NFS_INO_FSCACHE, &nfsi->flags); +		fscache_disable_cookie(cookie, true); +		fscache_uncache_all_inode_pages(cookie, inode); +	} else { +		dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); +		fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); +		if (fscache_cookie_enabled(cookie)) +			set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);  	} -	nfs_fscache_inode_unlock(inode);  } +EXPORT_SYMBOL_GPL(nfs_fscache_open_file);  /*   * Release the caching state associated with a page, if the page isn't busy @@ -344,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)  int nfs_fscache_release_page(struct page *page, gfp_t gfp)  {  	if (PageFsCache(page)) { -		struct nfs_inode *nfsi = NFS_I(page->mapping->host); -		struct fscache_cookie *cookie = nfsi->fscache; +		struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host);  		BUG_ON(!cookie);  		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", -			 cookie, page, nfsi); +			 cookie, page, NFS_I(page->mapping->host));  		if (!fscache_maybe_release_page(cookie, page, gfp))  			return 
0; @@ -367,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)   */  void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)  { -	struct nfs_inode *nfsi = NFS_I(inode); -	struct fscache_cookie *cookie = nfsi->fscache; +	struct fscache_cookie *cookie = nfs_i_fscache(inode);  	BUG_ON(!cookie);  	dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", -		 cookie, page, nfsi); +		 cookie, page, NFS_I(inode));  	fscache_wait_on_page_write(cookie, page); @@ -417,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,  	dfprintk(FSCACHE,  		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", -		 NFS_I(inode)->fscache, page, page->index, page->flags, inode); +		 nfs_i_fscache(inode), page, page->index, page->flags, inode); -	ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, +	ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),  					 page,  					 nfs_readpage_from_fscache_complete,  					 ctx, @@ -459,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,  	int ret;  	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", -		 NFS_I(inode)->fscache, npages, inode); +		 nfs_i_fscache(inode), npages, inode); -	ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, +	ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),  					  mapping, pages, nr_pages,  					  nfs_readpage_from_fscache_complete,  					  ctx, @@ -506,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)  	dfprintk(FSCACHE,  		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", -		 NFS_I(inode)->fscache, page, page->index, page->flags, sync); +		 nfs_i_fscache(inode), page, page->index, page->flags, sync); -	ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); +	ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);  	dfprintk(FSCACHE,  		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",  		 page, 
page->index, page->flags, ret);  	if (ret != 0) { -		fscache_uncache_page(NFS_I(inode)->fscache, page); +		fscache_uncache_page(nfs_i_fscache(inode), page);  		nfs_add_fscache_stats(inode,  				      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);  		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 4ecb76652eb..d7fe3e799f2 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -76,11 +76,9 @@ extern void nfs_fscache_release_client_cookie(struct nfs_client *);  extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);  extern void nfs_fscache_release_super_cookie(struct super_block *); -extern void nfs_fscache_init_inode_cookie(struct inode *); -extern void nfs_fscache_release_inode_cookie(struct inode *); -extern void nfs_fscache_zap_inode_cookie(struct inode *); -extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); -extern void nfs_fscache_reset_inode_cookie(struct inode *); +extern void nfs_fscache_init_inode(struct inode *); +extern void nfs_fscache_clear_inode(struct inode *); +extern void nfs_fscache_open_file(struct inode *, struct file *);  extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);  extern int nfs_fscache_release_page(struct page *, gfp_t); @@ -187,12 +185,10 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}  static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} -static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_set_inode_cookie(struct inode *inode, -						struct file *filp) {} -static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_init_inode(struct inode *inode) {} +static inline void 
nfs_fscache_clear_inode(struct inode *inode) {} +static inline void nfs_fscache_open_file(struct inode *inode, +					 struct file *filp) {}  static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)  { diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 66984a9aafa..b94f80420a5 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -120,7 +120,8 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,  	security_d_instantiate(ret, inode);  	spin_lock(&ret->d_lock); -	if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { +	if (IS_ROOT(ret) && !ret->d_fsdata && +	    !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {  		ret->d_fsdata = name;  		name = NULL;  	} diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index eda8879171c..9927913c97c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -122,13 +122,13 @@ void nfs_clear_inode(struct inode *inode)  	WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));  	nfs_zap_acl_cache(inode);  	nfs_access_zap_cache(inode); -	nfs_fscache_release_inode_cookie(inode); +	nfs_fscache_clear_inode(inode);  }  EXPORT_SYMBOL_GPL(nfs_clear_inode);  void nfs_evict_inode(struct inode *inode)  { -	truncate_inode_pages(&inode->i_data, 0); +	truncate_inode_pages_final(&inode->i_data);  	clear_inode(inode);  	nfs_clear_inode(inode);  } @@ -147,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping)  	return ret;  } +static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +{ +	struct nfs_inode *nfsi = NFS_I(inode); + +	if (inode->i_mapping->nrpages == 0) +		flags &= ~NFS_INO_INVALID_DATA; +	nfsi->cache_validity |= flags; +	if (flags & NFS_INO_INVALID_DATA) +		nfs_fscache_invalidate(inode); +} +  /*   * Invalidate the local caches   */ @@ -162,19 +173,17 @@ static void nfs_zap_caches_locked(struct inode *inode)  	memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));  	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { -		nfs_fscache_invalidate(inode); -		
nfsi->cache_validity |= NFS_INO_INVALID_ATTR -					| NFS_INO_INVALID_LABEL +		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR  					| NFS_INO_INVALID_DATA  					| NFS_INO_INVALID_ACCESS  					| NFS_INO_INVALID_ACL -					| NFS_INO_REVAL_PAGECACHE; +					| NFS_INO_REVAL_PAGECACHE);  	} else -		nfsi->cache_validity |= NFS_INO_INVALID_ATTR -					| NFS_INO_INVALID_LABEL +		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR  					| NFS_INO_INVALID_ACCESS  					| NFS_INO_INVALID_ACL -					| NFS_INO_REVAL_PAGECACHE; +					| NFS_INO_REVAL_PAGECACHE); +	nfs_zap_label_cache_locked(nfsi);  }  void nfs_zap_caches(struct inode *inode) @@ -188,8 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)  {  	if (mapping->nrpages != 0) {  		spin_lock(&inode->i_lock); -		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; -		nfs_fscache_invalidate(inode); +		nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);  		spin_unlock(&inode->i_lock);  	}  } @@ -210,7 +218,7 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);  void nfs_invalidate_atime(struct inode *inode)  {  	spin_lock(&inode->i_lock); -	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; +	nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);  	spin_unlock(&inode->i_lock);  }  EXPORT_SYMBOL_GPL(nfs_invalidate_atime); @@ -266,6 +274,13 @@ nfs_init_locked(struct inode *inode, void *opaque)  }  #ifdef CONFIG_NFS_V4_SECURITY_LABEL +static void nfs_clear_label_invalid(struct inode *inode) +{ +	spin_lock(&inode->i_lock); +	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; +	spin_unlock(&inode->i_lock); +} +  void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,  					struct nfs4_label *label)  { @@ -274,12 +289,6 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,  	if (label == NULL)  		return; -	if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0) -		return; - -	if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2) -		return; -  	if ((fattr->valid & 
NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {  		error = security_inode_notifysecctx(inode, label->label,  				label->len); @@ -289,6 +298,7 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,  					__func__,  					(char *)label->label,  					label->len, error); +		nfs_clear_label_invalid(inode);  	}  } @@ -318,7 +328,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)  }  EXPORT_SYMBOL_GPL(nfs4_label_alloc);  #else -void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,  					struct nfs4_label *label)  {  } @@ -368,7 +378,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st  		inode->i_mode = fattr->mode;  		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0  				&& nfs_server_capable(inode, NFS_CAP_MODE)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		/* Why so? Because we want revalidate for devices/FIFOs, and  		 * that's precisely what we have in nfs_file_inode_operations.  		 
*/ @@ -414,36 +424,36 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st  		if (fattr->valid & NFS_ATTR_FATTR_ATIME)  			inode->i_atime = fattr->atime;  		else if (nfs_server_capable(inode, NFS_CAP_ATIME)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_MTIME)  			inode->i_mtime = fattr->mtime;  		else if (nfs_server_capable(inode, NFS_CAP_MTIME)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_CTIME)  			inode->i_ctime = fattr->ctime;  		else if (nfs_server_capable(inode, NFS_CAP_CTIME)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)  			inode->i_version = fattr->change_attr;  		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_SIZE)  			inode->i_size = nfs_size_to_loff_t(fattr->size);  		else -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR -				| NFS_INO_REVAL_PAGECACHE; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR +				| NFS_INO_REVAL_PAGECACHE);  		if (fattr->valid & NFS_ATTR_FATTR_NLINK)  			set_nlink(inode, fattr->nlink);  		else if (nfs_server_capable(inode, NFS_CAP_NLINK)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_OWNER)  			inode->i_uid = fattr->uid;  		else if (nfs_server_capable(inode, NFS_CAP_OWNER)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_GROUP)  			inode->i_gid = fattr->gid;  		else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) -			nfsi->cache_validity |= 
NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)  			inode->i_blocks = fattr->du.nfs2.blocks;  		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -459,14 +469,14 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st  		nfsi->attrtimeo_timestamp = now;  		nfsi->access_cache = RB_ROOT; -		nfs_fscache_init_inode_cookie(inode); +		nfs_fscache_init_inode(inode);  		unlock_new_inode(inode);  	} else  		nfs_refresh_inode(inode, fattr); -	dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", +	dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n",  		inode->i_sb->s_id, -		(long long)NFS_FILEID(inode), +		(unsigned long long)NFS_FILEID(inode),  		nfs_display_fhandle_hash(fh),  		atomic_read(&inode->i_count)); @@ -549,6 +559,9 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)  	spin_lock(&inode->i_lock);  	i_size_write(inode, offset); +	/* Optimisation */ +	if (offset == 0) +		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;  	spin_unlock(&inode->i_lock);  	truncate_pagecache(inode, offset); @@ -577,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)  			inode->i_uid = attr->ia_uid;  		if ((attr->ia_valid & ATTR_GID) != 0)  			inode->i_gid = attr->ia_gid; -		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; +		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS +				| NFS_INO_INVALID_ACL);  		spin_unlock(&inode->i_lock);  	}  	if ((attr->ia_valid & ATTR_SIZE) != 0) { @@ -587,6 +601,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)  }  EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); +static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +{ +	struct dentry *parent; + +	parent = dget_parent(dentry); +	nfs_force_use_readdirplus(parent->d_inode); +	dput(parent); +} + +static bool nfs_need_revalidate_inode(struct inode *inode) +{ +	if 
(NFS_I(inode)->cache_validity & +			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) +		return true; +	if (nfs_attribute_cache_expired(inode)) +		return true; +	return false; +} +  int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)  {  	struct inode *inode = dentry->d_inode; @@ -615,10 +648,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)   	    ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))  		need_atime = 0; -	if (need_atime) -		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); -	else -		err = nfs_revalidate_inode(NFS_SERVER(inode), inode); +	if (need_atime || nfs_need_revalidate_inode(inode)) { +		struct nfs_server *server = NFS_SERVER(inode); + +		if (server->caps & NFS_CAP_READDIRPLUS) +			nfs_request_parent_use_readdirplus(dentry); +		err = __nfs_revalidate_inode(server, inode); +	}  	if (!err) {  		generic_fillattr(inode, stat);  		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); @@ -854,7 +890,7 @@ int nfs_open(struct inode *inode, struct file *filp)  		return PTR_ERR(ctx);  	nfs_file_set_open_context(filp, ctx);  	put_nfs_open_context(ctx); -	nfs_fscache_set_inode_cookie(inode, filp); +	nfs_fscache_open_file(inode, filp);  	return 0;  } @@ -876,8 +912,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)  	struct nfs_fattr *fattr = NULL;  	struct nfs_inode *nfsi = NFS_I(inode); -	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", -		inode->i_sb->s_id, (long long)NFS_FILEID(inode)); +	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", +		inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode));  	trace_nfs_revalidate_inode_enter(inode); @@ -901,9 +937,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)  	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);  	if (status != 0) { -		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", +		dfprintk(PAGECACHE, 
"nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",  			 inode->i_sb->s_id, -			 (long long)NFS_FILEID(inode), status); +			 (unsigned long long)NFS_FILEID(inode), status);  		if (status == -ESTALE) {  			nfs_zap_caches(inode);  			if (!S_ISDIR(inode->i_mode)) @@ -914,18 +950,20 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)  	status = nfs_refresh_inode(inode, fattr);  	if (status) { -		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", +		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",  			 inode->i_sb->s_id, -			 (long long)NFS_FILEID(inode), status); +			 (unsigned long long)NFS_FILEID(inode), status);  		goto err_out;  	}  	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)  		nfs_zap_acl_cache(inode); -	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", +	nfs_setsecurity(inode, fattr, label); + +	dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",  		inode->i_sb->s_id, -		(long long)NFS_FILEID(inode)); +		(unsigned long long)NFS_FILEID(inode));  err_out:  	nfs4_label_free(label); @@ -958,9 +996,7 @@ int nfs_attribute_cache_expired(struct inode *inode)   */  int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)  { -	if (!(NFS_I(inode)->cache_validity & -			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) -			&& !nfs_attribute_cache_expired(inode)) +	if (!nfs_need_revalidate_inode(inode))  		return NFS_STALE(inode) ? 
-ESTALE : 0;  	return __nfs_revalidate_inode(server, inode);  } @@ -981,16 +1017,17 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map  		if (ret < 0)  			return ret;  	} -	spin_lock(&inode->i_lock); -	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; -	if (S_ISDIR(inode->i_mode)) +	if (S_ISDIR(inode->i_mode)) { +		spin_lock(&inode->i_lock);  		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); -	spin_unlock(&inode->i_lock); +		spin_unlock(&inode->i_lock); +	}  	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);  	nfs_fscache_wait_on_invalidate(inode); -	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", -			inode->i_sb->s_id, (long long)NFS_FILEID(inode)); +	dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", +			inode->i_sb->s_id, +			(unsigned long long)NFS_FILEID(inode));  	return 0;  } @@ -1011,6 +1048,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)  int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)  {  	struct nfs_inode *nfsi = NFS_I(inode); +	unsigned long *bitlock = &nfsi->flags;  	int ret = 0;  	/* swapfiles are not supposed to be shared. */ @@ -1022,12 +1060,46 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)  		if (ret < 0)  			goto out;  	} -	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { -		trace_nfs_invalidate_mapping_enter(inode); -		ret = nfs_invalidate_mapping(inode, mapping); -		trace_nfs_invalidate_mapping_exit(inode, ret); + +	/* +	 * We must clear NFS_INO_INVALID_DATA first to ensure that +	 * invalidations that come in while we're shooting down the mappings +	 * are respected. But, that leaves a race window where one revalidator +	 * can clear the flag, and then another checks it before the mapping +	 * gets invalidated. Fix that by serializing access to this part of +	 * the function. 
+	 * +	 * At the same time, we need to allow other tasks to see whether we +	 * might be in the middle of invalidating the pages, so we only set +	 * the bit lock here if it looks like we're going to be doing that. +	 */ +	for (;;) { +		ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, +				  nfs_wait_bit_killable, TASK_KILLABLE); +		if (ret) +			goto out; +		spin_lock(&inode->i_lock); +		if (test_bit(NFS_INO_INVALIDATING, bitlock)) { +			spin_unlock(&inode->i_lock); +			continue; +		} +		if (nfsi->cache_validity & NFS_INO_INVALID_DATA) +			break; +		spin_unlock(&inode->i_lock); +		goto out;  	} +	set_bit(NFS_INO_INVALIDATING, bitlock); +	smp_wmb(); +	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; +	spin_unlock(&inode->i_lock); +	trace_nfs_invalidate_mapping_enter(inode); +	ret = nfs_invalidate_mapping(inode, mapping); +	trace_nfs_invalidate_mapping_exit(inode, ret); + +	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); +	smp_mb__after_atomic(); +	wake_up_bit(bitlock, NFS_INO_INVALIDATING);  out:  	return ret;  } @@ -1042,7 +1114,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr  			&& inode->i_version == fattr->pre_change_attr) {  		inode->i_version = fattr->change_attr;  		if (S_ISDIR(inode->i_mode)) -			nfsi->cache_validity |= NFS_INO_INVALID_DATA; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);  		ret |= NFS_INO_INVALID_ATTR;  	}  	/* If we have atomic WCC data, we may update some attributes */ @@ -1058,7 +1130,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr  			&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {  		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));  		if (S_ISDIR(inode->i_mode)) -			nfsi->cache_validity |= NFS_INO_INVALID_DATA; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);  		ret |= NFS_INO_INVALID_ATTR;  	}  	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) @@ -1069,9 +1141,6 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, 
struct nfs_fattr  		ret |= NFS_INO_INVALID_ATTR;  	} -	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) -		nfs_fscache_invalidate(inode); -  	return ret;  } @@ -1130,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat  		invalid |= NFS_INO_INVALID_ATIME;  	if (invalid != 0) -		nfsi->cache_validity |= invalid; +		nfs_set_cache_invalid(inode, invalid);  	nfsi->read_cache_jiffies = fattr->time_start;  	return 0; @@ -1209,6 +1278,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)  	 * not on the result */  	return nfs_fhandle_hash(fh);  } +EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash);  /*   * _nfs_display_fhandle - display an NFS file handle on the console @@ -1253,6 +1323,7 @@ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)  		}  	}  } +EXPORT_SYMBOL_GPL(_nfs_display_fhandle);  #endif  /** @@ -1284,12 +1355,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n  		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);  } +/* + * Don't trust the change_attribute, mtime, ctime or size if + * a pnfs LAYOUTCOMMIT is outstanding + */ +static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, +		struct nfs_fattr *fattr) +{ +	if (pnfs_layoutcommit_outstanding(inode)) +		fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | +				NFS_ATTR_FATTR_MTIME | +				NFS_ATTR_FATTR_CTIME | +				NFS_ATTR_FATTR_SIZE); +} +  static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)  {  	int ret;  	trace_nfs_refresh_inode_enter(inode); +	nfs_inode_attrs_handle_layoutcommit(inode, fattr); +  	if (nfs_inode_attrs_need_update(inode, fattr))  		ret = nfs_update_inode(inode, fattr);  	else @@ -1325,13 +1412,11 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);  static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)  { -	struct nfs_inode *nfsi = NFS_I(inode); +	unsigned long invalid = 
NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; -	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; -	if (S_ISDIR(inode->i_mode)) { -		nfsi->cache_validity |= NFS_INO_INVALID_DATA; -		nfs_fscache_invalidate(inode); -	} +	if (S_ISDIR(inode->i_mode)) +		invalid |= NFS_INO_INVALID_DATA; +	nfs_set_cache_invalid(inode, invalid);  	if ((fattr->valid & NFS_ATTR_FATTR) == 0)  		return 0;  	return nfs_refresh_inode_locked(inode, fattr); @@ -1436,7 +1521,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	unsigned long now = jiffies;  	unsigned long save_cache_validity; -	dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n", +	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",  			__func__, inode->i_sb->s_id, inode->i_ino,  			nfs_display_fhandle_hash(NFS_FH(inode)),  			atomic_read(&inode->i_count), fattr->valid); @@ -1457,7 +1542,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  		/*  		* Big trouble! The inode has become a different object.  		
*/ -		printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n", +		printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n",  				__func__, inode->i_ino, inode->i_mode, fattr->mode);  		goto out_err;  	} @@ -1498,18 +1583,20 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			inode->i_version = fattr->change_attr;  		}  	} else if (server->caps & NFS_CAP_CHANGE_ATTR) -		invalid |= save_cache_validity; +		nfsi->cache_validity |= save_cache_validity;  	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {  		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));  	} else if (server->caps & NFS_CAP_MTIME) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {  		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));  	} else if (server->caps & NFS_CAP_CTIME) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_REVAL_FORCED);  	/* Check if our cached file size is stale */ @@ -1519,10 +1606,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  		if (new_isize != cur_isize) {  			/* Do we perhaps have any outstanding writes, or has  			 * the file grown beyond our last write? 
*/ -			if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || -			     new_isize > cur_isize) { +			if ((nfsi->npages == 0) || new_isize > cur_isize) {  				i_size_write(inode, new_isize);  				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; +				invalid &= ~NFS_INO_REVAL_PAGECACHE;  			}  			dprintk("NFS: isize change on server for file %s/%ld "  					"(%Ld to %Ld)\n", @@ -1532,7 +1619,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  					(long long)new_isize);  		}  	} else -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_REVAL_PAGECACHE  				| NFS_INO_REVAL_FORCED); @@ -1540,7 +1628,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	if (fattr->valid & NFS_ATTR_FATTR_ATIME)  		memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));  	else if (server->caps & NFS_CAP_ATIME) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATIME  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_MODE) { @@ -1551,7 +1640,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;  		}  	} else if (server->caps & NFS_CAP_MODE) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_INVALID_ACCESS  				| NFS_INO_INVALID_ACL  				| NFS_INO_REVAL_FORCED); @@ -1562,7 +1652,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			inode->i_uid = fattr->uid;  		}  	} else if (server->caps & NFS_CAP_OWNER) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_INVALID_ACCESS  				| 
NFS_INO_INVALID_ACL  				| NFS_INO_REVAL_FORCED); @@ -1573,7 +1664,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			inode->i_gid = fattr->gid;  		}  	} else if (server->caps & NFS_CAP_OWNER_GROUP) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_INVALID_ACCESS  				| NFS_INO_INVALID_ACL  				| NFS_INO_REVAL_FORCED); @@ -1586,7 +1678,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			set_nlink(inode, fattr->nlink);  		}  	} else if (server->caps & NFS_CAP_NLINK) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -1599,7 +1692,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  		inode->i_blocks = fattr->du.nfs2.blocks;  	/* Update attrtimeo value if we're out of the unstable period */ -	if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) { +	if (invalid & NFS_INO_INVALID_ATTR) {  		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);  		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);  		nfsi->attrtimeo_timestamp = now; @@ -1612,17 +1705,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  		}  	}  	invalid &= ~NFS_INO_INVALID_ATTR; -	invalid &= ~NFS_INO_INVALID_LABEL;  	/* Don't invalidate the data if we were to blame */  	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)  				|| S_ISLNK(inode->i_mode)))  		invalid &= ~NFS_INO_INVALID_DATA;  	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) ||  			(save_cache_validity & NFS_INO_REVAL_FORCED)) -		nfsi->cache_validity |= invalid; - -	if (invalid & NFS_INO_INVALID_DATA) -		nfs_fscache_invalidate(inode); +		nfs_set_cache_invalid(inode, invalid);  	return 0;   out_err: @@ -1643,10 +1732,6 @@ struct inode *nfs_alloc_inode(struct 
super_block *sb)  		return NULL;  	nfsi->flags = 0UL;  	nfsi->cache_validity = 0UL; -#ifdef CONFIG_NFS_V3_ACL -	nfsi->acl_access = ERR_PTR(-EAGAIN); -	nfsi->acl_default = ERR_PTR(-EAGAIN); -#endif  #if IS_ENABLED(CONFIG_NFS_V4)  	nfsi->nfs4_acl = NULL;  #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 38da8c2b81a..f415cbf9f6c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -88,8 +88,8 @@ struct nfs_parsed_mount_data {  	unsigned int		namlen;  	unsigned int		options;  	unsigned int		bsize; -	unsigned int		auth_flavor_len; -	rpc_authflavor_t	auth_flavors[1]; +	struct nfs_auth_info	auth_info; +	rpc_authflavor_t	selected_flavor;  	char			*client_address;  	unsigned int		version;  	unsigned int		minorversion; @@ -154,6 +154,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,  				  rpc_authflavor_t);  int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);  void nfs_server_insert_lists(struct nfs_server *); +void nfs_server_remove_lists(struct nfs_server *);  void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);  int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,  		rpc_authflavor_t); @@ -174,6 +175,9 @@ extern struct nfs_server *nfs4_create_server(  					struct nfs_subversion *);  extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,  						      struct nfs_fh *); +extern int nfs4_update_server(struct nfs_server *server, const char *hostname, +					struct sockaddr *sap, size_t salen, +					struct net *net);  extern void nfs_free_server(struct nfs_server *server);  extern struct nfs_server *nfs_clone_server(struct nfs_server *,  					   struct nfs_fh *, @@ -227,13 +231,21 @@ extern void nfs_destroy_writepagecache(void);  extern int __init nfs_init_directcache(void);  extern void nfs_destroy_directcache(void); -extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount);  
extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,  			      struct nfs_pgio_header *hdr,  			      void (*release)(struct nfs_pgio_header *hdr));  void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);  int nfs_iocounter_wait(struct nfs_io_counter *c); +extern const struct nfs_pageio_ops nfs_pgio_rw_ops; +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); +void nfs_rw_header_free(struct nfs_pgio_header *); +void nfs_pgio_data_release(struct nfs_pgio_data *); +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, +		      const struct rpc_call_ops *, int, int); +void nfs_free_request(struct nfs_page *req); +  static inline void nfs_iocounter_init(struct nfs_io_counter *c)  {  	c->flags = 0; @@ -266,6 +278,30 @@ extern const u32 nfs41_maxgetdevinfo_overhead;  extern struct rpc_procinfo nfs4_procedures[];  #endif +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline void nfs4_label_free(struct nfs4_label *label) +{ +	if (label) { +		kfree(label->label); +		kfree(label); +	} +	return; +} + +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ +	if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) +		nfsi->cache_validity |= NFS_INO_INVALID_LABEL; +} +#else +static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } +static inline void nfs4_label_free(void *label) {} +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ +} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ +  /* proc.c */  void nfs_close_context(struct nfs_open_context *ctx, int is_sync);  extern struct nfs_client *nfs_init_client(struct nfs_client *clp, @@ -273,6 +309,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,  			   const char *ip_addr);  /* dir.c */ 
+extern void nfs_force_use_readdirplus(struct inode *dir);  extern unsigned long nfs_access_cache_count(struct shrinker *shrink,  					    struct shrink_control *sc);  extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, @@ -291,16 +328,14 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)  int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);  loff_t nfs_file_llseek(struct file *, loff_t, int);  int nfs_file_flush(struct file *, fl_owner_t); -ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); +ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);  ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,  			     size_t, unsigned int);  int nfs_file_mmap(struct file *, struct vm_area_struct *); -ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); +ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);  int nfs_file_release(struct inode *, struct file *);  int nfs_lock(struct file *, int, struct file_lock *);  int nfs_flock(struct file *, int, struct file_lock *); -ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, -			      size_t, unsigned int);  int nfs_check_flags(int);  int nfs_setlease(struct file *, long, struct file_lock **); @@ -323,6 +358,7 @@ extern struct file_system_type nfs_xdev_fs_type;  extern struct file_system_type nfs4_xdev_fs_type;  extern struct file_system_type nfs4_referral_fs_type;  #endif +bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);  struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *,  			struct nfs_subversion *);  void nfs_initialise_sb(struct super_block *); @@ -365,19 +401,11 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool  struct nfs_pgio_completion_ops;  /* read.c */ -extern struct nfs_read_header *nfs_readhdr_alloc(void); -extern void nfs_readhdr_free(struct 
nfs_pgio_header *hdr);  extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, -			struct inode *inode, +			struct inode *inode, bool force_mds,  			const struct nfs_pgio_completion_ops *compl_ops); -extern int nfs_initiate_read(struct rpc_clnt *clnt, -			     struct nfs_read_data *data, -			     const struct rpc_call_ops *call_ops, int flags);  extern void nfs_read_prepare(struct rpc_task *task, void *calldata); -extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, -			      struct nfs_pgio_header *hdr);  extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_readdata_release(struct nfs_read_data *rdata);  /* super.c */  void nfs_clone_super(struct super_block *, struct nfs_mount_info *); @@ -392,19 +420,10 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data);  /* write.c */  extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, -			struct inode *inode, int ioflags, +			struct inode *inode, int ioflags, bool force_mds,  			const struct nfs_pgio_completion_ops *compl_ops); -extern struct nfs_write_header *nfs_writehdr_alloc(void); -extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); -extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, -			     struct nfs_pgio_header *hdr);  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_writedata_release(struct nfs_write_data *wdata);  extern void nfs_commit_free(struct nfs_commit_data *p); -extern int nfs_initiate_write(struct rpc_clnt *clnt, -			      struct nfs_write_data *data, -			      const struct rpc_call_ops *call_ops, -			      int how, int flags);  extern void nfs_write_prepare(struct rpc_task *task, void *calldata);  extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);  extern int nfs_initiate_commit(struct rpc_clnt *clnt, @@ -417,6 +436,7 @@ extern void nfs_init_commit(struct nfs_commit_data *data,  			    struct nfs_commit_info *cinfo);  
int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,  			 struct nfs_commit_info *cinfo, int max); +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);  int nfs_scan_commit(struct inode *inode, struct list_head *dst,  		    struct nfs_commit_info *cinfo);  void nfs_mark_request_commit(struct nfs_page *req, @@ -445,6 +465,13 @@ extern int nfs_migrate_page(struct address_space *,  #define nfs_migrate_page NULL  #endif +/* unlink.c */ +extern struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, +		 struct dentry *old_dentry, struct dentry *new_dentry, +		 void (*complete)(struct rpc_task *, struct nfs_renamedata *)); +extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); +  /* direct.c */  void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,  			      struct nfs_direct_req *dreq); @@ -455,7 +482,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)  extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);  /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_read_data *); +extern void __nfs4_read_done_cb(struct nfs_pgio_data *);  extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,  			    const struct rpc_timeout *timeparms,  			    const char *ip_addr); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 348b535cd78..b5a0afc3ee1 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -253,9 +253,8 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh,  	dprintk("--> nfs_do_submount()\n"); -	dprintk("%s: submounting on %s/%s\n", __func__, -			dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dprintk("%s: submounting on %pd2\n", __func__, +			dentry);  	if (page == NULL)  		goto out;  	devname = nfs_devname(dentry, page, PAGE_SIZE); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 62db136339e..5f61b83f4a1 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -103,7 +103,7 @@ static void 
print_overflow_msg(const char *func, const struct xdr_stream *xdr)  /*   *	typedef opaque	nfsdata<>;   */ -static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)  {  	u32 recvd, count;  	__be32 *p; @@ -613,7 +613,7 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,   *	};   */  static void encode_readargs(struct xdr_stream *xdr, -			    const struct nfs_readargs *args) +			    const struct nfs_pgio_args *args)  {  	u32 offset = args->offset;  	u32 count = args->count; @@ -629,7 +629,7 @@ static void encode_readargs(struct xdr_stream *xdr,  static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,  				  struct xdr_stream *xdr, -				  const struct nfs_readargs *args) +				  const struct nfs_pgio_args *args)  {  	encode_readargs(xdr, args);  	prepare_reply_buffer(req, args->pages, args->pgbase, @@ -649,7 +649,7 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,   *	};   */  static void encode_writeargs(struct xdr_stream *xdr, -			     const struct nfs_writeargs *args) +			     const struct nfs_pgio_args *args)  {  	u32 offset = args->offset;  	u32 count = args->count; @@ -669,7 +669,7 @@ static void encode_writeargs(struct xdr_stream *xdr,  static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,  				   struct xdr_stream *xdr, -				   const struct nfs_writeargs *args) +				   const struct nfs_pgio_args *args)  {  	encode_writeargs(xdr, args);  	xdr->buf->flags |= XDRBUF_WRITE; @@ -857,7 +857,7 @@ out_default:   *	};   */  static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, -				struct nfs_readres *result) +				struct nfs_pgio_res *result)  {  	enum nfs_stat status;  	int error; @@ -878,7 +878,7 @@ out_default:  }  static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, -				 struct nfs_writeres *result) +				 struct nfs_pgio_res *result)  {  	/* All NFSv2 writes are "file sync" writes */  	
result->verf->committed = NFS_FILE_SYNC; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 4a1aafba6a2..8f854dde415 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -10,179 +10,7 @@  #define NFSDBG_FACILITY	NFSDBG_PROC -ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ -	struct inode *inode = dentry->d_inode; -	struct posix_acl *acl; -	int pos=0, len=0; - -#	define output(s) do {						\ -			if (pos + sizeof(s) <= size) {			\ -				memcpy(buffer + pos, s, sizeof(s));	\ -				pos += sizeof(s);			\ -			}						\ -			len += sizeof(s);				\ -		} while(0) - -	acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); -	if (IS_ERR(acl)) -		return PTR_ERR(acl); -	if (acl) { -		output("system.posix_acl_access"); -		posix_acl_release(acl); -	} - -	if (S_ISDIR(inode->i_mode)) { -		acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); -		if (IS_ERR(acl)) -			return PTR_ERR(acl); -		if (acl) { -			output("system.posix_acl_default"); -			posix_acl_release(acl); -		} -	} - -#	undef output - -	if (!buffer || len <= size) -		return len; -	return -ERANGE; -} - -ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, -		void *buffer, size_t size) -{ -	struct inode *inode = dentry->d_inode; -	struct posix_acl *acl; -	int type, error = 0; - -	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) -		type = ACL_TYPE_ACCESS; -	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) -		type = ACL_TYPE_DEFAULT; -	else -		return -EOPNOTSUPP; - -	acl = nfs3_proc_getacl(inode, type); -	if (IS_ERR(acl)) -		return PTR_ERR(acl); -	else if (acl) { -		if (type == ACL_TYPE_ACCESS && acl->a_count == 0) -			error = -ENODATA; -		else -			error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); -		posix_acl_release(acl); -	} else -		error = -ENODATA; - -	return error; -} - -int nfs3_setxattr(struct dentry *dentry, const char *name, -	     const void *value, size_t size, int flags) -{ -	struct inode *inode = dentry->d_inode; -	struct posix_acl *acl; -	int type, error; - -	if 
(strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) -		type = ACL_TYPE_ACCESS; -	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) -		type = ACL_TYPE_DEFAULT; -	else -		return -EOPNOTSUPP; - -	acl = posix_acl_from_xattr(&init_user_ns, value, size); -	if (IS_ERR(acl)) -		return PTR_ERR(acl); -	error = nfs3_proc_setacl(inode, type, acl); -	posix_acl_release(acl); - -	return error; -} - -int nfs3_removexattr(struct dentry *dentry, const char *name) -{ -	struct inode *inode = dentry->d_inode; -	int type; - -	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) -		type = ACL_TYPE_ACCESS; -	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) -		type = ACL_TYPE_DEFAULT; -	else -		return -EOPNOTSUPP; - -	return nfs3_proc_setacl(inode, type, NULL); -} - -static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) -{ -	if (!IS_ERR(nfsi->acl_access)) { -		posix_acl_release(nfsi->acl_access); -		nfsi->acl_access = ERR_PTR(-EAGAIN); -	} -	if (!IS_ERR(nfsi->acl_default)) { -		posix_acl_release(nfsi->acl_default); -		nfsi->acl_default = ERR_PTR(-EAGAIN); -	} -} - -void nfs3_forget_cached_acls(struct inode *inode) -{ -	dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, -		inode->i_ino); -	spin_lock(&inode->i_lock); -	__nfs3_forget_cached_acls(NFS_I(inode)); -	spin_unlock(&inode->i_lock); -} - -static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) -{ -	struct nfs_inode *nfsi = NFS_I(inode); -	struct posix_acl *acl = ERR_PTR(-EINVAL); - -	spin_lock(&inode->i_lock); -	switch(type) { -		case ACL_TYPE_ACCESS: -			acl = nfsi->acl_access; -			break; - -		case ACL_TYPE_DEFAULT: -			acl = nfsi->acl_default; -			break; - -		default: -			goto out; -	} -	if (IS_ERR(acl)) -		acl = ERR_PTR(-EAGAIN); -	else -		acl = posix_acl_dup(acl); -out: -	spin_unlock(&inode->i_lock); -	dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, -		inode->i_ino, type, acl); -	return acl; -} - -static void nfs3_cache_acls(struct inode *inode, struct posix_acl 
*acl, -		    struct posix_acl *dfacl) -{ -	struct nfs_inode *nfsi = NFS_I(inode); - -	dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, -		inode->i_ino, acl, dfacl); -	spin_lock(&inode->i_lock); -	__nfs3_forget_cached_acls(NFS_I(inode)); -	if (!IS_ERR(acl)) -		nfsi->acl_access = posix_acl_dup(acl); -	if (!IS_ERR(dfacl)) -		nfsi->acl_default = posix_acl_dup(dfacl); -	spin_unlock(&inode->i_lock); -} - -struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type)  {  	struct nfs_server *server = NFS_SERVER(inode);  	struct page *pages[NFSACL_MAXPAGES] = { }; @@ -198,7 +26,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)  		.rpc_argp	= &args,  		.rpc_resp	= &res,  	}; -	struct posix_acl *acl;  	int status, count;  	if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)  	status = nfs_revalidate_inode(server, inode);  	if (status < 0)  		return ERR_PTR(status); -	acl = nfs3_get_cached_acl(inode, type); -	if (acl != ERR_PTR(-EAGAIN)) -		return acl; -	acl = NULL;  	/*  	 * Only get the access acl when explicitly requested: We don't @@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)  	}  	if (res.acl_access != NULL) { -		if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { +		if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) || +		    res.acl_access->a_count == 0) {  			posix_acl_release(res.acl_access);  			res.acl_access = NULL;  		}  	} -	nfs3_cache_acls(inode, -		(res.mask & NFS_ACL)   ? res.acl_access  : ERR_PTR(-EINVAL), -		(res.mask & NFS_DFACL) ? 
res.acl_default : ERR_PTR(-EINVAL)); -	switch(type) { -		case ACL_TYPE_ACCESS: -			acl = res.acl_access; -			res.acl_access = NULL; -			break; +	if (res.mask & NFS_ACL) +		set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); +	else +		forget_cached_acl(inode, ACL_TYPE_ACCESS); -		case ACL_TYPE_DEFAULT: -			acl = res.acl_default; -			res.acl_default = NULL; +	if (res.mask & NFS_DFACL) +		set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); +	else +		forget_cached_acl(inode, ACL_TYPE_DEFAULT); + +	nfs_free_fattr(res.fattr); +	if (type == ACL_TYPE_ACCESS) { +		posix_acl_release(res.acl_default); +		return res.acl_access; +	} else { +		posix_acl_release(res.acl_access); +		return res.acl_default;  	}  getout:  	posix_acl_release(res.acl_access);  	posix_acl_release(res.acl_default);  	nfs_free_fattr(res.fattr); - -	if (status != 0) { -		posix_acl_release(acl); -		acl = ERR_PTR(status); -	} -	return acl; +	return ERR_PTR(status);  } -static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, -		  struct posix_acl *dfacl) +static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, +		struct posix_acl *dfacl)  {  	struct nfs_server *server = NFS_SERVER(inode);  	struct nfs_fattr *fattr; @@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,  	switch (status) {  		case 0:  			status = nfs_refresh_inode(inode, fattr); -			nfs3_cache_acls(inode, acl, dfacl); +			set_cached_acl(inode, ACL_TYPE_ACCESS, acl); +			set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);  			break;  		case -EPFNOSUPPORT:  		case -EPROTONOSUPPORT: @@ -373,40 +198,43 @@ out:  	return status;  } -int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, +		struct posix_acl *dfacl) +{ +	int ret; +	ret = __nfs3_proc_setacls(inode, acl, dfacl); +	return (ret == -EOPNOTSUPP) ? 
0 : ret; + +} + +int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)  {  	struct posix_acl *alloc = NULL, *dfacl = NULL;  	int status;  	if (S_ISDIR(inode->i_mode)) {  		switch(type) { -			case ACL_TYPE_ACCESS: -				alloc = dfacl = nfs3_proc_getacl(inode, -						ACL_TYPE_DEFAULT); -				if (IS_ERR(alloc)) -					goto fail; -				break; - -			case ACL_TYPE_DEFAULT: -				dfacl = acl; -				alloc = acl = nfs3_proc_getacl(inode, -						ACL_TYPE_ACCESS); -				if (IS_ERR(alloc)) -					goto fail; -				break; - -			default: -				return -EINVAL; +		case ACL_TYPE_ACCESS: +			alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); +			if (IS_ERR(alloc)) +				goto fail; +			break; + +		case ACL_TYPE_DEFAULT: +			dfacl = acl; +			alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); +			if (IS_ERR(alloc)) +				goto fail; +			break;  		} -	} else if (type != ACL_TYPE_ACCESS) -			return -EINVAL; +	}  	if (acl == NULL) {  		alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);  		if (IS_ERR(alloc))  			goto fail;  	} -	status = nfs3_proc_setacls(inode, acl, dfacl); +	status = __nfs3_proc_setacls(inode, acl, dfacl);  	posix_acl_release(alloc);  	return status; @@ -414,27 +242,51 @@ fail:  	return PTR_ERR(alloc);  } -int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, -		umode_t mode) +const struct xattr_handler *nfs3_xattr_handlers[] = { +	&posix_acl_access_xattr_handler, +	&posix_acl_default_xattr_handler, +	NULL, +}; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, +		size_t size, ssize_t *result)  { -	struct posix_acl *dfacl, *acl; -	int error = 0; +	struct posix_acl *acl; +	char *p = data + *result; -	dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); -	if (IS_ERR(dfacl)) { -		error = PTR_ERR(dfacl); -		return (error == -EOPNOTSUPP) ? 
0 : error; -	} -	if (!dfacl) +	acl = get_acl(inode, type); +	if (!acl)  		return 0; -	acl = posix_acl_dup(dfacl); -	error = posix_acl_create(&acl, GFP_KERNEL, &mode); -	if (error < 0) -		goto out_release_dfacl; -	error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? -						      dfacl : NULL); +  	posix_acl_release(acl); -out_release_dfacl: -	posix_acl_release(dfacl); -	return error; + +	*result += strlen(name); +	*result += 1; +	if (!size) +		return 0; +	if (*result > size) +		return -ERANGE; + +	strcpy(p, name); +	return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ +	struct inode *inode = dentry->d_inode; +	ssize_t result = 0; +	int error; + +	error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, +			POSIX_ACL_XATTR_ACCESS, data, size, &result); +	if (error) +		return error; + +	error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, +			POSIX_ACL_XATTR_DEFAULT, data, size, &result); +	if (error) +		return error; +	return result;  } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 90cb10d7b69..f0afa291fd5 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -18,6 +18,7 @@  #include <linux/lockd/bind.h>  #include <linux/nfs_mount.h>  #include <linux/freezer.h> +#include <linux/xattr.h>  #include "iostat.h"  #include "internal.h" @@ -317,11 +318,11 @@ static int  nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		 int flags)  { +	struct posix_acl *default_acl, *acl;  	struct nfs3_createdata *data; -	umode_t mode = sattr->ia_mode;  	int status = -ENOMEM; -	dprintk("NFS call  create %s\n", dentry->d_name.name); +	dprintk("NFS call  create %pd\n", dentry);  	data = nfs3_alloc_createdata();  	if (data == NULL) @@ -340,7 +341,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		data->arg.create.verifier[1] = cpu_to_be32(current->pid);  	} -	sattr->ia_mode &= ~current_umask(); +	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); +	if 
(status) +		goto out;  	for (;;) {  		status = nfs3_do_create(dir, dentry, data); @@ -366,7 +369,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	}  	if (status != 0) -		goto out; +		goto out_release_acls;  	/* When we created the file with exclusive semantics, make  	 * sure we set the attributes afterwards. */ @@ -385,9 +388,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);  		dprintk("NFS reply setattr (post-create): %d\n", status);  		if (status != 0) -			goto out; +			goto out_release_acls;  	} -	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + +	status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: +	posix_acl_release(acl); +	posix_acl_release(default_acl);  out:  	nfs3_free_createdata(data);  	dprintk("NFS reply create: %d\n", status); @@ -471,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  }  static int -nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, -		 struct inode *new_dir, struct qstr *new_name) -{ -	struct nfs_renameargs	arg = { -		.old_dir	= NFS_FH(old_dir), -		.old_name	= old_name, -		.new_dir	= NFS_FH(new_dir), -		.new_name	= new_name, -	}; -	struct nfs_renameres res; -	struct rpc_message msg = { -		.rpc_proc	= &nfs3_procedures[NFS3PROC_RENAME], -		.rpc_argp	= &arg, -		.rpc_resp	= &res, -	}; -	int status = -ENOMEM; - -	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name); - -	res.old_fattr = nfs_alloc_fattr(); -	res.new_fattr = nfs_alloc_fattr(); -	if (res.old_fattr == NULL || res.new_fattr == NULL) -		goto out; - -	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); -	nfs_post_op_update_inode(old_dir, res.old_fattr); -	nfs_post_op_update_inode(new_dir, res.new_fattr); -out: -	nfs_free_fattr(res.old_fattr); -	nfs_free_fattr(res.new_fattr); -	dprintk("NFS reply rename: %d\n", status); -	return status; 
-} - -static int  nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)  {  	struct nfs3_linkargs	arg = { @@ -548,7 +521,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,  	if (len > NFS3_MAXPATHLEN)  		return -ENAMETOOLONG; -	dprintk("NFS call  symlink %s\n", dentry->d_name.name); +	dprintk("NFS call  symlink %pd\n", dentry);  	data = nfs3_alloc_createdata();  	if (data == NULL) @@ -572,18 +545,20 @@ out:  static int  nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)  { +	struct posix_acl *default_acl, *acl;  	struct nfs3_createdata *data; -	umode_t mode = sattr->ia_mode;  	int status = -ENOMEM; -	dprintk("NFS call  mkdir %s\n", dentry->d_name.name); - -	sattr->ia_mode &= ~current_umask(); +	dprintk("NFS call  mkdir %pd\n", dentry);  	data = nfs3_alloc_createdata();  	if (data == NULL)  		goto out; +	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); +	if (status) +		goto out; +  	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];  	data->arg.mkdir.fh = NFS_FH(dir);  	data->arg.mkdir.name = dentry->d_name.name; @@ -592,9 +567,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)  	status = nfs3_do_create(dir, dentry, data);  	if (status != 0) -		goto out; +		goto out_release_acls; -	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +	status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: +	posix_acl_release(acl); +	posix_acl_release(default_acl);  out:  	nfs3_free_createdata(data);  	dprintk("NFS reply mkdir: %d\n", status); @@ -691,19 +670,21 @@ static int  nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		dev_t rdev)  { +	struct posix_acl *default_acl, *acl;  	struct nfs3_createdata *data; -	umode_t mode = sattr->ia_mode;  	int status = -ENOMEM; -	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name, +	dprintk("NFS call  mknod %pd %u:%u\n", dentry,  	
		MAJOR(rdev), MINOR(rdev)); -	sattr->ia_mode &= ~current_umask(); -  	data = nfs3_alloc_createdata();  	if (data == NULL)  		goto out; +	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); +	if (status) +		goto out; +  	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];  	data->arg.mknod.fh = NFS_FH(dir);  	data->arg.mknod.name = dentry->d_name.name; @@ -731,8 +712,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	status = nfs3_do_create(dir, dentry, data);  	if (status != 0) -		goto out; -	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +		goto out_release_acls; + +	status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: +	posix_acl_release(acl); +	posix_acl_release(default_acl);  out:  	nfs3_free_createdata(data);  	dprintk("NFS reply mknod: %d\n", status); @@ -809,7 +795,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,  	return status;  } -static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)  {  	struct inode *inode = data->header->inode; @@ -821,18 +807,18 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)  	return 0;  } -static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];  } -static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)  {  	rpc_call_start(task);  	return 0;  } -static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data)  {  	struct inode *inode = data->header->inode; @@ -843,17 +829,11 @@ static int 
nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)  	return 0;  } -static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];  } -static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ -	rpc_call_start(task); -	return 0; -} -  static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)  {  	rpc_call_start(task); @@ -904,20 +884,28 @@ static const struct inode_operations nfs3_dir_inode_operations = {  	.permission	= nfs_permission,  	.getattr	= nfs_getattr,  	.setattr	= nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL  	.listxattr	= nfs3_listxattr, -	.getxattr	= nfs3_getxattr, -	.setxattr	= nfs3_setxattr, -	.removexattr	= nfs3_removexattr, +	.getxattr	= generic_getxattr, +	.setxattr	= generic_setxattr, +	.removexattr	= generic_removexattr, +	.get_acl	= nfs3_get_acl, +	.set_acl	= nfs3_set_acl, +#endif  };  static const struct inode_operations nfs3_file_inode_operations = {  	.permission	= nfs_permission,  	.getattr	= nfs_getattr,  	.setattr	= nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL  	.listxattr	= nfs3_listxattr, -	.getxattr	= nfs3_getxattr, -	.setxattr	= nfs3_setxattr, -	.removexattr	= nfs3_removexattr, +	.getxattr	= generic_getxattr, +	.setxattr	= generic_setxattr, +	.removexattr	= generic_removexattr, +	.get_acl	= nfs3_get_acl, +	.set_acl	= nfs3_set_acl, +#endif  };  const struct nfs_rpc_ops nfs_v3_clientops = { @@ -939,7 +927,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {  	.unlink_setup	= nfs3_proc_unlink_setup,  	.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,  	.unlink_done	= nfs3_proc_unlink_done, -	.rename		= nfs3_proc_rename,  	.rename_setup	= nfs3_proc_rename_setup,  	.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,  	.rename_done	= nfs3_proc_rename_done, @@ -953,19 +940,16 @@ const struct 
nfs_rpc_ops nfs_v3_clientops = {  	.fsinfo		= nfs3_proc_fsinfo,  	.pathconf	= nfs3_proc_pathconf,  	.decode_dirent	= nfs3_decode_dirent, +	.pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,  	.read_setup	= nfs3_proc_read_setup, -	.read_pageio_init = nfs_pageio_init_read, -	.read_rpc_prepare = nfs3_proc_read_rpc_prepare,  	.read_done	= nfs3_read_done,  	.write_setup	= nfs3_proc_write_setup, -	.write_pageio_init = nfs_pageio_init_write, -	.write_rpc_prepare = nfs3_proc_write_rpc_prepare,  	.write_done	= nfs3_write_done,  	.commit_setup	= nfs3_proc_commit_setup,  	.commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,  	.commit_done	= nfs3_commit_done,  	.lock		= nfs3_proc_lock, -	.clear_acl_cache = nfs3_forget_cached_acls, +	.clear_acl_cache = forget_all_cached_acls,  	.close_context	= nfs_close_context,  	.have_delegation = nfs3_have_delegation,  	.return_delegation = nfs3_return_delegation, diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index cc471c72523..d6a98949af1 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -12,6 +12,9 @@ static struct nfs_subversion nfs_v3 = {  	.rpc_vers = &nfs_version3,  	.rpc_ops  = &nfs_v3_clientops,  	.sops     = &nfs_sops, +#ifdef CONFIG_NFS_V3_ACL +	.xattr    = nfs3_xattr_handlers, +#endif  };  static int __init init_nfs_v3(void) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index fa6d72131c1..8f4cbe7f4aa 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -953,7 +953,7 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,   *	};   */  static void encode_read3args(struct xdr_stream *xdr, -			     const struct nfs_readargs *args) +			     const struct nfs_pgio_args *args)  {  	__be32 *p; @@ -966,7 +966,7 @@ static void encode_read3args(struct xdr_stream *xdr,  static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,  				   struct xdr_stream *xdr, -				   const struct nfs_readargs *args) +				   const struct nfs_pgio_args *args)  {  	encode_read3args(xdr, args);  	prepare_reply_buffer(req, args->pages, 
args->pgbase, @@ -992,7 +992,7 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,   *	};   */  static void encode_write3args(struct xdr_stream *xdr, -			      const struct nfs_writeargs *args) +			      const struct nfs_pgio_args *args)  {  	__be32 *p; @@ -1008,7 +1008,7 @@ static void encode_write3args(struct xdr_stream *xdr,  static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,  				    struct xdr_stream *xdr, -				    const struct nfs_writeargs *args) +				    const struct nfs_pgio_args *args)  {  	encode_write3args(xdr, args);  	xdr->buf->flags |= XDRBUF_WRITE; @@ -1589,7 +1589,7 @@ out_default:   *	};   */  static int decode_read3resok(struct xdr_stream *xdr, -			     struct nfs_readres *result) +			     struct nfs_pgio_res *result)  {  	u32 eof, count, ocount, recvd;  	__be32 *p; @@ -1625,7 +1625,7 @@ out_overflow:  }  static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, -				 struct nfs_readres *result) +				 struct nfs_pgio_res *result)  {  	enum nfs_stat status;  	int error; @@ -1673,7 +1673,7 @@ out_status:   *	};   */  static int decode_write3resok(struct xdr_stream *xdr, -			      struct nfs_writeres *result) +			      struct nfs_pgio_res *result)  {  	__be32 *p; @@ -1697,7 +1697,7 @@ out_eio:  }  static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, -				  struct nfs_writeres *result) +				  struct nfs_pgio_res *result)  {  	enum nfs_stat status;  	int error; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 28842abafab..ba2affa5194 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -9,6 +9,14 @@  #ifndef __LINUX_FS_NFS_NFS4_FS_H  #define __LINUX_FS_NFS_NFS4_FS_H +#if defined(CONFIG_NFS_V4_2) +#define NFS4_MAX_MINOR_VERSION 2 +#elif defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MINOR_VERSION 1 +#else +#define NFS4_MAX_MINOR_VERSION 0 +#endif +  #if IS_ENABLED(CONFIG_NFS_V4)  #define NFS4_MAX_LOOP_ON_RECOVER (10) @@ -29,6 +37,8 @@ enum nfs4_client_state {  	
NFS4CLNT_SERVER_SCOPE_MISMATCH,  	NFS4CLNT_PURGE_STATE,  	NFS4CLNT_BIND_CONN_TO_SESSION, +	NFS4CLNT_MOVED, +	NFS4CLNT_LEASE_MOVED,  };  #define NFS4_RENEW_TIMEOUT		0x01 @@ -50,6 +60,7 @@ struct nfs4_minor_version_ops {  	const struct nfs4_state_recovery_ops *reboot_recovery_ops;  	const struct nfs4_state_recovery_ops *nograce_recovery_ops;  	const struct nfs4_state_maintenance_ops *state_renewal_ops; +	const struct nfs4_mig_recovery_ops *mig_recovery_ops;  };  #define NFS_SEQID_CONFIRMED 1 @@ -203,6 +214,12 @@ struct nfs4_state_maintenance_ops {  	int (*renew_lease)(struct nfs_client *, struct rpc_cred *);  }; +struct nfs4_mig_recovery_ops { +	int (*get_locations)(struct inode *, struct nfs4_fs_locations *, +		struct page *, struct rpc_cred *); +	int (*fsid_present)(struct inode *, struct rpc_cred *); +}; +  extern const struct dentry_operations nfs4_dentry_operations;  /* dir.c */ @@ -213,10 +230,11 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *,  extern struct file_system_type nfs4_fs_type;  /* nfs4namespace.c */ -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *);  struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,  			       struct nfs_fh *, struct nfs_fattr *); +int nfs4_replace_transport(struct nfs_server *server, +				const struct nfs4_fs_locations *locations);  /* nfs4proc.c */  extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); @@ -231,6 +249,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);  extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);  extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,  				  struct nfs4_fs_locations *, struct page 
*); +extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, +		struct page *page, struct rpc_cred *); +extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);  extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,  			    struct nfs_fh *, struct nfs_fattr *);  extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); @@ -249,6 +270,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser  extern int nfs41_setup_sequence(struct nfs4_session *session,  		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,  		struct rpc_task *task); +extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);  extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);  extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);  extern int nfs4_proc_get_lease_time(struct nfs_client *clp, @@ -315,7 +337,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,   */  static inline void  nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, -			 struct rpc_message *msg, struct nfs_write_data *wdata) +			 struct rpc_message *msg, struct nfs_pgio_data *wdata)  {  	if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&  	    !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) @@ -347,7 +369,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,  static inline void  nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, -			 struct rpc_message *msg, struct nfs_write_data *wdata) +			 struct rpc_message *msg, struct nfs_pgio_data *wdata)  {  }  #endif /* CONFIG_NFS_V4_1 */ @@ -405,12 +427,15 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);  extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);  extern void nfs_inode_find_state_and_recover(struct inode *inode,  		const 
nfs4_stateid *stateid); +extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);  extern void nfs4_schedule_lease_recovery(struct nfs_client *);  extern int nfs4_wait_clnt_recover(struct nfs_client *clp);  extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);  extern void nfs4_schedule_state_manager(struct nfs_client *);  extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);  extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_migration_recovery(const struct nfs_server *); +extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);  extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);  extern void nfs41_handle_server_scope(struct nfs_client *,  				      struct nfs41_server_scope **); @@ -476,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei  	return memcmp(dst, src, sizeof(*dst)) == 0;  } +static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) +{ +	return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0; +} + +static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ +	return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; +} +  static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)  {  	return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index a860ab566d6..aa9ef487604 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -10,6 +10,7 @@  #include <linux/sunrpc/auth.h>  #include <linux/sunrpc/xprt.h>  #include <linux/sunrpc/bc_xprt.h> +#include <linux/sunrpc/rpc_pipe_fs.h>  #include "internal.h"  #include "callback.h"  #include "delegation.h" @@ -169,7 +170,7 @@ void nfs41_shutdown_client(struct nfs_client *clp)  void nfs40_shutdown_client(struct nfs_client *clp)  {  	if 
(clp->cl_slot_tbl) { -		nfs4_release_slot_table(clp->cl_slot_tbl); +		nfs4_shutdown_slot_table(clp->cl_slot_tbl);  		kfree(clp->cl_slot_tbl);  	}  } @@ -197,6 +198,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)  	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;  	clp->cl_minorversion = cl_init->minorversion;  	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; +	clp->cl_mig_gen = 1;  	return clp;  error: @@ -368,6 +370,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,  	if (clp->cl_minorversion != 0)  		__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);  	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); +	__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); +  	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);  	if (error == -EINVAL)  		error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); @@ -407,13 +411,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,  	error = nfs4_discover_server_trunking(clp, &old);  	if (error < 0)  		goto error; -	nfs_put_client(clp); -	if (clp != old) { -		clp->cl_preserve_clid = true; -		clp = old; -	} -	return clp; +	if (clp != old) +		clp->cl_preserve_clid = true; +	nfs_put_client(clp); +	return old;  error:  	nfs_mark_client_ready(clp, error); @@ -491,9 +493,10 @@ int nfs40_walk_client_list(struct nfs_client *new,  			prev = pos;  			status = nfs_wait_client_init_complete(pos); -			spin_lock(&nn->nfs_client_lock);  			if (status < 0) -				continue; +				goto out; +			status = -NFS4ERR_STALE_CLIENTID; +			spin_lock(&nn->nfs_client_lock);  		}  		if (pos->cl_cons_state != NFS_CS_READY)  			continue; @@ -528,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new,  			*result = pos;  			dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",  				__func__, pos, atomic_read(&pos->cl_count)); +			goto out; +		case -ERESTARTSYS: +		case -ETIMEDOUT: +			/* The callback path may have been inadvertently +			 * changed. Schedule recovery! 
+			 */ +			nfs4_schedule_path_down_recovery(pos);  		default:  			goto out;  		} @@ -631,7 +641,8 @@ int nfs41_walk_client_list(struct nfs_client *new,  			}  			spin_lock(&nn->nfs_client_lock);  			if (status < 0) -				continue; +				break; +			status = -NFS4ERR_STALE_CLIENTID;  		}  		if (pos->cl_cons_state != NFS_CS_READY)  			continue; @@ -924,7 +935,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,  	dprintk("Server FSID: %llx:%llx\n",  			(unsigned long long) server->fsid.major,  			(unsigned long long) server->fsid.minor); -	dprintk("Mount FH: %d\n", mntfh->size); +	nfs_display_fhandle(mntfh, "Pseudo-fs root FH");  	nfs4_session_set_rwsize(server); @@ -947,9 +958,8 @@ out:   * Create a version 4 volume record   */  static int nfs4_init_server(struct nfs_server *server, -		const struct nfs_parsed_mount_data *data) +		struct nfs_parsed_mount_data *data)  { -	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;  	struct rpc_timeout timeparms;  	int error; @@ -961,9 +971,15 @@ static int nfs4_init_server(struct nfs_server *server,  	/* Initialise the client representation from the mount data */  	server->flags = data->flags;  	server->options = data->options; +	server->auth_info = data->auth_info; -	if (data->auth_flavor_len >= 1) -		pseudoflavor = data->auth_flavors[0]; +	/* Use the first specified auth flavor. 
If this flavor isn't +	 * allowed by the server, use the SECINFO path to try the +	 * other specified flavors */ +	if (data->auth_info.flavor_len >= 1) +		data->selected_flavor = data->auth_info.flavors[0]; +	else +		data->selected_flavor = RPC_AUTH_UNIX;  	/* Get a client record */  	error = nfs4_set_client(server, @@ -971,7 +987,7 @@ static int nfs4_init_server(struct nfs_server *server,  			(const struct sockaddr *)&data->nfs_server.address,  			data->nfs_server.addrlen,  			data->client_address, -			pseudoflavor, +			data->selected_flavor,  			data->nfs_server.protocol,  			&timeparms,  			data->minorversion, @@ -991,7 +1007,8 @@ static int nfs4_init_server(struct nfs_server *server,  	server->port = data->nfs_server.port; -	error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor); +	error = nfs_init_server_rpcclient(server, &timeparms, +					  data->selected_flavor);  error:  	/* Done */ @@ -1018,7 +1035,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,  	if (!server)  		return ERR_PTR(-ENOMEM); -	auth_probe = mount_info->parsed->auth_flavor_len < 1; +	auth_probe = mount_info->parsed->auth_info.flavor_len < 1;  	/* set up the general RPC client */  	error = nfs4_init_server(server, mount_info->parsed); @@ -1046,6 +1063,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,  {  	struct nfs_client *parent_client;  	struct nfs_server *server, *parent_server; +	bool auth_probe;  	int error;  	dprintk("--> nfs4_create_referral_server()\n"); @@ -1078,8 +1096,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,  	if (error < 0)  		goto error; -	error = nfs4_server_common_setup(server, mntfh, -			!(parent_server->flags & NFS_MOUNT_SECFLAVOUR)); +	auth_probe = parent_server->auth_info.flavor_len < 1; + +	error = nfs4_server_common_setup(server, mntfh, auth_probe);  	if (error < 0)  		goto error; @@ -1091,3 +1110,112 @@ error:  	dprintk("<-- nfs4_create_referral_server() = 
error %d\n", error);  	return ERR_PTR(error);  } + +/* + * Grab the destination's particulars, including lease expiry time. + * + * Returns zero if probe succeeded and retrieved FSID matches the FSID + * we have cached. + */ +static int nfs_probe_destination(struct nfs_server *server) +{ +	struct inode *inode = server->super->s_root->d_inode; +	struct nfs_fattr *fattr; +	int error; + +	fattr = nfs_alloc_fattr(); +	if (fattr == NULL) +		return -ENOMEM; + +	/* Sanity: the probe won't work if the destination server +	 * does not recognize the migrated FH. */ +	error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr); + +	nfs_free_fattr(fattr); +	return error; +} + +/** + * nfs4_update_server - Move an nfs_server to a different nfs_client + * + * @server: represents FSID to be moved + * @hostname: new end-point's hostname + * @sap: new end-point's socket address + * @salen: size of "sap" + * @net: net namespace + * + * The nfs_server must be quiescent before this function is invoked. + * Either its session is drained (NFSv4.1+), or its transport is + * plugged and drained (NFSv4.0). + * + * Returns zero on success, or a negative errno value. 
+ */ +int nfs4_update_server(struct nfs_server *server, const char *hostname, +		       struct sockaddr *sap, size_t salen, struct net *net) +{ +	struct nfs_client *clp = server->nfs_client; +	struct rpc_clnt *clnt = server->client; +	struct xprt_create xargs = { +		.ident		= clp->cl_proto, +		.net		= net, +		.dstaddr	= sap, +		.addrlen	= salen, +		.servername	= hostname, +	}; +	char buf[INET6_ADDRSTRLEN + 1]; +	struct sockaddr_storage address; +	struct sockaddr *localaddr = (struct sockaddr *)&address; +	int error; + +	dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, +			(unsigned long long)server->fsid.major, +			(unsigned long long)server->fsid.minor, +			hostname); + +	error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); +	if (error != 0) { +		dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", +			__func__, error); +		goto out; +	} + +	error = rpc_localaddr(clnt, localaddr, sizeof(address)); +	if (error != 0) { +		dprintk("<-- %s(): rpc_localaddr returned %d\n", +			__func__, error); +		goto out; +	} + +	error = -EAFNOSUPPORT; +	if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { +		dprintk("<-- %s(): rpc_ntop returned %d\n", +			__func__, error); +		goto out; +	} + +	nfs_server_remove_lists(server); +	error = nfs4_set_client(server, hostname, sap, salen, buf, +				clp->cl_rpcclient->cl_auth->au_flavor, +				clp->cl_proto, clnt->cl_timeout, +				clp->cl_minorversion, net); +	nfs_put_client(clp); +	if (error != 0) { +		nfs_server_insert_lists(server); +		dprintk("<-- %s(): nfs4_set_client returned %d\n", +			__func__, error); +		goto out; +	} + +	if (server->nfs_client->cl_hostname == NULL) +		server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); +	nfs_server_insert_lists(server); + +	error = nfs_probe_destination(server); +	if (error < 0) +		goto out; + +	dprintk("<-- %s() succeeded\n", __func__); + +out: +	return error; +} diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e5b804dd944..a816f0627a6 100644 
--- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -19,6 +19,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)  	struct inode *dir;  	unsigned openflags = filp->f_flags;  	struct iattr attr; +	int opened = 0;  	int err;  	/* @@ -30,9 +31,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)  	 * -EOPENSTALE.  The VFS will retry the lookup/create/open.  	 */ -	dprintk("NFS: open file(%s/%s)\n", -		dentry->d_parent->d_name.name, -		dentry->d_name.name); +	dprintk("NFS: open file(%pd2)\n", dentry);  	if ((openflags & O_ACCMODE) == 3)  		openflags--; @@ -55,7 +54,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)  		nfs_wb_all(inode);  	} -	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); +	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);  	if (IS_ERR(inode)) {  		err = PTR_ERR(inode);  		switch (err) { @@ -74,7 +73,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)  	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));  	nfs_file_set_open_context(filp, ctx); -	nfs_fscache_set_inode_cookie(inode, filp); +	nfs_fscache_open_file(inode, filp);  	err = 0;  out_put_ctx: @@ -101,8 +100,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)  			break;  		mutex_lock(&inode->i_mutex);  		ret = nfs_file_fsync_commit(file, start, end, datasync); -		if (!ret && !datasync) -			/* application has asked for meta-data sync */ +		if (!ret)  			ret = pnfs_layoutcommit_inode(inode, true);  		mutex_unlock(&inode->i_mutex);  		/* @@ -119,10 +117,10 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)  const struct file_operations nfs4_file_operations = {  	.llseek		= nfs_file_llseek, -	.read		= do_sync_read, -	.write		= do_sync_write, -	.aio_read	= nfs_file_read, -	.aio_write	= nfs_file_write, +	.read		= new_sync_read, +	.write		= new_sync_write, +	.read_iter	= nfs_file_read, +	.write_iter	= nfs_file_write,  	.mmap		= nfs_file_mmap,  	.open		= nfs4_file_open,  
	.flush		= nfs_file_flush, @@ -131,7 +129,7 @@ const struct file_operations nfs4_file_operations = {  	.lock		= nfs_lock,  	.flock		= nfs_flock,  	.splice_read	= nfs_file_splice_read, -	.splice_write	= nfs_file_splice_write, +	.splice_write	= iter_file_splice_write,  	.check_flags	= nfs_check_flags,  	.setlease	= nfs_setlease,  }; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2288cd3c927..3d83cb1fdc7 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,9 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry,  }  static size_t nfs_parse_server_name(char *string, size_t len, -		struct sockaddr *sa, size_t salen, struct nfs_server *server) +		struct sockaddr *sa, size_t salen, struct net *net)  { -	struct net *net = rpc_net_ns(server->client);  	ssize_t ret;  	ret = rpc_pton(net, string, len, sa, salen); @@ -137,17 +136,25 @@ static size_t nfs_parse_server_name(char *string, size_t len,  /**   * nfs_find_best_sec - Find a security mechanism supported locally + * @server: NFS server struct   * @flavors: List of security tuples returned by SECINFO procedure   * - * Return the pseudoflavor of the first security mechanism in - * "flavors" that is locally supported.  Return RPC_AUTH_UNIX if - * no matching flavor is found in the array.  The "flavors" array + * Return an rpc client that uses the first security mechanism in + * "flavors" that is locally supported.  The "flavors" array   * is searched in the order returned from the server, per RFC 3530 - * recommendation. + * recommendation and each flavor is checked for membership in the + * sec= mount option list if it exists. + * + * Return -EPERM if no matching flavor is found in the array. + * + * Please call rpc_shutdown_client() when you are done with this rpc client. 
+ *   */ -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, +					  struct nfs_server *server, +					  struct nfs4_secinfo_flavors *flavors)  { -	rpc_authflavor_t pseudoflavor; +	rpc_authflavor_t pflavor;  	struct nfs4_secinfo4 *secinfo;  	unsigned int i; @@ -158,55 +165,73 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)  		case RPC_AUTH_NULL:  		case RPC_AUTH_UNIX:  		case RPC_AUTH_GSS: -			pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, +			pflavor = rpcauth_get_pseudoflavor(secinfo->flavor,  							&secinfo->flavor_info); -			if (pseudoflavor != RPC_AUTH_MAXFLAVOR) -				return pseudoflavor; -			break; +			/* does the pseudoflavor match a sec= mount opt? */ +			if (pflavor != RPC_AUTH_MAXFLAVOR && +			    nfs_auth_info_match(&server->auth_info, pflavor)) { +				struct rpc_clnt *new; +				struct rpc_cred *cred; + +				/* Cloning creates an rpc_auth for the flavor */ +				new = rpc_clone_client_set_auth(clnt, pflavor); +				if (IS_ERR(new)) +					continue; +				/** +				* Check that the user actually can use the +				* flavor. This is mostly for RPC_AUTH_GSS +				* where cr_init obtains a gss context +				*/ +				cred = rpcauth_lookupcred(new->cl_auth, 0); +				if (IS_ERR(cred)) { +					rpc_shutdown_client(new); +					continue; +				} +				put_rpccred(cred); +				return new; +			}  		}  	} - -	return RPC_AUTH_UNIX; +	return ERR_PTR(-EPERM);  } -static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) +/** + * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup, + * return an rpc_clnt that uses the best available security flavor with + * respect to the secinfo flavor list and the sec= mount options. + * + * @clnt: RPC client to clone + * @inode: directory inode + * @name: lookup name + * + * Please call rpc_shutdown_client() when you are done with this rpc client. 
+ */ +struct rpc_clnt * +nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, +					struct qstr *name)  {  	struct page *page;  	struct nfs4_secinfo_flavors *flavors; -	rpc_authflavor_t flavor; +	struct rpc_clnt *new;  	int err;  	page = alloc_page(GFP_KERNEL);  	if (!page) -		return -ENOMEM; +		return ERR_PTR(-ENOMEM); +  	flavors = page_address(page);  	err = nfs4_proc_secinfo(inode, name, flavors);  	if (err < 0) { -		flavor = err; +		new = ERR_PTR(err);  		goto out;  	} -	flavor = nfs_find_best_sec(flavors); +	new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors);  out:  	put_page(page); -	return flavor; -} - -/* - * Please call rpc_shutdown_client() when you are done with this client. - */ -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, -					struct qstr *name) -{ -	rpc_authflavor_t flavor; - -	flavor = nfs4_negotiate_security(inode, name); -	if ((int)flavor < 0) -		return ERR_PTR((int)flavor); - -	return rpc_clone_client_set_auth(clnt, flavor); +	return new;  }  static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, @@ -214,6 +239,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,  				     const struct nfs4_fs_location *location)  {  	const size_t addr_bufsize = sizeof(struct sockaddr_storage); +	struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client);  	struct vfsmount *mnt = ERR_PTR(-ENOENT);  	char *mnt_path;  	unsigned int maxbuflen; @@ -239,8 +265,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,  			continue;  		mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, -				mountdata->addr, addr_bufsize, -				NFS_SB(mountdata->sb)); +				mountdata->addr, addr_bufsize, net);  		if (mountdata->addrlen == 0)  			continue; @@ -283,8 +308,7 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry,  	if (locations == NULL || locations->nlocations <= 0)  		goto out; -	dprintk("%s: referral at %s/%s\n", __func__, -	
	dentry->d_parent->d_name.name, dentry->d_name.name); +	dprintk("%s: referral at %pd2\n", __func__, dentry);  	page = (char *) __get_free_page(GFP_USER);  	if (!page) @@ -348,8 +372,8 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *  	mnt = ERR_PTR(-ENOENT);  	parent = dget_parent(dentry); -	dprintk("%s: getting locations for %s/%s\n", -		__func__, parent->d_name.name, dentry->d_name.name); +	dprintk("%s: getting locations for %pd2\n", +		__func__, dentry);  	err = nfs4_proc_fs_locations(client, parent->d_inode, &dentry->d_name, fs_locations, page);  	dput(parent); @@ -390,13 +414,110 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,  	if (client->cl_auth->au_flavor != flavor)  		flavor = client->cl_auth->au_flavor; -	else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) { -		rpc_authflavor_t new = nfs4_negotiate_security(dir, name); -		if ((int)new >= 0) -			flavor = new; -	}  	mnt = nfs_do_submount(dentry, fh, fattr, flavor);  out:  	rpc_shutdown_client(client);  	return mnt;  } + +/* + * Try one location from the fs_locations array. + * + * Returns zero on success, or a negative errno value. 
+ */ +static int nfs4_try_replacing_one_location(struct nfs_server *server, +		char *page, char *page2, +		const struct nfs4_fs_location *location) +{ +	const size_t addr_bufsize = sizeof(struct sockaddr_storage); +	struct net *net = rpc_net_ns(server->client); +	struct sockaddr *sap; +	unsigned int s; +	size_t salen; +	int error; + +	sap = kmalloc(addr_bufsize, GFP_KERNEL); +	if (sap == NULL) +		return -ENOMEM; + +	error = -ENOENT; +	for (s = 0; s < location->nservers; s++) { +		const struct nfs4_string *buf = &location->servers[s]; +		char *hostname; + +		if (buf->len <= 0 || buf->len > PAGE_SIZE) +			continue; + +		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL) +			continue; + +		salen = nfs_parse_server_name(buf->data, buf->len, +						sap, addr_bufsize, net); +		if (salen == 0) +			continue; +		rpc_set_port(sap, NFS_PORT); + +		error = -ENOMEM; +		hostname = kstrndup(buf->data, buf->len, GFP_KERNEL); +		if (hostname == NULL) +			break; + +		error = nfs4_update_server(server, hostname, sap, salen, net); +		kfree(hostname); +		if (error == 0) +			break; +	} + +	kfree(sap); +	return error; +} + +/** + * nfs4_replace_transport - set up transport to destination server + * + * @server: export being migrated + * @locations: fs_locations array + * + * Returns zero on success, or a negative errno value. + * + * The client tries all the entries in the "locations" array, in the + * order returned by the server, until one works or the end of the + * array is reached. 
+ */ +int nfs4_replace_transport(struct nfs_server *server, +			   const struct nfs4_fs_locations *locations) +{ +	char *page = NULL, *page2 = NULL; +	int loc, error; + +	error = -ENOENT; +	if (locations == NULL || locations->nlocations <= 0) +		goto out; + +	error = -ENOMEM; +	page = (char *) __get_free_page(GFP_USER); +	if (!page) +		goto out; +	page2 = (char *) __get_free_page(GFP_USER); +	if (!page2) +		goto out; + +	for (loc = 0; loc < locations->nlocations; loc++) { +		const struct nfs4_fs_location *location = +						&locations->locations[loc]; + +		if (location == NULL || location->nservers <= 0 || +		    location->rootpath.ncomponents == 0) +			continue; + +		error = nfs4_try_replacing_one_location(server, page, +							page2, location); +		if (error == 0) +			break; +	} + +out: +	free_page((unsigned long)page); +	free_page((unsigned long)page2); +	return error; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 989bb9d3074..4bf3d97cc5a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,9 +105,6 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,  	if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)  		return NULL; -	if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2) -		return NULL; -  	err = security_dentry_init_security(dentry, sattr->ia_mode,  				&dentry->d_name, (void **)&label->label, &label->len);  	if (err == 0) @@ -384,6 +381,14 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc  		case -NFS4ERR_STALE_CLIENTID:  			nfs4_schedule_lease_recovery(clp);  			goto wait_on_recovery; +		case -NFS4ERR_MOVED: +			ret = nfs4_schedule_migration_recovery(server); +			if (ret < 0) +				break; +			goto wait_on_recovery; +		case -NFS4ERR_LEASE_MOVED: +			nfs4_schedule_lease_moved_recovery(clp); +			goto wait_on_recovery;  #if defined(CONFIG_NFS_V4_1)  		case -NFS4ERR_BADSESSION:  		case -NFS4ERR_BADSLOT: @@ -431,6 +436,8 @@ static int nfs4_handle_exception(struct nfs_server *server, 
int errorcode, struc  	return nfs4_map_errors(ret);  wait_on_recovery:  	ret = nfs4_wait_clnt_recover(clp); +	if (test_bit(NFS_MIG_FAILED, &server->mig_status)) +		return -EIO;  	if (ret == 0)  		exception->retry = 1;  	return ret; @@ -532,7 +539,7 @@ static int nfs40_sequence_done(struct rpc_task *task,  	struct nfs4_slot *slot = res->sr_slot;  	struct nfs4_slot_table *tbl; -	if (!RPC_WAS_SENT(task)) +	if (slot == NULL)  		goto out;  	tbl = slot->table; @@ -552,15 +559,10 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)  {  	struct nfs4_session *session;  	struct nfs4_slot_table *tbl; +	struct nfs4_slot *slot = res->sr_slot;  	bool send_new_highest_used_slotid = false; -	if (!res->sr_slot) { -		/* just wake up the next guy waiting since -		 * we may have not consumed a slot after all */ -		dprintk("%s: No slot\n", __func__); -		return; -	} -	tbl = res->sr_slot->table; +	tbl = slot->table;  	session = tbl->session;  	spin_lock(&tbl->slot_tbl_lock); @@ -570,11 +572,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)  	if (tbl->highest_used_slotid > tbl->target_highest_slotid)  		send_new_highest_used_slotid = true; -	if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { +	if (nfs41_wake_and_assign_slot(tbl, slot)) {  		send_new_highest_used_slotid = false;  		goto out_unlock;  	} -	nfs4_free_slot(tbl, res->sr_slot); +	nfs4_free_slot(tbl, slot);  	if (tbl->highest_used_slotid != NFS4_NO_SLOT)  		send_new_highest_used_slotid = false; @@ -585,19 +587,20 @@ out_unlock:  		nfs41_server_notify_highest_slotid_update(session->clp);  } -static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)  {  	struct nfs4_session *session; -	struct nfs4_slot *slot; +	struct nfs4_slot *slot = res->sr_slot;  	struct nfs_client *clp;  	bool interrupted = false;  	int ret = 1; +	if (slot == NULL) +		goto out_noaction;  	/* don't increment the 
sequence number if the task wasn't sent */  	if (!RPC_WAS_SENT(task))  		goto out; -	slot = res->sr_slot;  	session = slot->table->session;  	if (slot->interrupted) { @@ -672,6 +675,7 @@ out:  	/* The session may be reset by one of the error handlers. */  	dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);  	nfs41_sequence_free_slot(res); +out_noaction:  	return ret;  retry_nowait:  	if (rpc_restart_call_prepare(task)) { @@ -685,6 +689,7 @@ out_retry:  	rpc_delay(task, NFS4_POLL_RETRY_MAX);  	return 0;  } +EXPORT_SYMBOL_GPL(nfs41_sequence_done);  static int nfs4_sequence_done(struct rpc_task *task,  			       struct nfs4_sequence_res *res) @@ -912,6 +917,7 @@ struct nfs4_opendata {  	struct iattr attrs;  	unsigned long timestamp;  	unsigned int rpc_done : 1; +	unsigned int file_created : 1;  	unsigned int is_recover : 1;  	int rpc_status;  	int cancelled; @@ -1062,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref)  	dput(p->dentry);  	nfs_sb_deactive(sb);  	nfs_fattr_free_names(&p->f_attr); +	kfree(p->f_attr.mdsthreshold);  	kfree(p);  } @@ -1131,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)  	nfs4_state_set_mode_locked(state, state->state | fmode);  } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) +{ +	struct nfs_client *clp = state->owner->so_server->nfs_client; +	bool need_recover = false; + +	if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly) +		need_recover = true; +	if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly) +		need_recover = true; +	if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr) +		need_recover = true; +	if (need_recover) +		nfs4_state_mark_reclaim_nograce(clp, state); +} + +static bool nfs_need_update_open_stateid(struct nfs4_state *state, +		nfs4_stateid *stateid)  
{ +	if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) +		return true; +	if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { +		nfs_test_and_clear_all_open_stateid(state); +		return true; +	} +	if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) +		return true; +	return false; +} + +static void nfs_clear_open_stateid_locked(struct nfs4_state *state, +		nfs4_stateid *stateid, fmode_t fmode) +{ +	clear_bit(NFS_O_RDWR_STATE, &state->flags); +	switch (fmode & (FMODE_READ|FMODE_WRITE)) { +	case FMODE_WRITE: +		clear_bit(NFS_O_RDONLY_STATE, &state->flags); +		break; +	case FMODE_READ: +		clear_bit(NFS_O_WRONLY_STATE, &state->flags); +		break; +	case 0: +		clear_bit(NFS_O_RDONLY_STATE, &state->flags); +		clear_bit(NFS_O_WRONLY_STATE, &state->flags); +		clear_bit(NFS_OPEN_STATE, &state->flags); +	} +	if (stateid == NULL) +		return; +	if (!nfs_need_update_open_stateid(state, stateid)) +		return;  	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)  		nfs4_stateid_copy(&state->stateid, stateid);  	nfs4_stateid_copy(&state->open_stateid, stateid); -	set_bit(NFS_OPEN_STATE, &state->flags); +} + +static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ +	write_seqlock(&state->seqlock); +	nfs_clear_open_stateid_locked(state, stateid, fmode); +	write_sequnlock(&state->seqlock); +	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) +		nfs4_schedule_state_manager(state->owner->so_server->nfs_client); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{  	switch (fmode) {  		case FMODE_READ:  			set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -1147,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *  		case FMODE_READ|FMODE_WRITE:  			set_bit(NFS_O_RDWR_STATE, &state->flags);  	} -} - -static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) -{ -	
write_seqlock(&state->seqlock); -	nfs_set_open_stateid_locked(state, stateid, fmode); -	write_sequnlock(&state->seqlock); +	if (!nfs_need_update_open_stateid(state, stateid)) +		return; +	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +		nfs4_stateid_copy(&state->stateid, stateid); +	nfs4_stateid_copy(&state->open_stateid, stateid);  }  static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) @@ -1211,6 +1275,8 @@ no_delegation:  		__update_open_stateid(state, open_stateid, NULL, fmode);  		ret = 1;  	} +	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) +		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);  	return ret;  } @@ -1317,31 +1383,24 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)  	int ret;  	if (!data->rpc_done) { -		ret = data->rpc_status; -		goto err; +		if (data->rpc_status) { +			ret = data->rpc_status; +			goto err; +		} +		/* cached opens have already been processed */ +		goto update;  	} -	ret = -ESTALE; -	if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) || -	    !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) || -	    !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE)) -		goto err; - -	ret = -ENOMEM; -	state = nfs4_get_open_state(inode, data->owner); -	if (state == NULL) -		goto err; -  	ret = nfs_refresh_inode(inode, &data->f_attr);  	if (ret)  		goto err; -	nfs_setsecurity(inode, &data->f_attr, data->f_label); -  	if (data->o_res.delegation_type != 0)  		nfs4_opendata_check_deleg(data, state); +update:  	update_open_stateid(state, &data->o_res.stateid, NULL,  			    data->o_arg.fmode); +	atomic_inc(&state->count);  	return state;  err: @@ -1451,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *  	struct nfs4_state *newstate;  	int ret; +	/* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ +	clear_bit(NFS_O_RDWR_STATE, &state->flags); +	
clear_bit(NFS_O_WRONLY_STATE, &state->flags); +	clear_bit(NFS_O_RDONLY_STATE, &state->flags);  	/* memory barrier prior to reading state->n_* */  	clear_bit(NFS_DELEGATED_STATE, &state->flags);  	clear_bit(NFS_OPEN_STATE, &state->flags);  	smp_rmb();  	if (state->n_rdwr != 0) { -		clear_bit(NFS_O_RDWR_STATE, &state->flags);  		ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);  		if (ret != 0)  			return ret; @@ -1464,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *  			return -ESTALE;  	}  	if (state->n_wronly != 0) { -		clear_bit(NFS_O_WRONLY_STATE, &state->flags);  		ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);  		if (ret != 0)  			return ret; @@ -1472,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *  			return -ESTALE;  	}  	if (state->n_rdonly != 0) { -		clear_bit(NFS_O_RDONLY_STATE, &state->flags);  		ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);  		if (ret != 0)  			return ret; @@ -1574,6 +1634,12 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct  			/* Don't recall a delegation if it was lost */  			nfs4_schedule_lease_recovery(server->nfs_client);  			return -EAGAIN; +		case -NFS4ERR_MOVED: +			nfs4_schedule_migration_recovery(server); +			return -EAGAIN; +		case -NFS4ERR_LEASE_MOVED: +			nfs4_schedule_lease_moved_recovery(server->nfs_client); +			return -EAGAIN;  		case -NFS4ERR_DELEG_REVOKED:  		case -NFS4ERR_ADMIN_REVOKED:  		case -NFS4ERR_BAD_STATEID: @@ -1615,15 +1681,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs4_opendata *data = calldata; -	nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, -				&data->o_res.seq_res, task); +	nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, +				&data->c_res.seq_res, task);  }  static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 
 {  	struct nfs4_opendata *data = calldata; -	nfs40_sequence_done(task, &data->o_res.seq_res); +	nfs40_sequence_done(task, &data->c_res.seq_res);  	data->rpc_status = task->tk_status;  	if (data->rpc_status == 0) { @@ -1681,7 +1747,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)  	};  	int status; -	nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); +	nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1);  	kref_get(&data->kref);  	data->rpc_done = 0;  	data->rpc_status = 0; @@ -1946,8 +2012,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)  	nfs_fattr_map_and_free_names(server, &data->f_attr); -	if (o_arg->open_flags & O_CREAT) +	if (o_arg->open_flags & O_CREAT) {  		update_changeattr(dir, &o_res->cinfo); +		if (o_arg->open_flags & O_EXCL) +			data->file_created = 1; +		else if (o_res->cinfo.before != o_res->cinfo.after) +			data->file_created = 1; +	}  	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)  		server->caps &= ~NFS_CAP_POSIX_LOCK;  	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { @@ -1956,7 +2027,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)  			return status;  	}  	if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) -		_nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); +		nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);  	return 0;  } @@ -2191,7 +2262,8 @@ static int _nfs4_do_open(struct inode *dir,  			struct nfs_open_context *ctx,  			int flags,  			struct iattr *sattr, -			struct nfs4_label *label) +			struct nfs4_label *label, +			int *opened)  {  	struct nfs4_state_owner  *sp;  	struct nfs4_state     *state = NULL; @@ -2233,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir,  		}  	} -	if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { -		opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); -		if (!opendata->f_attr.mdsthreshold) -			goto err_free_label; +	if (server->attr_bitmask[2] & 
FATTR4_WORD2_MDSTHRESHOLD) { +		if (!opendata->f_attr.mdsthreshold) { +			opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); +			if (!opendata->f_attr.mdsthreshold) +				goto err_free_label; +		}  		opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];  	}  	if (dentry->d_inode != NULL) @@ -2261,12 +2335,13 @@ static int _nfs4_do_open(struct inode *dir,  			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);  		}  	} +	if (opendata->file_created) +		*opened |= FILE_CREATED; -	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) +	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {  		*ctx_th = opendata->f_attr.mdsthreshold; -	else -		kfree(opendata->f_attr.mdsthreshold); -	opendata->f_attr.mdsthreshold = NULL; +		opendata->f_attr.mdsthreshold = NULL; +	}  	nfs4_label_free(olabel); @@ -2276,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir,  err_free_label:  	nfs4_label_free(olabel);  err_opendata_put: -	kfree(opendata->f_attr.mdsthreshold);  	nfs4_opendata_put(opendata);  err_put_state_owner:  	nfs4_put_state_owner(sp); @@ -2289,7 +2363,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,  					struct nfs_open_context *ctx,  					int flags,  					struct iattr *sattr, -					struct nfs4_label *label) +					struct nfs4_label *label, +					int *opened)  {  	struct nfs_server *server = NFS_SERVER(dir);  	struct nfs4_exception exception = { }; @@ -2297,7 +2372,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,  	int status;  	do { -		status = _nfs4_do_open(dir, ctx, flags, sattr, label); +		status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened);  		res = ctx->state;  		trace_nfs4_open_file(ctx, flags, status);  		if (status == 0) @@ -2384,13 +2459,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  	if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {  		/* Use that stateid */ -	} else if (truncate && state != NULL && 
nfs4_valid_open_stateid(state)) { +	} else if (truncate && state != NULL) {  		struct nfs_lockowner lockowner = {  			.l_owner = current->files,  			.l_pid = current->tgid,  		}; -		nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, -				&lockowner); +		if (!nfs4_valid_open_stateid(state)) +			return -EBADF; +		if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, +				&lockowner) == -EIO) +			return -EBADF;  	} else  		nfs4_stateid_copy(&arg.stateid, &zero_stateid); @@ -2462,26 +2540,6 @@ static void nfs4_free_closedata(void *data)  	kfree(calldata);  } -static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, -		fmode_t fmode) -{ -	spin_lock(&state->owner->so_lock); -	clear_bit(NFS_O_RDWR_STATE, &state->flags); -	switch (fmode & (FMODE_READ|FMODE_WRITE)) { -	case FMODE_WRITE: -		clear_bit(NFS_O_RDONLY_STATE, &state->flags); -		break; -	case FMODE_READ: -		clear_bit(NFS_O_WRONLY_STATE, &state->flags); -		break; -	case 0: -		clear_bit(NFS_O_RDONLY_STATE, &state->flags); -		clear_bit(NFS_O_WRONLY_STATE, &state->flags); -		clear_bit(NFS_OPEN_STATE, &state->flags); -	} -	spin_unlock(&state->owner->so_lock); -} -  static void nfs4_close_done(struct rpc_task *task, void *data)  {  	struct nfs4_closedata *calldata = data; @@ -2500,11 +2558,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data)  			if (calldata->roc)  				pnfs_roc_set_barrier(state->inode,  						     calldata->roc_barrier); -			nfs_set_open_stateid(state, &calldata->res.stateid, 0); +			nfs_clear_open_stateid(state, &calldata->res.stateid, 0);  			renew_lease(server, calldata->timestamp); -			nfs4_close_clear_stateid_flags(state, -					calldata->arg.fmode); -			break; +			goto out_release; +		case -NFS4ERR_ADMIN_REVOKED:  		case -NFS4ERR_STALE_STATEID:  		case -NFS4ERR_OLD_STATEID:  		case -NFS4ERR_BAD_STATEID: @@ -2512,9 +2569,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)  			if (calldata->arg.fmode == 0)  				break;  		default: -			if 
(nfs4_async_handle_error(task, server, state) == -EAGAIN) +			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {  				rpc_restart_call_prepare(task); +				goto out_release; +			}  	} +	nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); +out_release:  	nfs_release_seqid(calldata->arg.seqid);  	nfs_refresh_inode(calldata->inode, calldata->res.fattr);  	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); @@ -2659,7 +2720,8 @@ out:  }  static struct inode * -nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, +		int open_flags, struct iattr *attr, int *opened)  {  	struct nfs4_state *state;  	struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; @@ -2667,7 +2729,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags  	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);  	/* Protect against concurrent sillydeletes */ -	state = nfs4_do_open(dir, ctx, open_flags, attr, label); +	state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened);  	nfs4_label_release_security(label); @@ -2686,6 +2748,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)  		nfs4_close_state(ctx->state, ctx->mode);  } +#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) +#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) +  static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)  {  	struct nfs4_server_caps_arg args = { @@ -2701,13 +2767,27 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f  	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);  	if (status == 0) { +		/* Sanity check the server answers */ +		switch (server->nfs_client->cl_minorversion) { +		case 0: +			
res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; +			res.attr_bitmask[2] = 0; +			break; +		case 1: +			res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; +			break; +		case 2: +			res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; +		}  		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));  		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|  				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|  				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|  				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| -				NFS_CAP_CTIME|NFS_CAP_MTIME); -		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) +				NFS_CAP_CTIME|NFS_CAP_MTIME| +				NFS_CAP_SECURITY_LABEL); +		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && +				res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)  			server->caps |= NFS_CAP_ACLS;  		if (res.has_links != 0)  			server->caps |= NFS_CAP_HARDLINKS; @@ -2735,14 +2815,12 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f  #endif  		memcpy(server->attr_bitmask_nl, res.attr_bitmask,  				sizeof(server->attr_bitmask)); +		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; -		if (server->caps & NFS_CAP_SECURITY_LABEL) { -			server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; -			res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; -		}  		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));  		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;  		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; +		server->cache_consistency_bitmask[2] = 0;  		server->acl_bitmask = res.acl_bitmask;  		server->fh_expire_type = res.fh_expire_type;  	} @@ -2853,11 +2931,24 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,  	int status = -EPERM;  	size_t i; -	for (i = 0; i < ARRAY_SIZE(flav_array); i++) { -		status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); -		if (status == -NFS4ERR_WRONGSEC || status 
== -EACCES) -			continue; -		break; +	if (server->auth_info.flavor_len > 0) { +		/* try each flavor specified by user */ +		for (i = 0; i < server->auth_info.flavor_len; i++) { +			status = nfs4_lookup_root_sec(server, fhandle, info, +						server->auth_info.flavors[i]); +			if (status == -NFS4ERR_WRONGSEC || status == -EACCES) +				continue; +			break; +		} +	} else { +		/* no flavors specified by user, try default list */ +		for (i = 0; i < ARRAY_SIZE(flav_array); i++) { +			status = nfs4_lookup_root_sec(server, fhandle, info, +						      flav_array[i]); +			if (status == -NFS4ERR_WRONGSEC || status == -EACCES) +				continue; +			break; +		}  	}  	/* @@ -2899,9 +2990,6 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,  		status = nfs4_lookup_root(server, fhandle, info);  		if (status != -NFS4ERR_WRONGSEC)  			break; -		/* Did user force a 'sec=' mount option? */ -		if (server->flags & NFS_MOUNT_SECFLAVOUR) -			break;  	default:  		status = nfs4_do_find_root_sec(server, fhandle, info);  	} @@ -2970,11 +3058,16 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,  	status = nfs4_proc_fs_locations(client, dir, name, locations, page);  	if (status != 0)  		goto out; -	/* Make sure server returned a different fsid for the referral */ + +	/* +	 * If the fsid didn't change, this is a migration event, not a +	 * referral.  Cause us to drop into the exception handler, which +	 * will kick off migration recovery. 
+	 */  	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {  		dprintk("%s: server did not return a different fsid for"  			" a referral at %s\n", __func__, name->name); -		status = -EIO; +		status = -NFS4ERR_MOVED;  		goto out;  	}  	/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ @@ -3154,10 +3247,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,  			err = -EPERM;  			if (client != *clnt)  				goto out; -			/* No security negotiation if the user specified 'sec=' */ -			if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR) -				goto out; -			client = nfs4_create_sec_client(client, dir, name); +			client = nfs4_negotiate_security(client, dir, name);  			if (IS_ERR(client))  				return PTR_ERR(client); @@ -3332,6 +3422,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	struct nfs4_label l, *ilabel = NULL;  	struct nfs_open_context *ctx;  	struct nfs4_state *state; +	int opened = 0;  	int status = 0;  	ctx = alloc_nfs_open_context(dentry, FMODE_READ); @@ -3341,7 +3432,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);  	sattr->ia_mode &= ~current_umask(); -	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel); +	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);  	if (IS_ERR(state)) {  		status = PTR_ERR(state);  		goto out; @@ -3457,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  	return 1;  } -static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, -		struct inode *new_dir, struct qstr *new_name) -{ -	struct nfs_server *server = NFS_SERVER(old_dir); -	struct nfs_renameargs arg = { -		.old_dir = NFS_FH(old_dir), -		.new_dir = NFS_FH(new_dir), -		.old_name = old_name, -		.new_name = new_name, -	}; -	struct nfs_renameres res = { -		.server = server, -	}; -	struct rpc_message msg = { -		.rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_RENAME], -		.rpc_argp = &arg, -		.rpc_resp = &res, -	}; -	int status = -ENOMEM; - -	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); -	if (!status) { -		update_changeattr(old_dir, &res.old_cinfo); -		update_changeattr(new_dir, &res.new_cinfo); -	} -	return status; -} - -static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, -		struct inode *new_dir, struct qstr *new_name) -{ -	struct nfs4_exception exception = { }; -	int err; -	do { -		err = _nfs4_proc_rename(old_dir, old_name, -					new_dir, new_name); -		trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err); -		err = nfs4_handle_exception(NFS_SERVER(old_dir), err, -				&exception); -	} while (exception.retry); -	return err; -} -  static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)  {  	struct nfs_server *server = NFS_SERVER(inode); @@ -3726,9 +3774,8 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,  	};  	int			status; -	dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __func__, -			dentry->d_parent->d_name.name, -			dentry->d_name.name, +	dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, +			dentry,  			(unsigned long long)cookie);  	nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);  	res.pgbase = args.pgbase; @@ -3965,8 +4012,9 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid,  {  	nfs4_stateid current_stateid; -	if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode)) -		return false; +	/* If the current stateid represents a lost lock, then exit */ +	if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO) +		return true;  	return nfs4_stateid_match(stateid, &current_stateid);  } @@ -3985,12 +4033,12 @@ static bool nfs4_error_stateid_expired(int err)  	return false;  } -void __nfs4_read_done_cb(struct nfs_read_data *data) +void __nfs4_read_done_cb(struct nfs_pgio_data *data)  {  	nfs_invalidate_atime(data->header->inode);  
} -static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)  {  	struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4007,7 +4055,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)  }  static bool nfs4_read_stateid_changed(struct rpc_task *task, -		struct nfs_readargs *args) +		struct nfs_pgio_args *args)  {  	if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4020,7 +4068,7 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,  	return true;  } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)  {  	dprintk("--> %s\n", __func__); @@ -4029,19 +4077,19 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)  		return -EAGAIN;  	if (nfs4_read_stateid_changed(task, &data->args))  		return -EAGAIN; -	return data->read_done_cb ? data->read_done_cb(task, data) : +	return data->pgio_done_cb ? 
data->pgio_done_cb(task, data) :  				    nfs4_read_done_cb(task, data);  } -static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	data->timestamp   = jiffies; -	data->read_done_cb = nfs4_read_done_cb; +	data->pgio_done_cb = nfs4_read_done_cb;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];  	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);  } -static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)  {  	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),  			&data->args.seq_args, @@ -4049,14 +4097,14 @@ static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat  			task))  		return 0;  	if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, -				data->args.lock_context, FMODE_READ) == -EIO) +				data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO)  		return -EIO;  	if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))  		return -EIO;  	return 0;  } -static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)  {  	struct inode *inode = data->header->inode; @@ -4073,7 +4121,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data  }  static bool nfs4_write_stateid_changed(struct rpc_task *task, -		struct nfs_writeargs *args) +		struct nfs_pgio_args *args)  {  	if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4086,18 +4134,18 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,  	return true;  } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data)  {  	if (!nfs4_sequence_done(task, 
&data->res.seq_res))  		return -EAGAIN;  	if (nfs4_write_stateid_changed(task, &data->args))  		return -EAGAIN; -	return data->write_done_cb ? data->write_done_cb(task, data) : +	return data->pgio_done_cb ? data->pgio_done_cb(task, data) :  		nfs4_write_done_cb(task, data);  }  static -bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) +bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)  {  	const struct nfs_pgio_header *hdr = data->header; @@ -4110,7 +4158,7 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data)  	return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;  } -static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4120,8 +4168,8 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag  	} else  		data->args.bitmask = server->cache_consistency_bitmask; -	if (!data->write_done_cb) -		data->write_done_cb = nfs4_write_done_cb; +	if (!data->pgio_done_cb) +		data->pgio_done_cb = nfs4_write_done_cb;  	data->res.server = server;  	data->timestamp   = jiffies; @@ -4129,21 +4177,6 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag  	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);  } -static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ -	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), -			&data->args.seq_args, -			&data->res.seq_res, -			task)) -		return 0; -	if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, -				data->args.lock_context, FMODE_WRITE) == -EIO) -		return -EIO; -	if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) -		return -EIO; -	return 0; -} -  static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct 
nfs_commit_data *data)  {  	nfs4_setup_sequence(NFS_SERVER(data->inode), @@ -4209,7 +4242,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)  	unsigned long timestamp = data->timestamp;  	trace_nfs4_renew_async(clp, task->tk_status); -	if (task->tk_status < 0) { +	switch (task->tk_status) { +	case 0: +		break; +	case -NFS4ERR_LEASE_MOVED: +		nfs4_schedule_lease_moved_recovery(clp); +		break; +	default:  		/* Unless we're shutting down, schedule state recovery! */  		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)  			return; @@ -4268,9 +4307,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)  static inline int nfs4_server_supports_acls(struct nfs_server *server)  { -	return (server->caps & NFS_CAP_ACLS) -		&& (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) -		&& (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); +	return server->caps & NFS_CAP_ACLS;  }  /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that @@ -4563,7 +4600,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,  	struct nfs4_label label = {0, 0, buflen, buf};  	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; -	struct nfs4_getattr_arg args = { +	struct nfs4_getattr_arg arg = {  		.fh		= NFS_FH(inode),  		.bitmask	= bitmask,  	}; @@ -4574,14 +4611,14 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,  	};  	struct rpc_message msg = {  		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_GETATTR], -		.rpc_argp	= &args, +		.rpc_argp	= &arg,  		.rpc_resp	= &res,  	};  	int ret;  	nfs_fattr_init(&fattr); -	ret = rpc_call_sync(server->client, &msg, 0); +	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0);  	if (ret)  		return ret;  	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) @@ -4618,7 +4655,7 @@ static int _nfs4_do_set_security_label(struct inode *inode,  	struct iattr sattr = {0};  	struct nfs_server *server = NFS_SERVER(inode);  	const u32 bitmask[3] = { 0, 
0, FATTR4_WORD2_SECURITY_LABEL }; -	struct nfs_setattrargs args = { +	struct nfs_setattrargs arg = {  		.fh             = NFS_FH(inode),  		.iap            = &sattr,  		.server		= server, @@ -4632,14 +4669,14 @@ static int _nfs4_do_set_security_label(struct inode *inode,  	};  	struct rpc_message msg = {  		.rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], -		.rpc_argp       = &args, +		.rpc_argp       = &arg,  		.rpc_resp       = &res,  	};  	int status; -	nfs4_stateid_copy(&args.stateid, &zero_stateid); +	nfs4_stateid_copy(&arg.stateid, &zero_stateid); -	status = rpc_call_sync(server->client, &msg, 0); +	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);  	if (status)  		dprintk("%s failed: %d\n", __func__, status); @@ -4723,17 +4760,24 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,  			if (state == NULL)  				break;  			if (nfs4_schedule_stateid_recovery(server, state) < 0) -				goto stateid_invalid; +				goto recovery_failed;  			goto wait_on_recovery;  		case -NFS4ERR_EXPIRED:  			if (state != NULL) {  				if (nfs4_schedule_stateid_recovery(server, state) < 0) -					goto stateid_invalid; +					goto recovery_failed;  			}  		case -NFS4ERR_STALE_STATEID:  		case -NFS4ERR_STALE_CLIENTID:  			nfs4_schedule_lease_recovery(clp);  			goto wait_on_recovery; +		case -NFS4ERR_MOVED: +			if (nfs4_schedule_migration_recovery(server) < 0) +				goto recovery_failed; +			goto wait_on_recovery; +		case -NFS4ERR_LEASE_MOVED: +			nfs4_schedule_lease_moved_recovery(clp); +			goto wait_on_recovery;  #if defined(CONFIG_NFS_V4_1)  		case -NFS4ERR_BADSESSION:  		case -NFS4ERR_BADSLOT: @@ -4745,29 +4789,28 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,  			dprintk("%s ERROR %d, Reset session\n", __func__,  				task->tk_status);  			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); -			task->tk_status = 0; -			return -EAGAIN; +			goto wait_on_recovery;  
#endif /* CONFIG_NFS_V4_1 */  		case -NFS4ERR_DELAY:  			nfs_inc_server_stats(server, NFSIOS_DELAY);  		case -NFS4ERR_GRACE:  			rpc_delay(task, NFS4_POLL_RETRY_MAX); -			task->tk_status = 0; -			return -EAGAIN;  		case -NFS4ERR_RETRY_UNCACHED_REP:  		case -NFS4ERR_OLD_STATEID: -			task->tk_status = 0; -			return -EAGAIN; +			goto restart_call;  	}  	task->tk_status = nfs4_map_errors(task->tk_status);  	return 0; -stateid_invalid: +recovery_failed:  	task->tk_status = -EIO;  	return 0;  wait_on_recovery:  	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);  	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)  		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); +	if (test_bit(NFS_MIG_FAILED, &server->mig_status)) +		goto recovery_failed; +restart_call:  	task->tk_status = 0;  	return -EAGAIN;  } @@ -4824,6 +4867,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp,  				nodename);  } +/* + * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback + * services.  Advertise one based on the address family of the + * clientaddr. 
+ */ +static unsigned int +nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len) +{ +	if (strchr(clp->cl_ipaddr, ':') != NULL) +		return scnprintf(buf, len, "tcp6"); +	else +		return scnprintf(buf, len, "tcp"); +} +  /**   * nfs4_proc_setclientid - Negotiate client ID   * @clp: state data structure @@ -4865,12 +4922,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,  						setclientid.sc_name,  						sizeof(setclientid.sc_name));  	/* cb_client4 */ -	rcu_read_lock(); -	setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, -				sizeof(setclientid.sc_netid), "%s", -				rpc_peeraddr2str(clp->cl_rpcclient, -							RPC_DISPLAY_NETID)); -	rcu_read_unlock(); +	setclientid.sc_netid_len = +				nfs4_init_callback_netid(clp, +						setclientid.sc_netid, +						sizeof(setclientid.sc_netid));  	setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,  				sizeof(setclientid.sc_uaddr), "%s.%u.%u",  				clp->cl_ipaddr, port >> 8, port & 255); @@ -4931,11 +4986,17 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)  	trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);  	switch (task->tk_status) { -	case -NFS4ERR_STALE_STATEID: -	case -NFS4ERR_EXPIRED:  	case 0:  		renew_lease(data->res.server, data->timestamp);  		break; +	case -NFS4ERR_ADMIN_REVOKED: +	case -NFS4ERR_DELEG_REVOKED: +	case -NFS4ERR_BAD_STATEID: +	case -NFS4ERR_OLD_STATEID: +	case -NFS4ERR_STALE_STATEID: +	case -NFS4ERR_EXPIRED: +		task->tk_status = 0; +		break;  	default:  		if (nfs4_async_handle_error(task, data->res.server, NULL) ==  				-EAGAIN) { @@ -5094,6 +5155,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock  			status = 0;  	}  	request->fl_ops->fl_release_private(request); +	request->fl_ops = NULL;  out:  	return status;  } @@ -5765,21 +5827,36 @@ struct nfs_release_lockowner_data {  	struct nfs4_lock_state *lsp;  	struct nfs_server *server;  	struct nfs_release_lockowner_args 
args; -	struct nfs4_sequence_args seq_args; -	struct nfs4_sequence_res seq_res; +	struct nfs_release_lockowner_res res; +	unsigned long timestamp;  };  static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs_release_lockowner_data *data = calldata;  	nfs40_setup_sequence(data->server, -				&data->seq_args, &data->seq_res, task); +				&data->args.seq_args, &data->res.seq_res, task); +	data->timestamp = jiffies;  }  static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)  {  	struct nfs_release_lockowner_data *data = calldata; -	nfs40_sequence_done(task, &data->seq_res); +	struct nfs_server *server = data->server; + +	nfs40_sequence_done(task, &data->res.seq_res); + +	switch (task->tk_status) { +	case 0: +		renew_lease(server, data->timestamp); +		break; +	case -NFS4ERR_STALE_CLIENTID: +	case -NFS4ERR_EXPIRED: +	case -NFS4ERR_LEASE_MOVED: +	case -NFS4ERR_DELAY: +		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) +			rpc_restart_call_prepare(task); +	}  }  static void nfs4_release_lockowner_release(void *calldata) @@ -5808,7 +5885,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st  	data = kmalloc(sizeof(*data), GFP_NOFS);  	if (!data)  		return -ENOMEM; -	nfs4_init_sequence(&data->seq_args, &data->seq_res, 0);  	data->lsp = lsp;  	data->server = server;  	data->args.lock_owner.clientid = server->nfs_client->cl_clientid; @@ -5816,6 +5892,8 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st  	data->args.lock_owner.s_dev = server->s_dev;  	msg.rpc_argp = &data->args; +	msg.rpc_resp = &data->res; +	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);  	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);  	return 0;  } @@ -5978,6 +6056,283 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,  	return err;  } +/* + * This operation also signals the server that this client is 
+ * performing migration recovery.  The server can stop returning + * NFS4ERR_LEASE_MOVED to this client.  A RENEW operation is + * appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_get_locations(struct inode *inode, +				     struct nfs4_fs_locations *locations, +				     struct page *page, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct rpc_clnt *clnt = server->client; +	u32 bitmask[2] = { +		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, +	}; +	struct nfs4_fs_locations_arg args = { +		.clientid	= server->nfs_client->cl_clientid, +		.fh		= NFS_FH(inode), +		.page		= page, +		.bitmask	= bitmask, +		.migration	= 1,		/* skip LOOKUP */ +		.renew		= 1,		/* append RENEW */ +	}; +	struct nfs4_fs_locations_res res = { +		.fs_locations	= locations, +		.migration	= 1, +		.renew		= 1, +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], +		.rpc_argp	= &args, +		.rpc_resp	= &res, +		.rpc_cred	= cred, +	}; +	unsigned long now = jiffies; +	int status; + +	nfs_fattr_init(&locations->fattr); +	locations->server = server; +	locations->nlocations = 0; + +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(clnt, server, &msg, +					&args.seq_args, &res.seq_res); +	if (status) +		return status; + +	renew_lease(server, now); +	return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing migration recovery.  The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID + * performing this operation is identified in the SEQUENCE + * operation in this compound. + * + * When the client supports GETATTR(fs_locations_info), it can + * be plumbed in here. 
+ */ +static int _nfs41_proc_get_locations(struct inode *inode, +				     struct nfs4_fs_locations *locations, +				     struct page *page, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct rpc_clnt *clnt = server->client; +	u32 bitmask[2] = { +		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, +	}; +	struct nfs4_fs_locations_arg args = { +		.fh		= NFS_FH(inode), +		.page		= page, +		.bitmask	= bitmask, +		.migration	= 1,		/* skip LOOKUP */ +	}; +	struct nfs4_fs_locations_res res = { +		.fs_locations	= locations, +		.migration	= 1, +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], +		.rpc_argp	= &args, +		.rpc_resp	= &res, +		.rpc_cred	= cred, +	}; +	int status; + +	nfs_fattr_init(&locations->fattr); +	locations->server = server; +	locations->nlocations = 0; + +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(clnt, server, &msg, +					&args.seq_args, &res.seq_res); +	if (status == NFS4_OK && +	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) +		status = -NFS4ERR_LEASE_MOVED; +	return status; +} + +#endif	/* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_get_locations - discover locations for a migrated FSID + * @inode: inode on FSID that is migrating + * @locations: result of query + * @page: buffer + * @cred: credential to use for this operation + * + * Returns NFS4_OK on success, a negative NFS4ERR status code if the + * operation failed, or a negative errno if a local error occurred. + * + * On success, "locations" is filled in, but if the server has + * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not + * asserted. + * + * -NFS4ERR_LEASE_MOVED is returned if the server still has leases + * from this client that require migration recovery. 
+ */ +int nfs4_proc_get_locations(struct inode *inode, +			    struct nfs4_fs_locations *locations, +			    struct page *page, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = server->nfs_client; +	const struct nfs4_mig_recovery_ops *ops = +					clp->cl_mvops->mig_recovery_ops; +	struct nfs4_exception exception = { }; +	int status; + +	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, +		(unsigned long long)server->fsid.major, +		(unsigned long long)server->fsid.minor, +		clp->cl_hostname); +	nfs_display_fhandle(NFS_FH(inode), __func__); + +	do { +		status = ops->get_locations(inode, locations, page, cred); +		if (status != -NFS4ERR_DELAY) +			break; +		nfs4_handle_exception(server, status, &exception); +	} while (exception.retry); +	return status; +} + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery.  The server can stop + * returning NFS4ERR_LEASE_MOVED to this client.  A RENEW operation + * is appended to this compound to identify the client ID which is + * performing recovery. 
+ */ +static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; +	struct rpc_clnt *clnt = server->client; +	struct nfs4_fsid_present_arg args = { +		.fh		= NFS_FH(inode), +		.clientid	= clp->cl_clientid, +		.renew		= 1,		/* append RENEW */ +	}; +	struct nfs4_fsid_present_res res = { +		.renew		= 1, +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], +		.rpc_argp	= &args, +		.rpc_resp	= &res, +		.rpc_cred	= cred, +	}; +	unsigned long now = jiffies; +	int status; + +	res.fh = nfs_alloc_fhandle(); +	if (res.fh == NULL) +		return -ENOMEM; + +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(clnt, server, &msg, +						&args.seq_args, &res.seq_res); +	nfs_free_fhandle(res.fh); +	if (status) +		return status; + +	do_renew_lease(clp, now); +	return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery.  The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID performing + * this operation is identified in the SEQUENCE operation in this + * compound. 
+ */ +static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct rpc_clnt *clnt = server->client; +	struct nfs4_fsid_present_arg args = { +		.fh		= NFS_FH(inode), +	}; +	struct nfs4_fsid_present_res res = { +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], +		.rpc_argp	= &args, +		.rpc_resp	= &res, +		.rpc_cred	= cred, +	}; +	int status; + +	res.fh = nfs_alloc_fhandle(); +	if (res.fh == NULL) +		return -ENOMEM; + +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(clnt, server, &msg, +						&args.seq_args, &res.seq_res); +	nfs_free_fhandle(res.fh); +	if (status == NFS4_OK && +	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) +		status = -NFS4ERR_LEASE_MOVED; +	return status; +} + +#endif	/* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_fsid_present - Is this FSID present or absent on server? + * @inode: inode on FSID to check + * @cred: credential to use for this operation + * + * Server indicates whether the FSID is present, moved, or not + * recognized.  This operation is necessary to clear a LEASE_MOVED + * condition for this client ID. + * + * Returns NFS4_OK if the FSID is present on this server, + * -NFS4ERR_MOVED if the FSID is no longer present, a negative + *  NFS4ERR code if some error occurred on the server, or a + *  negative errno if a local failure occurred. 
+ */ +int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = server->nfs_client; +	const struct nfs4_mig_recovery_ops *ops = +					clp->cl_mvops->mig_recovery_ops; +	struct nfs4_exception exception = { }; +	int status; + +	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, +		(unsigned long long)server->fsid.major, +		(unsigned long long)server->fsid.minor, +		clp->cl_hostname); +	nfs_display_fhandle(NFS_FH(inode), __func__); + +	do { +		status = ops->fsid_present(inode, cred); +		if (status != -NFS4ERR_DELAY) +			break; +		nfs4_handle_exception(server, status, &exception); +	} while (exception.retry); +	return status; +} +  /**   * If 'use_integrity' is true and the state managment nfs_client   * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient @@ -6264,8 +6619,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,  	struct nfs41_exchange_id_args args = {  		.verifier = &verifier,  		.client = clp, +#ifdef CONFIG_NFS_V4_1_MIGRATION  		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | -			EXCHGID4_FLAG_BIND_PRINC_STATEID, +			 EXCHGID4_FLAG_BIND_PRINC_STATEID | +			 EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else +		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | +			 EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif  	};  	struct nfs41_exchange_id_res res = {  		0 @@ -7044,9 +7405,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)  	struct nfs_server *server = NFS_SERVER(inode);  	struct pnfs_layout_hdr *lo;  	struct nfs4_state *state = NULL; -	unsigned long timeo, giveup; +	unsigned long timeo, now, giveup; -	dprintk("--> %s\n", __func__); +	dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);  	if (!nfs41_sequence_done(task, &lgp->res.seq_res))  		goto out; @@ -7054,12 +7415,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)  	switch (task->tk_status) {  	case 0:  		goto out; +	/* +	 
* NFS4ERR_LAYOUTTRYLATER is a conflict with another client +	 * (or clients) writing to the same RAID stripe +	 */  	case -NFS4ERR_LAYOUTTRYLATER: +	/* +	 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall +	 * existing layout before getting a new one). +	 */  	case -NFS4ERR_RECALLCONFLICT:  		timeo = rpc_get_timeout(task->tk_client);  		giveup = lgp->args.timestamp + timeo; -		if (time_after(giveup, jiffies)) -			task->tk_status = -NFS4ERR_DELAY; +		now = jiffies; +		if (time_after(giveup, now)) { +			unsigned long delay; + +			/* Delay for: +			 * - Not less then NFS4_POLL_RETRY_MIN. +			 * - One last time a jiffie before we give up +			 * - exponential backoff (time_now minus start_attempt) +			 */ +			delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, +				    min((giveup - now - 1), +					now - lgp->args.timestamp)); + +			dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", +				__func__, delay); +			rpc_delay(task, delay); +			task->tk_status = 0; +			rpc_restart_call_prepare(task); +			goto out; /* Do not call nfs4_async_handle_error() */ +		}  		break;  	case -NFS4ERR_EXPIRED:  	case -NFS4ERR_BAD_STATEID: @@ -7233,7 +7620,14 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)  		return;  	server = NFS_SERVER(lrp->args.inode); -	if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { +	switch (task->tk_status) { +	default: +		task->tk_status = 0; +	case 0: +		break; +	case -NFS4ERR_DELAY: +		if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) +			break;  		rpc_restart_call_prepare(task);  		return;  	} @@ -7408,10 +7802,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)  	case -NFS4ERR_BADLAYOUT:     /* no layout */  	case -NFS4ERR_GRACE:	    /* loca_recalim always false */  		task->tk_status = 0; -		break;  	case 0: -		nfs_post_op_update_inode_force_wcc(data->args.inode, -						   data->res.fattr);  		break;  	default:  		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { @@ 
-7426,6 +7817,8 @@ static void nfs4_layoutcommit_release(void *calldata)  	struct nfs4_layoutcommit_data *data = calldata;  	pnfs_cleanup_layoutcommit(data); +	nfs_post_op_update_inode_force_wcc(data->args.inode, +					   data->res.fattr);  	put_rpccred(data->cred);  	kfree(data);  } @@ -7548,7 +7941,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,  		switch (err) {  		case 0:  		case -NFS4ERR_WRONGSEC: -		case -NFS4ERR_NOTSUPP: +		case -ENOTSUPP:  			goto out;  		default:  			err = nfs4_handle_exception(server, err, &exception); @@ -7564,8 +7957,10 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,  {  	int err;  	struct page *page; -	rpc_authflavor_t flavor; +	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;  	struct nfs4_secinfo_flavors *flavors; +	struct nfs4_secinfo4 *secinfo; +	int i;  	page = alloc_page(GFP_KERNEL);  	if (!page) { @@ -7580,16 +7975,41 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,  	 * Fall back on "guess and check" method if  	 * the server doesn't support SECINFO_NO_NAME  	 */ -	if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { +	if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {  		err = nfs4_find_root_sec(server, fhandle, info);  		goto out_freepage;  	}  	if (err)  		goto out_freepage; -	flavor = nfs_find_best_sec(flavors); -	if (err == 0) -		err = nfs4_lookup_root_sec(server, fhandle, info, flavor); +	for (i = 0; i < flavors->num_flavors; i++) { +		secinfo = &flavors->flavors[i]; + +		switch (secinfo->flavor) { +		case RPC_AUTH_NULL: +		case RPC_AUTH_UNIX: +		case RPC_AUTH_GSS: +			flavor = rpcauth_get_pseudoflavor(secinfo->flavor, +					&secinfo->flavor_info); +			break; +		default: +			flavor = RPC_AUTH_MAXFLAVOR; +			break; +		} + +		if (!nfs_auth_info_match(&server->auth_info, flavor)) +			flavor = RPC_AUTH_MAXFLAVOR; + +		if (flavor != RPC_AUTH_MAXFLAVOR) { +			err = nfs4_lookup_root_sec(server, fhandle, +						   info, flavor); +			if 
(!err) +				break; +		} +	} + +	if (flavor == RPC_AUTH_MAXFLAVOR) +		err = -EPERM;  out_freepage:  	put_page(page); @@ -7851,6 +8271,18 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {  };  #endif +static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = { +	.get_locations = _nfs40_proc_get_locations, +	.fsid_present = _nfs40_proc_fsid_present, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { +	.get_locations = _nfs41_proc_get_locations, +	.fsid_present = _nfs41_proc_fsid_present, +}; +#endif	/* CONFIG_NFS_V4_1 */ +  static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {  	.minor_version = 0,  	.init_caps = NFS_CAP_READDIRPLUS @@ -7866,6 +8298,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {  	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,  	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,  	.state_renewal_ops = &nfs40_state_renewal_ops, +	.mig_recovery_ops = &nfs40_mig_recovery_ops,  };  #if defined(CONFIG_NFS_V4_1) @@ -7886,6 +8319,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {  	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,  	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,  	.state_renewal_ops = &nfs41_state_renewal_ops, +	.mig_recovery_ops = &nfs41_mig_recovery_ops,  };  #endif @@ -7969,7 +8403,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {  	.unlink_setup	= nfs4_proc_unlink_setup,  	.unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,  	.unlink_done	= nfs4_proc_unlink_done, -	.rename		= nfs4_proc_rename,  	.rename_setup	= nfs4_proc_rename_setup,  	.rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,  	.rename_done	= nfs4_proc_rename_done, @@ -7984,13 +8417,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = {  	.pathconf	= nfs4_proc_pathconf,  	.set_capabilities = nfs4_server_capabilities,  	.decode_dirent	= nfs4_decode_dirent, +	.pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare,  	.read_setup	= 
nfs4_proc_read_setup, -	.read_pageio_init = pnfs_pageio_init_read, -	.read_rpc_prepare = nfs4_proc_read_rpc_prepare,  	.read_done	= nfs4_read_done,  	.write_setup	= nfs4_proc_write_setup, -	.write_pageio_init = pnfs_pageio_init_write, -	.write_rpc_prepare = nfs4_proc_write_rpc_prepare,  	.write_done	= nfs4_write_done,  	.commit_setup	= nfs4_proc_commit_setup,  	.commit_rpc_prepare = nfs4_proc_commit_rpc_prepare, diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index cf883c7ae05..e799dc3c3b1 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -231,14 +231,23 @@ out:  	return ret;  } +/* + * nfs4_release_slot_table - release all slot table entries + */ +static void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +{ +	nfs4_shrink_slot_table(tbl, 0); +} +  /** - * nfs4_release_slot_table - release resources attached to a slot table + * nfs4_shutdown_slot_table - release resources attached to a slot table   * @tbl: slot table to shut down   *   */ -void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl)  { -	nfs4_shrink_slot_table(tbl, 0); +	nfs4_release_slot_table(tbl); +	rpc_destroy_wait_queue(&tbl->slot_tbl_waitq);  }  /** @@ -422,7 +431,7 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,  	spin_unlock(&tbl->slot_tbl_lock);  } -static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +static void nfs4_release_session_slot_tables(struct nfs4_session *session)  {  	nfs4_release_slot_table(&session->fc_slot_table);  	nfs4_release_slot_table(&session->bc_slot_table); @@ -450,7 +459,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)  	if (status && tbl->slots == NULL)  		/* Fore and back channel share a connection so get  		 * both slot tables or neither */ -		nfs4_destroy_session_slot_tables(ses); +		nfs4_release_session_slot_tables(ses);  	return status;  } @@ -470,6 +479,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client 
*clp)  	return session;  } +static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +{ +	nfs4_shutdown_slot_table(&session->fc_slot_table); +	nfs4_shutdown_slot_table(&session->bc_slot_table); +} +  void nfs4_destroy_session(struct nfs4_session *session)  {  	struct rpc_xprt *xprt; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 23230610065..b34ada9bc6a 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -74,7 +74,7 @@ enum nfs4_session_state {  extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,  		unsigned int max_reqs, const char *queue); -extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl); +extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);  extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);  extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);  extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cc14cbb78b7..848f6853c59 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -239,14 +239,12 @@ static void nfs4_end_drain_session(struct nfs_client *clp)  	}  } -#if defined(CONFIG_NFS_V4_1) -  static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)  {  	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);  	spin_lock(&tbl->slot_tbl_lock);  	if (tbl->highest_used_slotid != NFS4_NO_SLOT) { -		INIT_COMPLETION(tbl->complete); +		reinit_completion(&tbl->complete);  		spin_unlock(&tbl->slot_tbl_lock);  		return wait_for_completion_interruptible(&tbl->complete);  	} @@ -270,6 +268,8 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)  	return nfs4_drain_slot_tbl(&ses->fc_slot_table);  } +#if defined(CONFIG_NFS_V4_1) +  static int nfs41_setup_state_renewal(struct nfs_client *clp)  {  	int status; @@ -974,9 +974,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,  	else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 
0) {  		nfs4_stateid_copy(dst, &lsp->ls_stateid);  		ret = 0; -		smp_rmb(); -		if (!list_empty(&lsp->ls_seqid.list)) -			ret = -EWOULDBLOCK;  	}  	spin_unlock(&state->state_lock);  	nfs4_put_lock_state(lsp); @@ -984,10 +981,9 @@ out:  	return ret;  } -static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)  {  	const nfs4_stateid *src; -	int ret;  	int seq;  	do { @@ -996,12 +992,7 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)  		if (test_bit(NFS_OPEN_STATE, &state->flags))  			src = &state->open_stateid;  		nfs4_stateid_copy(dst, src); -		ret = 0; -		smp_rmb(); -		if (!list_empty(&state->owner->so_seqid.list)) -			ret = -EWOULDBLOCK;  	} while (read_seqretry(&state->seqlock, seq)); -	return ret;  }  /* @@ -1015,15 +1006,19 @@ int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,  	if (ret == -EIO)  		/* A lost lock - don't even consider delegations */  		goto out; -	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) +	/* returns true if delegation stateid found and copied */ +	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { +		ret = 0;  		goto out; +	}  	if (ret != -ENOENT)  		/* nfs4_copy_delegation_stateid() didn't over-write  		 * dst, so it still has the lock stateid which we now  		 * choose to use.  		 
*/  		goto out; -	ret = nfs4_copy_open_stateid(dst, state); +	nfs4_copy_open_stateid(dst, state); +	ret = 0;  out:  	if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))  		dst->seqid = 0; @@ -1071,7 +1066,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid)  /*   * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or   * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error()   */  static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)  { @@ -1116,7 +1111,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)  /*   * Increment the seqid if the LOCK/LOCKU succeeded, or   * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error()   */  void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)  { @@ -1145,9 +1140,9 @@ static int nfs4_run_state_manager(void *);  static void nfs4_clear_state_manager_bit(struct nfs_client *clp)  { -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);  	rpc_wake_up(&clp->cl_rpcwaitq);  } @@ -1197,20 +1192,74 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)  }  EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); +/** + * nfs4_schedule_migration_recovery - trigger migration recovery + * + * @server: FSID that is migrating + * + * Returns zero if recovery has started, otherwise a negative NFS4ERR + * value is returned. 
+ */ +int nfs4_schedule_migration_recovery(const struct nfs_server *server) +{ +	struct nfs_client *clp = server->nfs_client; + +	if (server->fh_expire_type != NFS4_FH_PERSISTENT) { +		pr_err("NFS: volatile file handles not supported (server %s)\n", +				clp->cl_hostname); +		return -NFS4ERR_IO; +	} + +	if (test_bit(NFS_MIG_FAILED, &server->mig_status)) +		return -NFS4ERR_IO; + +	dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n", +			__func__, +			(unsigned long long)server->fsid.major, +			(unsigned long long)server->fsid.minor, +			clp->cl_hostname); + +	set_bit(NFS_MIG_IN_TRANSITION, +			&((struct nfs_server *)server)->mig_status); +	set_bit(NFS4CLNT_MOVED, &clp->cl_state); + +	nfs4_schedule_state_manager(clp); +	return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery); + +/** + * nfs4_schedule_lease_moved_recovery - start lease-moved recovery + * + * @clp: server to check for moved leases + * + */ +void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp) +{ +	dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n", +		__func__, clp->cl_clientid, clp->cl_hostname); + +	set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state); +	nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery); +  int nfs4_wait_clnt_recover(struct nfs_client *clp)  {  	int res;  	might_sleep(); +	atomic_inc(&clp->cl_count);  	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,  			nfs_wait_bit_killable, TASK_KILLABLE);  	if (res) -		return res; - +		goto out;  	if (clp->cl_cons_state < 0) -		return clp->cl_cons_state; -	return 0; +		res = clp->cl_cons_state; +out: +	nfs_put_client(clp); +	return res;  }  int nfs4_client_recover_expired_lease(struct nfs_client *clp) @@ -1267,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st  	return 1;  } -static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +int 
nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)  {  	set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);  	clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1375,8 +1424,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_  			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:  				goto out;  			default: -				printk(KERN_ERR "NFS: %s: unhandled error %d. " -					"Zeroing state\n", __func__, status); +				printk(KERN_ERR "NFS: %s: unhandled error %d\n", +					 __func__, status);  			case -ENOMEM:  			case -NFS4ERR_DENIED:  			case -NFS4ERR_RECLAIM_BAD: @@ -1407,7 +1456,7 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs  	 * server that doesn't support a grace period.  	 */  	spin_lock(&sp->so_lock); -	write_seqcount_begin(&sp->so_reclaim_seqcount); +	raw_write_seqcount_begin(&sp->so_reclaim_seqcount);  restart:  	list_for_each_entry(state, &sp->so_states, open_states) {  		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) @@ -1422,7 +1471,7 @@ restart:  		if (status >= 0) {  			status = nfs4_reclaim_locks(state, ops);  			if (status >= 0) { -				if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) { +				if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {  					spin_lock(&state->state_lock);  					list_for_each_entry(lock, &state->lock_states, ls_locks) {  						if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) @@ -1439,15 +1488,12 @@ restart:  		}  		switch (status) {  			default: -				printk(KERN_ERR "NFS: %s: unhandled error %d. " -					"Zeroing state\n", __func__, status); +				printk(KERN_ERR "NFS: %s: unhandled error %d\n", +					__func__, status);  			case -ENOENT:  			case -ENOMEM:  			case -ESTALE: -				/* -				 * Open state on this file cannot be recovered -				 * All we can do is revert to using the zero stateid. 
-				 */ +				/* Open state on this file cannot be recovered */  				nfs4_state_mark_recovery_failed(state, status);  				break;  			case -EAGAIN: @@ -1473,13 +1519,13 @@ restart:  		spin_lock(&sp->so_lock);  		goto restart;  	} -	write_seqcount_end(&sp->so_reclaim_seqcount); +	raw_write_seqcount_end(&sp->so_reclaim_seqcount);  	spin_unlock(&sp->so_lock);  	return 0;  out_err:  	nfs4_put_open_state(state);  	spin_lock(&sp->so_lock); -	write_seqcount_end(&sp->so_reclaim_seqcount); +	raw_write_seqcount_end(&sp->so_reclaim_seqcount);  	spin_unlock(&sp->so_lock);  	return status;  } @@ -1628,7 +1674,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)  			nfs4_state_end_reclaim_reboot(clp);  			break;  		case -NFS4ERR_STALE_CLIENTID: -		case -NFS4ERR_LEASE_MOVED:  			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);  			nfs4_state_clear_reclaim_reboot(clp);  			nfs4_state_start_reclaim_reboot(clp); @@ -1829,6 +1874,168 @@ static int nfs4_purge_lease(struct nfs_client *clp)  	return 0;  } +/* + * Try remote migration of one FSID from a source server to a + * destination server.  The source server provides a list of + * potential destinations. + * + * Returns zero or a negative NFS4ERR status code. 
+ */ +static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) +{ +	struct nfs_client *clp = server->nfs_client; +	struct nfs4_fs_locations *locations = NULL; +	struct inode *inode; +	struct page *page; +	int status, result; + +	dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__, +			(unsigned long long)server->fsid.major, +			(unsigned long long)server->fsid.minor, +			clp->cl_hostname); + +	result = 0; +	page = alloc_page(GFP_KERNEL); +	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); +	if (page == NULL || locations == NULL) { +		dprintk("<-- %s: no memory\n", __func__); +		goto out; +	} + +	inode = server->super->s_root->d_inode; +	result = nfs4_proc_get_locations(inode, locations, page, cred); +	if (result) { +		dprintk("<-- %s: failed to retrieve fs_locations: %d\n", +			__func__, result); +		goto out; +	} + +	result = -NFS4ERR_NXIO; +	if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { +		dprintk("<-- %s: No fs_locations data, migration skipped\n", +			__func__); +		goto out; +	} + +	nfs4_begin_drain_session(clp); + +	status = nfs4_replace_transport(server, locations); +	if (status != 0) { +		dprintk("<-- %s: failed to replace transport: %d\n", +			__func__, status); +		goto out; +	} + +	result = 0; +	dprintk("<-- %s: migration succeeded\n", __func__); + +out: +	if (page != NULL) +		__free_page(page); +	kfree(locations); +	if (result) { +		pr_err("NFS: migration recovery failed (server %s)\n", +				clp->cl_hostname); +		set_bit(NFS_MIG_FAILED, &server->mig_status); +	} +	return result; +} + +/* + * Returns zero or a negative NFS4ERR status code. 
+ */ +static int nfs4_handle_migration(struct nfs_client *clp) +{ +	const struct nfs4_state_maintenance_ops *ops = +				clp->cl_mvops->state_renewal_ops; +	struct nfs_server *server; +	struct rpc_cred *cred; + +	dprintk("%s: migration reported on \"%s\"\n", __func__, +			clp->cl_hostname); + +	spin_lock(&clp->cl_lock); +	cred = ops->get_state_renewal_cred_locked(clp); +	spin_unlock(&clp->cl_lock); +	if (cred == NULL) +		return -NFS4ERR_NOENT; + +	clp->cl_mig_gen++; +restart: +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		int status; + +		if (server->mig_gen == clp->cl_mig_gen) +			continue; +		server->mig_gen = clp->cl_mig_gen; + +		if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION, +						&server->mig_status)) +			continue; + +		rcu_read_unlock(); +		status = nfs4_try_migration(server, cred); +		if (status < 0) { +			put_rpccred(cred); +			return status; +		} +		goto restart; +	} +	rcu_read_unlock(); +	put_rpccred(cred); +	return 0; +} + +/* + * Test each nfs_server on the clp's cl_superblocks list to see + * if it's moved to another server.  Stop when the server no longer + * returns NFS4ERR_LEASE_MOVED. 
+ */ +static int nfs4_handle_lease_moved(struct nfs_client *clp) +{ +	const struct nfs4_state_maintenance_ops *ops = +				clp->cl_mvops->state_renewal_ops; +	struct nfs_server *server; +	struct rpc_cred *cred; + +	dprintk("%s: lease moved reported on \"%s\"\n", __func__, +			clp->cl_hostname); + +	spin_lock(&clp->cl_lock); +	cred = ops->get_state_renewal_cred_locked(clp); +	spin_unlock(&clp->cl_lock); +	if (cred == NULL) +		return -NFS4ERR_NOENT; + +	clp->cl_mig_gen++; +restart: +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		struct inode *inode; +		int status; + +		if (server->mig_gen == clp->cl_mig_gen) +			continue; +		server->mig_gen = clp->cl_mig_gen; + +		rcu_read_unlock(); + +		inode = server->super->s_root->d_inode; +		status = nfs4_proc_fsid_present(inode, cred); +		if (status != -NFS4ERR_MOVED) +			goto restart;	/* wasn't this one */ +		if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED) +			goto restart;	/* there are more */ +		goto out; +	} +	rcu_read_unlock(); + +out: +	put_rpccred(cred); +	return 0; +} +  /**   * nfs4_discover_server_trunking - Detect server IP address trunking   * @@ -1868,8 +2075,10 @@ again:  	switch (status) {  	case 0:  		break; -	case -NFS4ERR_DELAY:  	case -ETIMEDOUT: +		if (clnt->cl_softrtry) +			break; +	case -NFS4ERR_DELAY:  	case -EAGAIN:  		ssleep(1);  	case -NFS4ERR_STALE_CLIENTID: @@ -1881,10 +2090,15 @@ again:  			nfs4_root_machine_cred(clp);  			goto again;  		} -		if (i > 2) +		if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX)  			break;  	case -NFS4ERR_CLID_INUSE:  	case -NFS4ERR_WRONGSEC: +		/* No point in retrying if we already used RPC_AUTH_UNIX */ +		if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) { +			status = -EPERM; +			break; +		}  		clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX);  		if (IS_ERR(clnt)) {  			status = PTR_ERR(clnt); @@ -2017,9 +2231,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)  		
nfs41_handle_server_reboot(clp);  	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |  			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | -			    SEQ4_STATUS_ADMIN_STATE_REVOKED | -			    SEQ4_STATUS_LEASE_MOVED)) +			    SEQ4_STATUS_ADMIN_STATE_REVOKED))  		nfs41_handle_state_revoked(clp); +	if (flags & SEQ4_STATUS_LEASE_MOVED) +		nfs4_schedule_lease_moved_recovery(clp);  	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)  		nfs41_handle_recallable_state_revoked(clp);  	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) @@ -2157,7 +2372,20 @@ static void nfs4_state_manager(struct nfs_client *clp)  			status = nfs4_check_lease(clp);  			if (status < 0)  				goto out_error; -			continue; +		} + +		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { +			section = "migration"; +			status = nfs4_handle_migration(clp); +			if (status < 0) +				goto out_error; +		} + +		if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) { +			section = "lease moved"; +			status = nfs4_handle_lease_moved(clp); +			if (status < 0) +				goto out_error;  		}  		/* First recover reboot state... 
*/ diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index e26acdd1a64..6f340f02f2b 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)  {  	int ret = nfs_write_inode(inode, wbc); -	if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { -		int status; -		bool sync = true; - -		if (wbc->sync_mode == WB_SYNC_NONE) -			sync = false; - -		status = pnfs_layoutcommit_inode(inode, sync); -		if (status < 0) -			return status; -	} +	if (ret == 0) +		ret = pnfs_layoutcommit_inode(inode, +				wbc->sync_mode == WB_SYNC_ALL);  	return ret;  } @@ -98,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)   */  static void nfs4_evict_inode(struct inode *inode)  { -	truncate_inode_pages(&inode->i_data, 0); +	truncate_inode_pages_final(&inode->i_data);  	clear_inode(inode);  	pnfs_return_layout(inode);  	pnfs_destroy_layout(NFS_I(inode)); @@ -261,9 +253,9 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,  	res = nfs_follow_remote_path(root_mnt, export_path); -	dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n", -			IS_ERR(res) ? PTR_ERR(res) : 0, -			IS_ERR(res) ? " [error]" : ""); +	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", +		 PTR_ERR_OR_ZERO(res), +		 IS_ERR(res) ? " [error]" : "");  	return res;  } @@ -319,9 +311,9 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,  	data->mnt_path = export_path;  	res = nfs_follow_remote_path(root_mnt, export_path); -	dprintk("<-- nfs4_referral_mount() = %ld%s\n", -			IS_ERR(res) ? PTR_ERR(res) : 0, -			IS_ERR(res) ? " [error]" : ""); +	dprintk("<-- nfs4_referral_mount() = %d%s\n", +		PTR_ERR_OR_ZERO(res), +		IS_ERR(res) ? 
" [error]" : "");  	return res;  } diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 2628d921b7e..b6ebe7e445f 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -16,7 +16,7 @@ static const int nfs_set_port_min = 0;  static const int nfs_set_port_max = 65535;  static struct ctl_table_header *nfs4_callback_sysctl_table; -static ctl_table nfs4_cb_sysctls[] = { +static struct ctl_table nfs4_cb_sysctls[] = {  	{  		.procname = "nfs_callback_tcpport",  		.data = &nfs_callback_set_tcpport, @@ -36,7 +36,7 @@ static ctl_table nfs4_cb_sysctls[] = {  	{ }  }; -static ctl_table nfs4_cb_sysctl_dir[] = { +static struct ctl_table nfs4_cb_sysctl_dir[] = {  	{  		.procname = "nfs",  		.mode = 0555, @@ -45,7 +45,7 @@ static ctl_table nfs4_cb_sysctl_dir[] = {  	{ }  }; -static ctl_table nfs4_cb_sysctl_root[] = { +static struct ctl_table nfs4_cb_sysctl_root[] = {  	{  		.procname = "fs",  		.mode = 0555, diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 849cf146db3..0a744f3a86f 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -932,7 +932,7 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);  DECLARE_EVENT_CLASS(nfs4_read_event,  		TP_PROTO( -			const struct nfs_read_data *data, +			const struct nfs_pgio_data *data,  			int error  		), @@ -972,7 +972,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event,  #define DEFINE_NFS4_READ_EVENT(name) \  	DEFINE_EVENT(nfs4_read_event, name, \  			TP_PROTO( \ -				const struct nfs_read_data *data, \ +				const struct nfs_pgio_data *data, \  				int error \  			), \  			TP_ARGS(data, error)) @@ -983,7 +983,7 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);  DECLARE_EVENT_CLASS(nfs4_write_event,  		TP_PROTO( -			const struct nfs_write_data *data, +			const struct nfs_pgio_data *data,  			int error  		), @@ -1024,7 +1024,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event,  #define DEFINE_NFS4_WRITE_EVENT(name) \  	DEFINE_EVENT(nfs4_write_event, name, \  			TP_PROTO( \ -				const struct nfs_write_data *data, \ +				const struct 
nfs_pgio_data *data, \  				int error \  			), \  			TP_ARGS(data, error)) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 79210d23f60..939ae606cfa 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -105,12 +105,8 @@ static int nfs4_stat_to_errno(int);  #ifdef CONFIG_NFS_V4_SECURITY_LABEL  /* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */  #define	nfs4_label_maxsz	(4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) -#define encode_readdir_space 24 -#define encode_readdir_bitmask_sz 3  #else  #define	nfs4_label_maxsz	0 -#define encode_readdir_space 20 -#define encode_readdir_bitmask_sz 2  #endif  /* We support only one layout type per file system */  #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) @@ -207,8 +203,7 @@ static int nfs4_stat_to_errno(int);  				 2 + encode_verifier_maxsz + 5 + \  				nfs4_label_maxsz)  #define decode_readdir_maxsz	(op_decode_hdr_maxsz + \ -				 decode_verifier_maxsz + \ -				nfs4_label_maxsz + nfs4_fattr_maxsz) +				 decode_verifier_maxsz)  #define encode_readlink_maxsz	(op_encode_hdr_maxsz)  #define decode_readlink_maxsz	(op_decode_hdr_maxsz + 1)  #define encode_write_maxsz	(op_encode_hdr_maxsz + \ @@ -595,11 +590,13 @@ static int nfs4_stat_to_errno(int);  #define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ -				encode_getattr_maxsz) +				encode_getattr_maxsz + \ +				encode_renew_maxsz)  #define NFS4_dec_getattr_sz	(compound_decode_hdr_maxsz + \  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \ -				decode_getattr_maxsz) +				decode_getattr_maxsz + \ +				decode_renew_maxsz)  #define NFS4_enc_lookup_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ @@ -736,13 +733,15 @@ static int nfs4_stat_to_errno(int);  				 encode_sequence_maxsz + \  				 encode_putfh_maxsz + \  				 encode_lookup_maxsz + \ -				 encode_fs_locations_maxsz) +				 encode_fs_locations_maxsz + \ +		
		 encode_renew_maxsz)  #define NFS4_dec_fs_locations_sz \  				(compound_decode_hdr_maxsz + \  				 decode_sequence_maxsz + \  				 decode_putfh_maxsz + \  				 decode_lookup_maxsz + \ -				 decode_fs_locations_maxsz) +				 decode_fs_locations_maxsz + \ +				 decode_renew_maxsz)  #define NFS4_enc_secinfo_sz 	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ @@ -751,6 +750,18 @@ static int nfs4_stat_to_errno(int);  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \  				decode_secinfo_maxsz) +#define NFS4_enc_fsid_present_sz \ +				(compound_encode_hdr_maxsz + \ +				 encode_sequence_maxsz + \ +				 encode_putfh_maxsz + \ +				 encode_getfh_maxsz + \ +				 encode_renew_maxsz) +#define NFS4_dec_fsid_present_sz \ +				(compound_decode_hdr_maxsz + \ +				 decode_sequence_maxsz + \ +				 decode_putfh_maxsz + \ +				 decode_getfh_maxsz + \ +				 decode_renew_maxsz)  #if defined(CONFIG_NFS_V4_1)  #define NFS4_enc_bind_conn_to_session_sz \  				(compound_encode_hdr_maxsz + \ @@ -1545,7 +1556,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)  	encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);  } -static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args, +			struct compound_hdr *hdr)  {  	__be32 *p; @@ -1565,6 +1577,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg  	};  	uint32_t dircount = readdir->count >> 1;  	__be32 *p, verf[2]; +	uint32_t attrlen = 0; +	unsigned int i;  	if (readdir->plus) {  		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| @@ -1573,26 +1587,27 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg  			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|  			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|  			
FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; +		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;  		dircount >>= 1;  	}  	/* Use mounted_on_fileid only if the server supports it */  	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))  		attrs[0] |= FATTR4_WORD0_FILEID; +	for (i = 0; i < ARRAY_SIZE(attrs); i++) { +		attrs[i] &= readdir->bitmask[i]; +		if (attrs[i] != 0) +			attrlen = i+1; +	}  	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);  	encode_uint64(xdr, readdir->cookie);  	encode_nfs4_verifier(xdr, &readdir->verifier); -	p = reserve_space(xdr, encode_readdir_space); +	p = reserve_space(xdr, 12 + (attrlen << 2));  	*p++ = cpu_to_be32(dircount);  	*p++ = cpu_to_be32(readdir->count); -	*p++ = cpu_to_be32(encode_readdir_bitmask_sz); -	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); -	*p   = cpu_to_be32(attrs[1] & readdir->bitmask[1]); -	if (encode_readdir_bitmask_sz > 2) { -		if (hdr->minorversion > 1) -			attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; -		p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]); -	} +	*p++ = cpu_to_be32(attrlen); +	for (i = 0; i < attrlen; i++) +		*p++ = cpu_to_be32(attrs[i]);  	memcpy(verf, readdir->verifier.data, sizeof(verf));  	dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n", @@ -1687,7 +1702,8 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4  	encode_nfs4_verifier(xdr, &arg->confirm);  } -static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args, +			 struct compound_hdr *hdr)  {  	__be32 *p; @@ -2437,7 +2453,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,   * Encode a READ request   */  static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, -			      struct nfs_readargs *args) +			      struct nfs_pgio_args *args)  {  	struct compound_hdr hdr = {  		
.minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -2499,7 +2515,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,   * Encode a WRITE request   */  static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, -			       struct nfs_writeargs *args) +			       struct nfs_pgio_args *args)  {  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -2687,11 +2703,20 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,  	encode_compound_hdr(xdr, req, &hdr);  	encode_sequence(xdr, &args->seq_args, &hdr); -	encode_putfh(xdr, args->dir_fh, &hdr); -	encode_lookup(xdr, args->name, &hdr); -	replen = hdr.replen;	/* get the attribute into args->page */ -	encode_fs_locations(xdr, args->bitmask, &hdr); +	if (args->migration) { +		encode_putfh(xdr, args->fh, &hdr); +		replen = hdr.replen; +		encode_fs_locations(xdr, args->bitmask, &hdr); +		if (args->renew) +			encode_renew(xdr, args->clientid, &hdr); +	} else { +		encode_putfh(xdr, args->dir_fh, &hdr); +		encode_lookup(xdr, args->name, &hdr); +		replen = hdr.replen; +		encode_fs_locations(xdr, args->bitmask, &hdr); +	} +	/* Set up reply kvec to capture returned fs_locations array. 
*/  	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,  			0, PAGE_SIZE);  	encode_nops(&hdr); @@ -2715,6 +2740,26 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,  	encode_nops(&hdr);  } +/* + * Encode FSID_PRESENT request + */ +static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      struct nfs4_fsid_present_arg *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_getfh(xdr, &hdr); +	if (args->renew) +		encode_renew(xdr, args->clientid, &hdr); +	encode_nops(&hdr); +} +  #if defined(CONFIG_NFS_V4_1)  /*   * BIND_CONN_TO_SESSION request @@ -3053,7 +3098,8 @@ out_overflow:  	return -EIO;  } -static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, +		int *nfs_retval)  {  	__be32 *p;  	uint32_t opnum; @@ -3063,19 +3109,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)  	if (unlikely(!p))  		goto out_overflow;  	opnum = be32_to_cpup(p++); -	if (opnum != expected) { -		dprintk("nfs: Server returned operation" -			" %d but we issued a request for %d\n", -				opnum, expected); -		return -EIO; -	} +	if (unlikely(opnum != expected)) +		goto out_bad_operation;  	nfserr = be32_to_cpup(p); -	if (nfserr != NFS_OK) -		return nfs4_stat_to_errno(nfserr); -	return 0; +	if (nfserr == NFS_OK) +		*nfs_retval = 0; +	else +		*nfs_retval = nfs4_stat_to_errno(nfserr); +	return true; +out_bad_operation: +	dprintk("nfs: Server returned operation" +		" %d but we issued a request for %d\n", +			opnum, expected); +	*nfs_retval = -EREMOTEIO; +	return false;  out_overflow:  	print_overflow_msg(__func__, xdr); -	return -EIO; +	*nfs_retval = -EIO; +	return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum 
nfs_opnum4 expected) +{ +	int retval; + +	__decode_op_hdr(xdr, expected, &retval); +	return retval;  }  /* Dummy routine */ @@ -3391,7 +3450,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint  {  	__be32 *p; -	*res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; +	*res = 0;  	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))  		return -EIO;  	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { @@ -4957,11 +5016,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)  	uint32_t savewords, bmlen, i;  	int status; -	status = decode_op_hdr(xdr, OP_OPEN); -	if (status != -EIO) -		nfs_increment_open_seqid(status, res->seqid); -	if (!status) -		status = decode_stateid(xdr, &res->stateid); +	if (!__decode_op_hdr(xdr, OP_OPEN, &status)) +		return status; +	nfs_increment_open_seqid(status, res->seqid); +	if (status) +		return status; +	status = decode_stateid(xdr, &res->stateid);  	if (unlikely(status))  		return status; @@ -5027,7 +5087,8 @@ static int decode_putrootfh(struct xdr_stream *xdr)  	return decode_op_hdr(xdr, OP_PUTROOTFH);  } -static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, +		       struct nfs_pgio_res *res)  {  	__be32 *p;  	uint32_t count, eof, recvd; @@ -5281,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr)  	return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM);  } -static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)  {  	__be32 *p;  	int status; @@ -6578,7 +6639,7 @@ out:   * Decode Read response   */  static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, -			     struct nfs_readres *res) +			     struct nfs_pgio_res *res)  {  	struct compound_hdr hdr;  	int status; @@ -6603,7 +6664,7 @@ out:   * Decode WRITE response   */  static int 
nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, -			      struct nfs_writeres *res) +			      struct nfs_pgio_res *res)  {  	struct compound_hdr hdr;  	int status; @@ -6824,13 +6885,26 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,  	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_lookup(xdr); -	if (status) -		goto out; -	xdr_enter_page(xdr, PAGE_SIZE); -	status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, +	if (res->migration) { +		xdr_enter_page(xdr, PAGE_SIZE); +		status = decode_getfattr_generic(xdr, +					&res->fs_locations->fattr,  					 NULL, res->fs_locations,  					 NULL, res->fs_locations->server); +		if (status) +			goto out; +		if (res->renew) +			status = decode_renew(xdr); +	} else { +		status = decode_lookup(xdr); +		if (status) +			goto out; +		xdr_enter_page(xdr, PAGE_SIZE); +		status = decode_getfattr_generic(xdr, +					&res->fs_locations->fattr, +					 NULL, res->fs_locations, +					 NULL, res->fs_locations->server); +	}  out:  	return status;  } @@ -6859,6 +6933,34 @@ out:  	return status;  } +/* + * Decode FSID_PRESENT response + */ +static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs4_fsid_present_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_putfh(xdr); +	if (status) +		goto out; +	status = decode_getfh(xdr, res->fh); +	if (status) +		goto out; +	if (res->renew) +		status = decode_renew(xdr); +out: +	return status; +} +  #if defined(CONFIG_NFS_V4_1)  /*   * Decode BIND_CONN_TO_SESSION response @@ -7373,6 +7475,7 @@ struct rpc_procinfo	nfs4_procedures[] = {  	PROC(FS_LOCATIONS,	enc_fs_locations,	dec_fs_locations),  	PROC(RELEASE_LOCKOWNER,	enc_release_lockowner,	dec_release_lockowner),  	PROC(SECINFO,		enc_secinfo,		dec_secinfo), +	
PROC(FSID_PRESENT,	enc_fsid_present,	dec_fsid_present),  #if defined(CONFIG_NFS_V4_1)  	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id),  	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session), diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 89fe741e58b..59f838cdc00 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -36,6 +36,7 @@  	__print_flags(v, "|", \  			{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \  			{ 1 << NFS_INO_STALE, "STALE" }, \ +			{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \  			{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \  			{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \  			{ 1 << NFS_INO_COMMIT, "COMMIT" }, \ diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5457745dd4f..611320753db 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -439,7 +439,7 @@ static void _read_done(struct ore_io_state *ios, void *private)  	objlayout_read_done(&objios->oir, status, objios->sync);  } -int objio_read_pagelist(struct nfs_read_data *rdata) +int objio_read_pagelist(struct nfs_pgio_data *rdata)  {  	struct nfs_pgio_header *hdr = rdata->header;  	struct objio_state *objios; @@ -487,7 +487,7 @@ static void _write_done(struct ore_io_state *ios, void *private)  static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)  {  	struct objio_state *objios = priv; -	struct nfs_write_data *wdata = objios->oir.rpcdata; +	struct nfs_pgio_data *wdata = objios->oir.rpcdata;  	struct address_space *mapping = wdata->header->inode->i_mapping;  	pgoff_t index = offset / PAGE_SIZE;  	struct page *page; @@ -531,7 +531,7 @@ static const struct _ore_r4w_op _r4w_op = {  	.put_page = &__r4w_put_page,  }; -int objio_write_pagelist(struct nfs_write_data *wdata, int how) +int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)  {  	struct nfs_pgio_header *hdr = wdata->header;  	struct objio_state *objios; @@ -564,14 +564,22 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int 
how)  	return 0;  } -static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,  			  struct nfs_page *prev, struct nfs_page *req)  { -	if (!pnfs_generic_pg_test(pgio, prev, req)) -		return false; +	unsigned int size; + +	size = pnfs_generic_pg_test(pgio, prev, req); + +	if (!size || pgio->pg_count + req->wb_bytes > +	    (unsigned long)pgio->pg_layout_private) +		return 0; -	return pgio->pg_count + req->wb_bytes <= -			(unsigned long)pgio->pg_layout_private; +	return min(size, req->wb_bytes);  }  static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index e4f9cbfec67..765d3f54e98 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -53,10 +53,10 @@ objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)  	struct objlayout *objlay;  	objlay = kzalloc(sizeof(struct objlayout), gfp_flags); -	if (objlay) { -		spin_lock_init(&objlay->lock); -		INIT_LIST_HEAD(&objlay->err_list); -	} +	if (!objlay) +		return NULL; +	spin_lock_init(&objlay->lock); +	INIT_LIST_HEAD(&objlay->err_list);  	dprintk("%s: Return %p\n", __func__, objlay);  	return &objlay->pnfs_layout;  } @@ -229,11 +229,11 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,  static void _rpc_read_complete(struct work_struct *work)  {  	struct rpc_task *task; -	struct nfs_read_data *rdata; +	struct nfs_pgio_data *rdata;  	dprintk("%s enter\n", __func__);  	task = container_of(work, struct rpc_task, u.tk_work); -	rdata = container_of(task, struct nfs_read_data, task); +	rdata = container_of(task, struct nfs_pgio_data, task);  	pnfs_ld_read_done(rdata);  } @@ -241,7 +241,7 @@ static void _rpc_read_complete(struct work_struct *work)  void  
objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)  { -	struct nfs_read_data *rdata = oir->rpcdata; +	struct nfs_pgio_data *rdata = oir->rpcdata;  	oir->status = rdata->task.tk_status = status;  	if (status >= 0) @@ -266,7 +266,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)   * Perform sync or async reads.   */  enum pnfs_try_status -objlayout_read_pagelist(struct nfs_read_data *rdata) +objlayout_read_pagelist(struct nfs_pgio_data *rdata)  {  	struct nfs_pgio_header *hdr = rdata->header;  	struct inode *inode = hdr->inode; @@ -312,11 +312,11 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)  static void _rpc_write_complete(struct work_struct *work)  {  	struct rpc_task *task; -	struct nfs_write_data *wdata; +	struct nfs_pgio_data *wdata;  	dprintk("%s enter\n", __func__);  	task = container_of(work, struct rpc_task, u.tk_work); -	wdata = container_of(task, struct nfs_write_data, task); +	wdata = container_of(task, struct nfs_pgio_data, task);  	pnfs_ld_write_done(wdata);  } @@ -324,7 +324,7 @@ static void _rpc_write_complete(struct work_struct *work)  void  objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)  { -	struct nfs_write_data *wdata = oir->rpcdata; +	struct nfs_pgio_data *wdata = oir->rpcdata;  	oir->status = wdata->task.tk_status = status;  	if (status >= 0) { @@ -351,7 +351,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)   * Perform sync or async writes.   
*/  enum pnfs_try_status -objlayout_write_pagelist(struct nfs_write_data *wdata, +objlayout_write_pagelist(struct nfs_pgio_data *wdata,  			 int how)  {  	struct nfs_pgio_header *hdr = wdata->header; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 87aa1dec612..01e041029a6 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);   */  extern void objio_free_result(struct objlayout_io_res *oir); -extern int objio_read_pagelist(struct nfs_read_data *rdata); -extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); +extern int objio_read_pagelist(struct nfs_pgio_data *rdata); +extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how);  /*   * callback API @@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(  extern void objlayout_free_lseg(struct pnfs_layout_segment *);  extern enum pnfs_try_status objlayout_read_pagelist( -	struct nfs_read_data *); +	struct nfs_pgio_data *);  extern enum pnfs_try_status objlayout_write_pagelist( -	struct nfs_write_data *, +	struct nfs_pgio_data *,  	int how);  extern void objlayout_encode_layoutcommit( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 2ffebf2081c..17fab89f635 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -24,9 +24,12 @@  #include "internal.h"  #include "pnfs.h" +#define NFSDBG_FACILITY		NFSDBG_PAGECACHE +  static struct kmem_cache *nfs_page_cachep; +static const struct rpc_call_ops nfs_pgio_common_ops; -bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)  {  	p->npages = pagecount;  	if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -95,7 +98,7 @@ nfs_iocounter_dec(struct nfs_io_counter *c)  {  	if (atomic_dec_and_test(&c->io_count)) {  		clear_bit(NFS_IO_INPROGRESS, &c->flags); -		smp_mb__after_clear_bit(); +		
smp_mb__after_atomic();  		wake_up_bit(&c->flags, NFS_IO_INPROGRESS);  	}  } @@ -133,11 +136,168 @@ nfs_iocounter_wait(struct nfs_io_counter *c)  	return __nfs_iocounter_wait(c);  } +static int nfs_wait_bit_uninterruptible(void *word) +{ +	io_schedule(); +	return 0; +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req - request in group that is to be locked + * + * this lock must be held if modifying the page group list + */ +void +nfs_page_group_lock(struct nfs_page *req) +{ +	struct nfs_page *head = req->wb_head; + +	WARN_ON_ONCE(head != head->wb_head); + +	wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, +			nfs_wait_bit_uninterruptible, +			TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req - request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ +	struct nfs_page *head = req->wb_head; + +	WARN_ON_ONCE(head != head->wb_head); + +	smp_mb__before_atomic(); +	clear_bit(PG_HEADLOCK, &head->wb_flags); +	smp_mb__after_atomic(); +	wake_up_bit(&head->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_sync_on_bit_locked + * + * must be called with page group lock held + */ +static bool +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +{ +	struct nfs_page *head = req->wb_head; +	struct nfs_page *tmp; + +	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); +	WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); + +	tmp = req->wb_this_page; +	while (tmp != req) { +		if (!test_bit(bit, &tmp->wb_flags)) +			return false; +		tmp = tmp->wb_this_page; +	} + +	/* true! 
reset all bits */ +	tmp = req; +	do { +		clear_bit(bit, &tmp->wb_flags); +		tmp = tmp->wb_this_page; +	} while (tmp != req); + +	return true; +} + +/* + * nfs_page_group_sync_on_bit - set bit on current request, but only + *   return true if the bit is set for all requests in page group + * @req - request in page group + * @bit - PG_* bit that is used to sync page group + */ +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) +{ +	bool ret; + +	nfs_page_group_lock(req); +	ret = nfs_page_group_sync_on_bit_locked(req, bit); +	nfs_page_group_unlock(req); + +	return ret; +} + +/* + * nfs_page_group_init - Initialize the page group linkage for @req + * @req - a new nfs request + * @prev - the previous request in page group, or NULL if @req is the first + *         or only request in the group (the head). + */ +static inline void +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) +{ +	WARN_ON_ONCE(prev == req); + +	if (!prev) { +		/* a head request */ +		req->wb_head = req; +		req->wb_this_page = req; +	} else { +		/* a subrequest */ +		WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); +		WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); +		req->wb_head = prev->wb_head; +		req->wb_this_page = prev->wb_this_page; +		prev->wb_this_page = req; + +		/* All subrequests take a ref on the head request until +		 * nfs_page_group_destroy is called */ +		kref_get(&req->wb_head->wb_kref); + +		/* grab extra ref if head request has extra ref from +		 * the write/commit path to handle handoff between write +		 * and commit lists */ +		if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { +			set_bit(PG_INODE_REF, &req->wb_flags); +			kref_get(&req->wb_kref); +		} +	} +} + +/* + * nfs_page_group_destroy - sync the destruction of page groups + * @req - request that no longer needs the page group + * + * releases the page group reference from each member once all + * members have called this function. 
+ */ +static void +nfs_page_group_destroy(struct kref *kref) +{ +	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); +	struct nfs_page *tmp, *next; + +	/* subrequests must release the ref on the head request */ +	if (req->wb_head != req) +		nfs_release_request(req->wb_head); + +	if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) +		return; + +	tmp = req; +	do { +		next = tmp->wb_this_page; +		/* unlink and free */ +		tmp->wb_this_page = tmp; +		tmp->wb_head = tmp; +		nfs_free_request(tmp); +		tmp = next; +	} while (tmp != req); +} +  /**   * nfs_create_request - Create an NFS read/write request.   * @ctx: open context to use - * @inode: inode to which the request is attached   * @page: page to write + * @last: last nfs request created for this page group or NULL if head   * @offset: starting offset within the page for the write   * @count: number of bytes to read/write   * @@ -146,9 +306,9 @@ nfs_iocounter_wait(struct nfs_io_counter *c)   * User should ensure it is safe to sleep in this function.   
*/  struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, -		   struct page *page, -		   unsigned int offset, unsigned int count) +nfs_create_request(struct nfs_open_context *ctx, struct page *page, +		   struct nfs_page *last, unsigned int offset, +		   unsigned int count)  {  	struct nfs_page		*req;  	struct nfs_lock_context *l_ctx; @@ -180,6 +340,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,  	req->wb_bytes   = count;  	req->wb_context = get_nfs_open_context(ctx);  	kref_init(&req->wb_kref); +	nfs_page_group_init(req, last);  	return req;  } @@ -193,9 +354,9 @@ void nfs_unlock_request(struct nfs_page *req)  		printk(KERN_ERR "NFS: Invalid unlock attempted\n");  		BUG();  	} -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(PG_BUSY, &req->wb_flags); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(&req->wb_flags, PG_BUSY);  } @@ -237,16 +398,22 @@ static void nfs_clear_request(struct nfs_page *req)  	}  } -  /**   * nfs_release_request - Release the count on an NFS read/write request   * @req: request to release   *   * Note: Should never be called with the spinlock held!   
*/ -static void nfs_free_request(struct kref *kref) +void nfs_free_request(struct nfs_page *req)  { -	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); +	WARN_ON_ONCE(req->wb_this_page != req); + +	/* extra debug: make sure no sync bits are still set */ +	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); +	WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); +	WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); +	WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags)); +	WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));  	/* Release struct file and open context */  	nfs_clear_request(req); @@ -255,13 +422,7 @@ static void nfs_free_request(struct kref *kref)  void nfs_release_request(struct nfs_page *req)  { -	kref_put(&req->wb_kref, nfs_free_request); -} - -static int nfs_wait_bit_uninterruptible(void *word) -{ -	io_schedule(); -	return 0; +	kref_put(&req->wb_kref, nfs_page_group_destroy);  }  /** @@ -279,22 +440,249 @@ nfs_wait_on_request(struct nfs_page *req)  			TASK_UNINTERRUPTIBLE);  } -bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +/* + * nfs_generic_pg_test - determine if requests can be coalesced + * @desc: pointer to descriptor + * @prev: previous request in desc, or NULL + * @req: this request + * + * Returns zero if @req can be coalesced into @desc, otherwise it returns + * the size of the request. + */ +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, +			   struct nfs_page *prev, struct nfs_page *req)  { -	/* -	 * FIXME: ideally we should be able to coalesce all requests -	 * that are not block boundary aligned, but currently this -	 * is problematic for the case of bsize < PAGE_CACHE_SIZE, -	 * since nfs_flush_multi and nfs_pagein_multi assume you -	 * can have only one struct nfs_page. 
-	 */ -	if (desc->pg_bsize < PAGE_SIZE) +	if (desc->pg_count > desc->pg_bsize) { +		/* should never happen */ +		WARN_ON_ONCE(1);  		return 0; +	} -	return desc->pg_count + req->wb_bytes <= desc->pg_bsize; +	return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);  }  EXPORT_SYMBOL_GPL(nfs_generic_pg_test); +static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) +{ +	return container_of(hdr, struct nfs_rw_header, header); +} + +/** + * nfs_rw_header_alloc - Allocate a header for a read or write + * @ops: Read or write function vector + */ +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) +{ +	struct nfs_rw_header *header = ops->rw_alloc_header(); + +	if (header) { +		struct nfs_pgio_header *hdr = &header->header; + +		INIT_LIST_HEAD(&hdr->pages); +		spin_lock_init(&hdr->lock); +		atomic_set(&hdr->refcnt, 0); +		hdr->rw_ops = ops; +	} +	return header; +} +EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); + +/* + * nfs_rw_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_rw_header_free(struct nfs_pgio_header *hdr) +{ +	hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); +} +EXPORT_SYMBOL_GPL(nfs_rw_header_free); + +/** + * nfs_pgio_data_alloc - Allocate pageio data + * @hdr: The header making a request + * @pagecount: Number of pages to create + */ +static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, +						 unsigned int pagecount) +{ +	struct nfs_pgio_data *data, *prealloc; + +	prealloc = &NFS_RW_HEADER(hdr)->rpc_data; +	if (prealloc->header == NULL) +		data = prealloc; +	else +		data = kzalloc(sizeof(*data), GFP_KERNEL); +	if (!data) +		goto out; + +	if (nfs_pgarray_set(&data->pages, pagecount)) { +		data->header = hdr; +		atomic_inc(&hdr->refcnt); +	} else { +		if (data != prealloc) +			kfree(data); +		data = NULL; +	} +out: +	return data; +} + +/** + * nfs_pgio_data_release - Properly free pageio data + * @data: The data to release + */ +void 
nfs_pgio_data_release(struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; +	struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); + +	put_nfs_open_context(data->args.context); +	if (data->pages.pagevec != data->pages.page_array) +		kfree(data->pages.pagevec); +	if (data == &pageio_header->rpc_data) { +		data->header = NULL; +		data = NULL; +	} +	if (atomic_dec_and_test(&hdr->refcnt)) +		hdr->completion_ops->completion(hdr); +	/* Note: we only free the rpc_task after callbacks are done. +	 * See the comment in rpc_free_task() for why +	 */ +	kfree(data); +} +EXPORT_SYMBOL_GPL(nfs_pgio_data_release); + +/** + * nfs_pgio_rpcsetup - Set up arguments for a pageio call + * @data: The pageio data + * @count: Number of bytes to read + * @offset: Initial offset + * @how: How to commit data (writes only) + * @cinfo: Commit information for the call (writes only) + */ +static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, +			      unsigned int count, unsigned int offset, +			      int how, struct nfs_commit_info *cinfo) +{ +	struct nfs_page *req = data->header->req; + +	/* Set up the RPC argument and reply structs +	 * NB: take care not to mess about with data->commit et al. 
*/ + +	data->args.fh     = NFS_FH(data->header->inode); +	data->args.offset = req_offset(req) + offset; +	/* pnfs_set_layoutcommit needs this */ +	data->mds_offset = data->args.offset; +	data->args.pgbase = req->wb_pgbase + offset; +	data->args.pages  = data->pages.pagevec; +	data->args.count  = count; +	data->args.context = get_nfs_open_context(req->wb_context); +	data->args.lock_context = req->wb_lock_context; +	data->args.stable  = NFS_UNSTABLE; +	switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { +	case 0: +		break; +	case FLUSH_COND_STABLE: +		if (nfs_reqs_to_commit(cinfo)) +			break; +	default: +		data->args.stable = NFS_FILE_SYNC; +	} + +	data->res.fattr   = &data->fattr; +	data->res.count   = count; +	data->res.eof     = 0; +	data->res.verf    = &data->verf; +	nfs_fattr_init(&data->fattr); +} + +/** + * nfs_pgio_prepare - Prepare pageio data to go over the wire + * @task: The current task + * @calldata: pageio data to prepare + */ +static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) +{ +	struct nfs_pgio_data *data = calldata; +	int err; +	err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); +	if (err) +		rpc_exit(task, err); +} + +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, +		      const struct rpc_call_ops *call_ops, int how, int flags) +{ +	struct rpc_task *task; +	struct rpc_message msg = { +		.rpc_argp = &data->args, +		.rpc_resp = &data->res, +		.rpc_cred = data->header->cred, +	}; +	struct rpc_task_setup task_setup_data = { +		.rpc_client = clnt, +		.task = &data->task, +		.rpc_message = &msg, +		.callback_ops = call_ops, +		.callback_data = data, +		.workqueue = nfsiod_workqueue, +		.flags = RPC_TASK_ASYNC | flags, +	}; +	int ret = 0; + +	data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); + +	dprintk("NFS: %5u initiated pgio call " +		"(req %s/%llu, %u bytes @ offset %llu)\n", +		data->task.tk_pid, +		data->header->inode->i_sb->s_id, +		(unsigned long 
long)NFS_FILEID(data->header->inode), +		data->args.count, +		(unsigned long long)data->args.offset); + +	task = rpc_run_task(&task_setup_data); +	if (IS_ERR(task)) { +		ret = PTR_ERR(task); +		goto out; +	} +	if (how & FLUSH_SYNC) { +		ret = rpc_wait_for_completion_task(task); +		if (ret == 0) +			ret = task->tk_status; +	} +	rpc_put_task(task); +out: +	return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_pgio); + +/** + * nfs_pgio_error - Clean up from a pageio error + * @desc: IO descriptor + * @hdr: pageio header + */ +static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, +			  struct nfs_pgio_header *hdr) +{ +	set_bit(NFS_IOHDR_REDO, &hdr->flags); +	nfs_pgio_data_release(hdr->data); +	hdr->data = NULL; +	desc->pg_completion_ops->error_cleanup(&desc->pg_list); +	return -ENOMEM; +} + +/** + * nfs_pgio_release - Release pageio data + * @calldata: The pageio data to release + */ +static void nfs_pgio_release(void *calldata) +{ +	struct nfs_pgio_data *data = calldata; +	if (data->header->rw_ops->rw_release) +		data->header->rw_ops->rw_release(data); +	nfs_pgio_data_release(data); +} +  /**   * nfs_pageio_init - initialise a page io descriptor   * @desc: pointer to descriptor @@ -307,6 +695,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,  		     struct inode *inode,  		     const struct nfs_pageio_ops *pg_ops,  		     const struct nfs_pgio_completion_ops *compl_ops, +		     const struct nfs_rw_ops *rw_ops,  		     size_t bsize,  		     int io_flags)  { @@ -320,6 +709,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,  	desc->pg_inode = inode;  	desc->pg_ops = pg_ops;  	desc->pg_completion_ops = compl_ops; +	desc->pg_rw_ops = rw_ops;  	desc->pg_ioflags = io_flags;  	desc->pg_error = 0;  	desc->pg_lseg = NULL; @@ -328,6 +718,94 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,  }  EXPORT_SYMBOL_GPL(nfs_pageio_init); +/** + * nfs_pgio_result - Basic pageio error handling + * @task: The task that ran + * @calldata: Pageio data to 
check + */ +static void nfs_pgio_result(struct rpc_task *task, void *calldata) +{ +	struct nfs_pgio_data *data = calldata; +	struct inode *inode = data->header->inode; + +	dprintk("NFS: %s: %5u, (status %d)\n", __func__, +		task->tk_pid, task->tk_status); + +	if (data->header->rw_ops->rw_done(task, data, inode) != 0) +		return; +	if (task->tk_status < 0) +		nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); +	else +		data->header->rw_ops->rw_result(task, data); +} + +/* + * Create an RPC task for the given read or write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. + */ +int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, +		     struct nfs_pgio_header *hdr) +{ +	struct nfs_page		*req; +	struct page		**pages; +	struct nfs_pgio_data	*data; +	struct list_head *head = &desc->pg_list; +	struct nfs_commit_info cinfo; + +	data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, +							   desc->pg_count)); +	if (!data) +		return nfs_pgio_error(desc, hdr); + +	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); +	pages = data->pages.pagevec; +	while (!list_empty(head)) { +		req = nfs_list_entry(head->next); +		nfs_list_remove_request(req); +		nfs_list_add_request(req, &hdr->pages); +		*pages++ = req->wb_page; +	} + +	if ((desc->pg_ioflags & FLUSH_COND_STABLE) && +	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) +		desc->pg_ioflags &= ~FLUSH_COND_STABLE; + +	/* Set up the argument struct */ +	nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); +	hdr->data = data; +	desc->pg_rpc_callops = &nfs_pgio_common_ops; +	return 0; +} +EXPORT_SYMBOL_GPL(nfs_generic_pgio); + +static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) +{ +	struct nfs_rw_header *rw_hdr; +	struct nfs_pgio_header *hdr; +	int ret; + +	
rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); +	if (!rw_hdr) { +		desc->pg_completion_ops->error_cleanup(&desc->pg_list); +		return -ENOMEM; +	} +	hdr = &rw_hdr->header; +	nfs_pgheader_init(desc, hdr, nfs_rw_header_free); +	atomic_inc(&hdr->refcnt); +	ret = nfs_generic_pgio(desc, hdr); +	if (ret == 0) +		ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), +					hdr->data, desc->pg_rpc_callops, +					desc->pg_ioflags, 0); +	if (atomic_dec_and_test(&hdr->refcnt)) +		hdr->completion_ops->completion(hdr); +	return ret; +} +  static bool nfs_match_open_context(const struct nfs_open_context *ctx1,  		const struct nfs_open_context *ctx2)  { @@ -356,18 +834,23 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,  				      struct nfs_page *req,  				      struct nfs_pageio_descriptor *pgio)  { -	if (!nfs_match_open_context(req->wb_context, prev->wb_context)) -		return false; -	if (req->wb_context->dentry->d_inode->i_flock != NULL && -	    !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) -		return false; -	if (req->wb_pgbase != 0) -		return false; -	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) -		return false; -	if (req_offset(req) != req_offset(prev) + prev->wb_bytes) -		return false; -	return pgio->pg_ops->pg_test(pgio, prev, req); +	size_t size; + +	if (prev) { +		if (!nfs_match_open_context(req->wb_context, prev->wb_context)) +			return false; +		if (req->wb_context->dentry->d_inode->i_flock != NULL && +		    !nfs_match_lock_context(req->wb_lock_context, +					    prev->wb_lock_context)) +			return false; +		if (req_offset(req) != req_offset(prev) + prev->wb_bytes) +			return false; +	} +	size = pgio->pg_ops->pg_test(pgio, prev, req); +	WARN_ON_ONCE(size > req->wb_bytes); +	if (size && size < req->wb_bytes) +		req->wb_bytes = size; +	return size > 0;  }  /** @@ -381,17 +864,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,  static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,  				     struct 
nfs_page *req)  { +	struct nfs_page *prev = NULL;  	if (desc->pg_count != 0) { -		struct nfs_page *prev; -  		prev = nfs_list_entry(desc->pg_list.prev); -		if (!nfs_can_coalesce_requests(prev, req, desc)) -			return 0;  	} else {  		if (desc->pg_ops->pg_init)  			desc->pg_ops->pg_init(desc, req);  		desc->pg_base = req->wb_pgbase;  	} +	if (!nfs_can_coalesce_requests(prev, req, desc)) +		return 0;  	nfs_list_remove_request(req);  	nfs_list_add_request(req, &desc->pg_list);  	desc->pg_count += req->wb_bytes; @@ -421,22 +903,72 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)   * @desc: destination io descriptor   * @req: request   * + * This may split a request into subrequests which are all part of the + * same page group. + *   * Returns true if the request 'req' was successfully coalesced into the   * existing list of pages 'desc'.   */  static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,  			   struct nfs_page *req)  { -	while (!nfs_pageio_do_add_request(desc, req)) { -		desc->pg_moreio = 1; -		nfs_pageio_doio(desc); -		if (desc->pg_error < 0) -			return 0; -		desc->pg_moreio = 0; -		if (desc->pg_recoalesce) -			return 0; -	} +	struct nfs_page *subreq; +	unsigned int bytes_left = 0; +	unsigned int offset, pgbase; + +	nfs_page_group_lock(req); + +	subreq = req; +	bytes_left = subreq->wb_bytes; +	offset = subreq->wb_offset; +	pgbase = subreq->wb_pgbase; + +	do { +		if (!nfs_pageio_do_add_request(desc, subreq)) { +			/* make sure pg_test call(s) did nothing */ +			WARN_ON_ONCE(subreq->wb_bytes != bytes_left); +			WARN_ON_ONCE(subreq->wb_offset != offset); +			WARN_ON_ONCE(subreq->wb_pgbase != pgbase); + +			nfs_page_group_unlock(req); +			desc->pg_moreio = 1; +			nfs_pageio_doio(desc); +			if (desc->pg_error < 0) +				return 0; +			if (desc->pg_recoalesce) +				return 0; +			/* retry add_request for this subreq */ +			nfs_page_group_lock(req); +			continue; +		} + +		/* check for buggy pg_test call(s) */ +		
WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); +		WARN_ON_ONCE(subreq->wb_bytes > bytes_left); +		WARN_ON_ONCE(subreq->wb_bytes == 0); + +		bytes_left -= subreq->wb_bytes; +		offset += subreq->wb_bytes; +		pgbase += subreq->wb_bytes; + +		if (bytes_left) { +			subreq = nfs_create_request(req->wb_context, +					req->wb_page, +					subreq, pgbase, bytes_left); +			if (IS_ERR(subreq)) +				goto err_ptr; +			nfs_lock_request(subreq); +			subreq->wb_offset  = offset; +			subreq->wb_index = req->wb_index; +		} +	} while (bytes_left > 0); + +	nfs_page_group_unlock(req);  	return 1; +err_ptr: +	desc->pg_error = PTR_ERR(subreq); +	nfs_page_group_unlock(req); +	return 0;  }  static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -449,6 +981,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)  		desc->pg_count = 0;  		desc->pg_base = 0;  		desc->pg_recoalesce = 0; +		desc->pg_moreio = 0;  		while (!list_empty(&head)) {  			struct nfs_page *req; @@ -535,3 +1068,13 @@ void nfs_destroy_nfspagecache(void)  	kmem_cache_destroy(nfs_page_cachep);  } +static const struct rpc_call_ops nfs_pgio_common_ops = { +	.rpc_call_prepare = nfs_pgio_prepare, +	.rpc_call_done = nfs_pgio_result, +	.rpc_release = nfs_pgio_release, +}; + +const struct nfs_pageio_ops nfs_pgio_rw_ops = { +	.pg_test = nfs_generic_pg_test, +	.pg_doio = nfs_generic_pg_pgios, +}; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d75d938d36c..6fdcd233d6f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)   */  static bool pnfs_seqid_is_newer(u32 s1, u32 s2)  { -	return (s32)s1 - (s32)s2 > 0; +	return (s32)(s1 - s2) > 0; +} + +static void +pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, +		const nfs4_stateid *new, +		struct list_head *free_me_list) +{ +	if (nfs4_stateid_match_other(&lo->plh_stateid, new)) +		return; +	/* Layout is new! 
Kill existing layout segments */ +	pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);  }  /* update lo->plh_stateid with new if is more recent */ @@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)  	struct nfs4_layoutget_res *res = &lgp->res;  	struct pnfs_layout_segment *lseg;  	struct inode *ino = lo->plh_inode; +	LIST_HEAD(free_me);  	int status = 0;  	/* Inject layout blob into I/O device driver */ @@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)  		goto out_forget_reply;  	} +	/* Check that the new stateid matches the old stateid */ +	pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);  	/* Done processing layoutget. Set the layout stateid */  	pnfs_set_layout_stateid(lo, &res->stateid, false); @@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)  	}  	spin_unlock(&ino->i_lock); +	pnfs_free_lseg_list(&free_me);  	return lseg;  out:  	return ERR_PTR(status); @@ -1373,11 +1388,6 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r  	WARN_ON_ONCE(pgio->pg_lseg != NULL); -	if (req->wb_offset != req->wb_pgbase) { -		nfs_pageio_reset_read_mds(pgio); -		return; -	} -  	if (pgio->pg_dreq == NULL)  		rd_size = i_size_read(pgio->pg_inode) - req_offset(req);  	else @@ -1402,11 +1412,6 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,  {  	WARN_ON_ONCE(pgio->pg_lseg != NULL); -	if (req->wb_offset != req->wb_pgbase) { -		nfs_pageio_reset_write_mds(pgio); -		return; -	} -  	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,  					   req->wb_context,  					   req_offset(req), @@ -1419,56 +1424,49 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,  }  EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); -void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, -		      const struct nfs_pgio_completion_ops *compl_ops) -{ -	struct nfs_server *server = NFS_SERVER(inode); -	struct pnfs_layoutdriver_type *ld = 
server->pnfs_curr_ld; - -	if (ld == NULL) -		nfs_pageio_init_read(pgio, inode, compl_ops); -	else -		nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); -} - -void -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, -		       int ioflags, -		       const struct nfs_pgio_completion_ops *compl_ops) -{ -	struct nfs_server *server = NFS_SERVER(inode); -	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - -	if (ld == NULL) -		nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); -	else -		nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags); -} - -bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +size_t  pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,  		     struct nfs_page *req)  { -	if (pgio->pg_lseg == NULL) -		return nfs_generic_pg_test(pgio, prev, req); +	unsigned int size; +	u64 seg_end, req_start, seg_left; + +	size = nfs_generic_pg_test(pgio, prev, req); +	if (!size) +		return 0;  	/* -	 * Test if a nfs_page is fully contained in the pnfs_layout_range. -	 * Note that this test makes several assumptions: -	 * - that the previous nfs_page in the struct nfs_pageio_descriptor -	 *   is known to lie within the range. -	 *   - that the nfs_page being tested is known to be contiguous with the -	 *   previous nfs_page. -	 *   - Layout ranges are page aligned, so we only have to test the -	 *   start offset of the request. +	 * 'size' contains the number of bytes left in the current page (up +	 * to the original size asked for in @req->wb_bytes). +	 * +	 * Calculate how many bytes are left in the layout segment +	 * and if there are less bytes than 'size', return that instead.  	 *  	 * Please also note that 'end_offset' is actually the offset of the  	 * first byte that lies outside the pnfs_layout_range. FIXME?  	 
*  	 */ -	return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, -					 pgio->pg_lseg->pls_range.length); +	if (pgio->pg_lseg) { +		seg_end = end_offset(pgio->pg_lseg->pls_range.offset, +				     pgio->pg_lseg->pls_range.length); +		req_start = req_offset(req); +		WARN_ON_ONCE(req_start > seg_end); +		/* start of request is past the last byte of this segment */ +		if (req_start >= seg_end) +			return 0; + +		/* adjust 'size' iff there are fewer bytes left in the +		 * segment than what nfs_generic_pg_test returned */ +		seg_left = seg_end - req_start; +		if (seg_left < size) +			size = (unsigned int)seg_left; +	} + +	return size;  }  EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); @@ -1481,7 +1479,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode,  	LIST_HEAD(failed);  	/* Resend all requests through the MDS */ -	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); +	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops);  	pgio.pg_dreq = dreq;  	while (!list_empty(head)) {  		struct nfs_page *req = nfs_list_entry(head->next); @@ -1504,7 +1502,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode,  }  EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); -static void pnfs_ld_handle_write_error(struct nfs_write_data *data) +static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header; @@ -1523,7 +1521,7 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)  /*   * Called by non rpc-based layout drivers   */ -void pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header; @@ -1539,7 +1537,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_write_done);  static void  pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, -		struct nfs_write_data *data) +		struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header; @@ -1548,11 +1546,11 @@ pnfs_write_through_mds(struct 
nfs_pageio_descriptor *desc,  		nfs_pageio_reset_write_mds(desc);  		desc->pg_recoalesce = 1;  	} -	nfs_writedata_release(data); +	nfs_pgio_data_release(data);  }  static enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *wdata, +pnfs_try_to_write_data(struct nfs_pgio_data *wdata,  			const struct rpc_call_ops *call_ops,  			struct pnfs_layout_segment *lseg,  			int how) @@ -1574,41 +1572,36 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,  }  static void -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +pnfs_do_write(struct nfs_pageio_descriptor *desc, +	      struct nfs_pgio_header *hdr, int how)  { -	struct nfs_write_data *data; +	struct nfs_pgio_data *data = hdr->data;  	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;  	struct pnfs_layout_segment *lseg = desc->pg_lseg; +	enum pnfs_try_status trypnfs;  	desc->pg_lseg = NULL; -	while (!list_empty(head)) { -		enum pnfs_try_status trypnfs; - -		data = list_first_entry(head, struct nfs_write_data, list); -		list_del_init(&data->list); - -		trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); -		if (trypnfs == PNFS_NOT_ATTEMPTED) -			pnfs_write_through_mds(desc, data); -	} +	trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); +	if (trypnfs == PNFS_NOT_ATTEMPTED) +		pnfs_write_through_mds(desc, data);  	pnfs_put_lseg(lseg);  }  static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)  {  	pnfs_put_lseg(hdr->lseg); -	nfs_writehdr_free(hdr); +	nfs_rw_header_free(hdr);  }  EXPORT_SYMBOL_GPL(pnfs_writehdr_free);  int  pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)  { -	struct nfs_write_header *whdr; +	struct nfs_rw_header *whdr;  	struct nfs_pgio_header *hdr;  	int ret; -	whdr = nfs_writehdr_alloc(); +	whdr = nfs_rw_header_alloc(desc->pg_rw_ops);  	if (!whdr) {  		desc->pg_completion_ops->error_cleanup(&desc->pg_list);  		pnfs_put_lseg(desc->pg_lseg); @@ -1619,12 +1612,12 @@ 
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)  	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);  	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);  	atomic_inc(&hdr->refcnt); -	ret = nfs_generic_flush(desc, hdr); +	ret = nfs_generic_pgio(desc, hdr);  	if (ret != 0) {  		pnfs_put_lseg(desc->pg_lseg);  		desc->pg_lseg = NULL;  	} else -		pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); +		pnfs_do_write(desc, hdr, desc->pg_ioflags);  	if (atomic_dec_and_test(&hdr->refcnt))  		hdr->completion_ops->completion(hdr);  	return ret; @@ -1640,7 +1633,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode,  	LIST_HEAD(failed);  	/* Resend all requests through the MDS */ -	nfs_pageio_init_read(&pgio, inode, compl_ops); +	nfs_pageio_init_read(&pgio, inode, true, compl_ops);  	pgio.pg_dreq = dreq;  	while (!list_empty(head)) {  		struct nfs_page *req = nfs_list_entry(head->next); @@ -1659,7 +1652,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode,  }  EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); -static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header; @@ -1678,7 +1671,7 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)  /*   * Called by non rpc-based layout drivers   */ -void pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header; @@ -1694,7 +1687,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_read_done);  static void  pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, -		struct nfs_read_data *data) +		struct nfs_pgio_data *data)  {  	struct nfs_pgio_header *hdr = data->header; @@ -1703,14 +1696,14 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,  		nfs_pageio_reset_read_mds(desc);  		desc->pg_recoalesce = 1;  	} -	nfs_readdata_release(data); +	nfs_pgio_data_release(data);  }  /*   * Call the 
appropriate parallel I/O subsystem read function.   */  static enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *rdata, +pnfs_try_to_read_data(struct nfs_pgio_data *rdata,  		       const struct rpc_call_ops *call_ops,  		       struct pnfs_layout_segment *lseg)  { @@ -1732,41 +1725,35 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,  }  static void -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)  { -	struct nfs_read_data *data; +	struct nfs_pgio_data *data = hdr->data;  	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;  	struct pnfs_layout_segment *lseg = desc->pg_lseg; +	enum pnfs_try_status trypnfs;  	desc->pg_lseg = NULL; -	while (!list_empty(head)) { -		enum pnfs_try_status trypnfs; - -		data = list_first_entry(head, struct nfs_read_data, list); -		list_del_init(&data->list); - -		trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); -		if (trypnfs == PNFS_NOT_ATTEMPTED) -			pnfs_read_through_mds(desc, data); -	} +	trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); +	if (trypnfs == PNFS_NOT_ATTEMPTED) +		pnfs_read_through_mds(desc, data);  	pnfs_put_lseg(lseg);  }  static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)  {  	pnfs_put_lseg(hdr->lseg); -	nfs_readhdr_free(hdr); +	nfs_rw_header_free(hdr);  }  EXPORT_SYMBOL_GPL(pnfs_readhdr_free);  int  pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)  { -	struct nfs_read_header *rhdr; +	struct nfs_rw_header *rhdr;  	struct nfs_pgio_header *hdr;  	int ret; -	rhdr = nfs_readhdr_alloc(); +	rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);  	if (!rhdr) {  		desc->pg_completion_ops->error_cleanup(&desc->pg_list);  		ret = -ENOMEM; @@ -1778,18 +1765,27 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)  	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);  	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);  	atomic_inc(&hdr->refcnt); -	ret = 
nfs_generic_pagein(desc, hdr); +	ret = nfs_generic_pgio(desc, hdr);  	if (ret != 0) {  		pnfs_put_lseg(desc->pg_lseg);  		desc->pg_lseg = NULL;  	} else -		pnfs_do_multiple_reads(desc, &hdr->rpc_list); +		pnfs_do_read(desc, hdr);  	if (atomic_dec_and_test(&hdr->refcnt))  		hdr->completion_ops->completion(hdr);  	return ret;  }  EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ +	unsigned long *bitlock = &NFS_I(inode)->flags; + +	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); +	smp_mb__after_atomic(); +	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} +  /*   * There can be multiple RW segments.   */ @@ -1807,7 +1803,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)  static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)  {  	struct pnfs_layout_segment *lseg, *tmp; -	unsigned long *bitlock = &NFS_I(inode)->flags;  	/* Matched by references in pnfs_set_layoutcommit */  	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { @@ -1815,9 +1810,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis  		pnfs_put_lseg(lseg);  	} -	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); -	smp_mb__after_clear_bit(); -	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +	pnfs_clear_layoutcommitting(inode);  }  void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) @@ -1827,7 +1820,7 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)  EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);  void -pnfs_set_layoutcommit(struct nfs_write_data *wdata) +pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)  {  	struct nfs_pgio_header *hdr = wdata->header;  	struct inode *inode = hdr->inode; @@ -1881,43 +1874,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)  	struct nfs4_layoutcommit_data *data;  	struct nfs_inode *nfsi = NFS_I(inode);  	loff_t end_pos; -	int status = 0; - -	dprintk("--> %s inode %lu\n", __func__, 
inode->i_ino); +	int status; -	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) +	if (!pnfs_layoutcommit_outstanding(inode))  		return 0; -	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ -	data = kzalloc(sizeof(*data), GFP_NOFS); -	if (!data) { -		status = -ENOMEM; -		goto out; -	} - -	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) -		goto out_free; +	dprintk("--> %s inode %lu\n", __func__, inode->i_ino); +	status = -EAGAIN;  	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { -		if (!sync) { -			status = -EAGAIN; -			goto out_free; -		} -		status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, -					nfs_wait_bit_killable, TASK_KILLABLE); +		if (!sync) +			goto out; +		status = wait_on_bit_lock(&nfsi->flags, +				NFS_INO_LAYOUTCOMMITTING, +				nfs_wait_bit_killable, +				TASK_KILLABLE);  		if (status) -			goto out_free; +			goto out;  	} -	INIT_LIST_HEAD(&data->lseg_list); +	status = -ENOMEM; +	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ +	data = kzalloc(sizeof(*data), GFP_NOFS); +	if (!data) +		goto clear_layoutcommitting; + +	status = 0;  	spin_lock(&inode->i_lock); -	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { -		clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); -		spin_unlock(&inode->i_lock); -		wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); -		goto out_free; -	} +	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) +		goto out_unlock; +	INIT_LIST_HEAD(&data->lseg_list);  	pnfs_list_write_lseg(inode, &data->lseg_list);  	end_pos = nfsi->layout->plh_lwb; @@ -1940,8 +1927,11 @@ out:  		mark_inode_dirty_sync(inode);  	dprintk("<-- %s status %d\n", __func__, status);  	return status; -out_free: +out_unlock: +	spin_unlock(&inode->i_lock);  	kfree(data); +clear_layoutcommitting: +	pnfs_clear_layoutcommitting(inode);  	goto out;  } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a4f41810a7f..4fb309a2b4c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -113,8 +113,8 @@ 
struct pnfs_layoutdriver_type {  	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted  	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS  	 */ -	enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); -	enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); +	enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); +	enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);  	void (*free_deviceid_node) (struct nfs4_deviceid_node *); @@ -180,11 +180,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);  void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);  void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, -			   const struct nfs_pgio_completion_ops *); -void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, -			    int, const struct nfs_pgio_completion_ops *); -  void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);  void unset_pnfs_layoutdriver(struct nfs_server *);  void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); @@ -192,7 +187,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);  void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,  			        struct nfs_page *req, u64 wb_size);  int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); -bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, +			    struct nfs_page *prev, struct nfs_page *req);  void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);  struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);  void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -217,13 +213,13 @@ bool pnfs_roc(struct inode *ino);  void pnfs_roc_release(struct 
inode *ino);  void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);  bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);  void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);  int pnfs_layoutcommit_inode(struct inode *inode, bool sync);  int _pnfs_return_layout(struct inode *);  int pnfs_commit_and_return_layout(struct inode *); -void pnfs_ld_write_done(struct nfs_write_data *); -void pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_write_done(struct nfs_pgio_data *); +void pnfs_ld_read_done(struct nfs_pgio_data *);  struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,  					       struct nfs_open_context *ctx,  					       loff_t pos, @@ -275,7 +271,7 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)  {  	if (lseg) {  		atomic_inc(&lseg->pls_refcount); -		smp_mb__after_atomic_inc(); +		smp_mb__after_atomic();  	}  	return lseg;  } @@ -359,6 +355,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)  		PNFS_LAYOUTRET_ON_SETATTR;  } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ +	struct nfs_inode *nfsi = NFS_I(inode); + +	return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 || +		test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0; +} +  static inline int pnfs_return_layout(struct inode *ino)  {  	struct nfs_inode *nfsi = NFS_I(ino); @@ -452,18 +457,6 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)  {  } -static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, -					 const struct nfs_pgio_completion_ops *compl_ops) -{ -	nfs_pageio_init_read(pgio, inode, compl_ops); -} - -static inline void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, -					  const struct nfs_pgio_completion_ops *compl_ops) -{ -	nfs_pageio_init_write(pgio, inode, ioflags, 
compl_ops); -} -  static inline int  pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,  		 struct nfs_commit_info *cinfo) @@ -515,6 +508,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,  	return false;  } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ +	return false; +} + +  static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)  {  	return NULL; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index a8f57c728df..c171ce1a8a3 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -235,7 +235,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	};  	int status = -ENOMEM; -	dprintk("NFS call  create %s\n", dentry->d_name.name); +	dprintk("NFS call  create %pd\n", dentry);  	data = nfs_alloc_createdata(dir, dentry, sattr);  	if (data == NULL)  		goto out; @@ -265,7 +265,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	umode_t mode;  	int status = -ENOMEM; -	dprintk("NFS call  mknod %s\n", dentry->d_name.name); +	dprintk("NFS call  mknod %pd\n", dentry);  	mode = sattr->ia_mode;  	if (S_ISFIFO(mode)) { @@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  }  static int -nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, -		struct inode *new_dir, struct qstr *new_name) -{ -	struct nfs_renameargs	arg = { -		.old_dir	= NFS_FH(old_dir), -		.old_name	= old_name, -		.new_dir	= NFS_FH(new_dir), -		.new_name	= new_name, -	}; -	struct rpc_message msg = { -		.rpc_proc	= &nfs_procedures[NFSPROC_RENAME], -		.rpc_argp	= &arg, -	}; -	int			status; - -	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name); -	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); -	nfs_mark_for_revalidate(old_dir); -	nfs_mark_for_revalidate(new_dir); -	dprintk("NFS reply rename: %d\n", status); -	return status; -} - -static int  nfs_proc_link(struct inode *inode, struct inode *dir, 
struct qstr *name)  {  	struct nfs_linkargs	arg = { @@ -423,7 +399,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,  	};  	int status = -ENAMETOOLONG; -	dprintk("NFS call  symlink %s\n", dentry->d_name.name); +	dprintk("NFS call  symlink %pd\n", dentry);  	if (len > NFS2_MAXPATHLEN)  		goto out; @@ -462,7 +438,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)  	};  	int status = -ENOMEM; -	dprintk("NFS call  mkdir %s\n", dentry->d_name.name); +	dprintk("NFS call  mkdir %pd\n", dentry);  	data = nfs_alloc_createdata(dir, dentry, sattr);  	if (data == NULL)  		goto out; @@ -602,7 +578,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,  	return 0;  } -static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)  {  	struct inode *inode = data->header->inode; @@ -618,18 +594,18 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)  	return 0;  } -static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	msg->rpc_proc = &nfs_procedures[NFSPROC_READ];  } -static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)  {  	rpc_call_start(task);  	return 0;  } -static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)  {  	struct inode *inode = data->header->inode; @@ -638,19 +614,13 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)  	return 0;  } -static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message 
*msg)  {  	/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */  	data->args.stable = NFS_FILE_SYNC;  	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];  } -static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ -	rpc_call_start(task); -	return 0; -} -  static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)  {  	BUG(); @@ -745,7 +715,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {  	.unlink_setup	= nfs_proc_unlink_setup,  	.unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,  	.unlink_done	= nfs_proc_unlink_done, -	.rename		= nfs_proc_rename,  	.rename_setup	= nfs_proc_rename_setup,  	.rename_rpc_prepare = nfs_proc_rename_rpc_prepare,  	.rename_done	= nfs_proc_rename_done, @@ -759,13 +728,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = {  	.fsinfo		= nfs_proc_fsinfo,  	.pathconf	= nfs_proc_pathconf,  	.decode_dirent	= nfs2_decode_dirent, +	.pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare,  	.read_setup	= nfs_proc_read_setup, -	.read_pageio_init = nfs_pageio_init_read, -	.read_rpc_prepare = nfs_proc_read_rpc_prepare,  	.read_done	= nfs_read_done,  	.write_setup	= nfs_proc_write_setup, -	.write_pageio_init = nfs_pageio_init_write, -	.write_rpc_prepare = nfs_proc_write_rpc_prepare,  	.write_done	= nfs_write_done,  	.commit_setup	= nfs_proc_commit_setup,  	.commit_rpc_prepare = nfs_proc_commit_rpc_prepare, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 31db5c366b8..e818a475ca6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -24,85 +24,24 @@  #include "internal.h"  #include "iostat.h"  #include "fscache.h" +#include "pnfs.h"  #define NFSDBG_FACILITY		NFSDBG_PAGECACHE -static const struct nfs_pageio_ops nfs_pageio_read_ops; -static const struct rpc_call_ops nfs_read_common_ops;  static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +static const struct nfs_rw_ops nfs_rw_read_ops;  static struct kmem_cache *nfs_rdata_cachep; -struct nfs_read_header 
*nfs_readhdr_alloc(void) +static struct nfs_rw_header *nfs_readhdr_alloc(void)  { -	struct nfs_read_header *rhdr; - -	rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); -	if (rhdr) { -		struct nfs_pgio_header *hdr = &rhdr->header; - -		INIT_LIST_HEAD(&hdr->pages); -		INIT_LIST_HEAD(&hdr->rpc_list); -		spin_lock_init(&hdr->lock); -		atomic_set(&hdr->refcnt, 0); -	} -	return rhdr; +	return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);  } -EXPORT_SYMBOL_GPL(nfs_readhdr_alloc); -static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, -						unsigned int pagecount) +static void nfs_readhdr_free(struct nfs_rw_header *rhdr)  { -	struct nfs_read_data *data, *prealloc; - -	prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data; -	if (prealloc->header == NULL) -		data = prealloc; -	else -		data = kzalloc(sizeof(*data), GFP_KERNEL); -	if (!data) -		goto out; - -	if (nfs_pgarray_set(&data->pages, pagecount)) { -		data->header = hdr; -		atomic_inc(&hdr->refcnt); -	} else { -		if (data != prealloc) -			kfree(data); -		data = NULL; -	} -out: -	return data; -} - -void nfs_readhdr_free(struct nfs_pgio_header *hdr) -{ -	struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header); -  	kmem_cache_free(nfs_rdata_cachep, rhdr);  } -EXPORT_SYMBOL_GPL(nfs_readhdr_free); - -void nfs_readdata_release(struct nfs_read_data *rdata) -{ -	struct nfs_pgio_header *hdr = rdata->header; -	struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header); - -	put_nfs_open_context(rdata->args.context); -	if (rdata->pages.pagevec != rdata->pages.page_array) -		kfree(rdata->pages.pagevec); -	if (rdata == &read_header->rpc_data) { -		rdata->header = NULL; -		rdata = NULL; -	} -	if (atomic_dec_and_test(&hdr->refcnt)) -		hdr->completion_ops->completion(hdr); -	/* Note: we only free the rpc_task after callbacks are done. 
-	 * See the comment in rpc_free_task() for why -	 */ -	kfree(rdata); -} -EXPORT_SYMBOL_GPL(nfs_readdata_release);  static  int nfs_return_empty_page(struct page *page) @@ -114,17 +53,24 @@ int nfs_return_empty_page(struct page *page)  }  void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, -			      struct inode *inode, +			      struct inode *inode, bool force_mds,  			      const struct nfs_pgio_completion_ops *compl_ops)  { -	nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops, -			NFS_SERVER(inode)->rsize, 0); +	struct nfs_server *server = NFS_SERVER(inode); +	const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 +	if (server->pnfs_curr_ld && !force_mds) +		pg_ops = server->pnfs_curr_ld->pg_read_ops; +#endif +	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, +			server->rsize, 0);  }  EXPORT_SYMBOL_GPL(nfs_pageio_init_read);  void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)  { -	pgio->pg_ops = &nfs_pageio_read_ops; +	pgio->pg_ops = &nfs_pgio_rw_ops;  	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;  }  EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); @@ -139,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,  	len = nfs_page_length(page);  	if (len == 0)  		return nfs_return_empty_page(page); -	new = nfs_create_request(ctx, inode, page, 0, len); +	new = nfs_create_request(ctx, page, NULL, 0, len);  	if (IS_ERR(new)) {  		unlock_page(page);  		return PTR_ERR(new); @@ -147,7 +93,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,  	if (len < PAGE_CACHE_SIZE)  		zero_user_segment(page, len, PAGE_CACHE_SIZE); -	NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); +	nfs_pageio_init_read(&pgio, inode, false, +			     &nfs_async_read_completion_ops);  	nfs_pageio_add_request(&pgio, new);  	nfs_pageio_complete(&pgio);  	NFS_I(inode)->read_io += pgio.pg_bytes_written; @@ -158,20 +105,31 @@ static 
void nfs_readpage_release(struct nfs_page *req)  {  	struct inode *d_inode = req->wb_context->dentry->d_inode; -	if (PageUptodate(req->wb_page)) -		nfs_readpage_to_fscache(d_inode, req->wb_page, 0); +	dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id, +		(unsigned long long)NFS_FILEID(d_inode), req->wb_bytes, +		(long long)req_offset(req)); -	unlock_page(req->wb_page); +	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { +		if (PageUptodate(req->wb_page)) +			nfs_readpage_to_fscache(d_inode, req->wb_page, 0); -	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", +		unlock_page(req->wb_page); +	} + +	dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",  			req->wb_context->dentry->d_inode->i_sb->s_id, -			(long long)NFS_FILEID(req->wb_context->dentry->d_inode), +			(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),  			req->wb_bytes,  			(long long)req_offset(req));  	nfs_release_request(req);  } -/* Note io was page aligned */ +static void nfs_page_group_set_uptodate(struct nfs_page *req) +{ +	if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) +		SetPageUptodate(req->wb_page); +} +  static void nfs_read_completion(struct nfs_pgio_header *hdr)  {  	unsigned long bytes = 0; @@ -181,21 +139,32 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)  	while (!list_empty(&hdr->pages)) {  		struct nfs_page *req = nfs_list_entry(hdr->pages.next);  		struct page *page = req->wb_page; +		unsigned long start = req->wb_pgbase; +		unsigned long end = req->wb_pgbase + req->wb_bytes;  		if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { -			if (bytes > hdr->good_bytes) -				zero_user(page, 0, PAGE_SIZE); -			else if (hdr->good_bytes - bytes < PAGE_SIZE) -				zero_user_segment(page, -					hdr->good_bytes & ~PAGE_MASK, -					PAGE_SIZE); +			/* note: regions of the page not covered by a +			 * request are zeroed in nfs_readpage_async / +			 * readpage_async_filler */ +			if (bytes > hdr->good_bytes) { +				/* nothing in this request was good, so zero +				 * the 
full extent of the request */ +				zero_user_segment(page, start, end); + +			} else if (hdr->good_bytes - bytes < req->wb_bytes) { +				/* part of this request has good bytes, but +				 * not all. zero the bad bytes */ +				start += hdr->good_bytes - bytes; +				WARN_ON(start < req->wb_pgbase); +				zero_user_segment(page, start, end); +			}  		}  		bytes += req->wb_bytes;  		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {  			if (bytes <= hdr->good_bytes) -				SetPageUptodate(page); +				nfs_page_group_set_uptodate(req);  		} else -			SetPageUptodate(page); +			nfs_page_group_set_uptodate(req);  		nfs_list_remove_request(req);  		nfs_readpage_release(req);  	} @@ -203,95 +172,14 @@ out:  	hdr->release(hdr);  } -int nfs_initiate_read(struct rpc_clnt *clnt, -		      struct nfs_read_data *data, -		      const struct rpc_call_ops *call_ops, int flags) +static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, +			      struct rpc_task_setup *task_setup_data, int how)  {  	struct inode *inode = data->header->inode;  	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; -	struct rpc_task *task; -	struct rpc_message msg = { -		.rpc_argp = &data->args, -		.rpc_resp = &data->res, -		.rpc_cred = data->header->cred, -	}; -	struct rpc_task_setup task_setup_data = { -		.task = &data->task, -		.rpc_client = clnt, -		.rpc_message = &msg, -		.callback_ops = call_ops, -		.callback_data = data, -		.workqueue = nfsiod_workqueue, -		.flags = RPC_TASK_ASYNC | swap_flags | flags, -	}; - -	/* Set up the initial task struct. 
*/ -	NFS_PROTO(inode)->read_setup(data, &msg); - -	dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " -			"offset %llu)\n", -			data->task.tk_pid, -			inode->i_sb->s_id, -			(long long)NFS_FILEID(inode), -			data->args.count, -			(unsigned long long)data->args.offset); - -	task = rpc_run_task(&task_setup_data); -	if (IS_ERR(task)) -		return PTR_ERR(task); -	rpc_put_task(task); -	return 0; -} -EXPORT_SYMBOL_GPL(nfs_initiate_read); -/* - * Set up the NFS read request struct - */ -static void nfs_read_rpcsetup(struct nfs_read_data *data, -		unsigned int count, unsigned int offset) -{ -	struct nfs_page *req = data->header->req; - -	data->args.fh     = NFS_FH(data->header->inode); -	data->args.offset = req_offset(req) + offset; -	data->args.pgbase = req->wb_pgbase + offset; -	data->args.pages  = data->pages.pagevec; -	data->args.count  = count; -	data->args.context = get_nfs_open_context(req->wb_context); -	data->args.lock_context = req->wb_lock_context; - -	data->res.fattr   = &data->fattr; -	data->res.count   = count; -	data->res.eof     = 0; -	nfs_fattr_init(&data->fattr); -} - -static int nfs_do_read(struct nfs_read_data *data, -		const struct rpc_call_ops *call_ops) -{ -	struct inode *inode = data->header->inode; - -	return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0); -} - -static int -nfs_do_multiple_reads(struct list_head *head, -		const struct rpc_call_ops *call_ops) -{ -	struct nfs_read_data *data; -	int ret = 0; - -	while (!list_empty(head)) { -		int ret2; - -		data = list_first_entry(head, struct nfs_read_data, list); -		list_del_init(&data->list); - -		ret2 = nfs_do_read(data, call_ops); -		if (ret == 0) -			ret = ret2; -	} -	return ret; +	task_setup_data->flags |= swap_flags; +	NFS_PROTO(inode)->read_setup(data, msg);  }  static void @@ -311,143 +199,14 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {  	.completion = nfs_read_completion,  }; -static void nfs_pagein_error(struct nfs_pageio_descriptor 
*desc, -		struct nfs_pgio_header *hdr) -{ -	set_bit(NFS_IOHDR_REDO, &hdr->flags); -	while (!list_empty(&hdr->rpc_list)) { -		struct nfs_read_data *data = list_first_entry(&hdr->rpc_list, -				struct nfs_read_data, list); -		list_del(&data->list); -		nfs_readdata_release(data); -	} -	desc->pg_completion_ops->error_cleanup(&desc->pg_list); -} - -/* - * Generate multiple requests to fill a single page. - * - * We optimize to reduce the number of read operations on the wire.  If we - * detect that we're reading a page, or an area of a page, that is past the - * end of file, we do not generate NFS read operations but just clear the - * parts of the page that would have come back zero from the server anyway. - * - * We rely on the cached value of i_size to make this determination; another - * client can fill pages on the server past our cached end-of-file, but we - * won't see the new data until our attribute cache is updated.  This is more - * or less conventional NFS client behavior. - */ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, -			    struct nfs_pgio_header *hdr) -{ -	struct nfs_page *req = hdr->req; -	struct page *page = req->wb_page; -	struct nfs_read_data *data; -	size_t rsize = desc->pg_bsize, nbytes; -	unsigned int offset; - -	offset = 0; -	nbytes = desc->pg_count; -	do { -		size_t len = min(nbytes,rsize); - -		data = nfs_readdata_alloc(hdr, 1); -		if (!data) { -			nfs_pagein_error(desc, hdr); -			return -ENOMEM; -		} -		data->pages.pagevec[0] = page; -		nfs_read_rpcsetup(data, len, offset); -		list_add(&data->list, &hdr->rpc_list); -		nbytes -= len; -		offset += len; -	} while (nbytes != 0); - -	nfs_list_remove_request(req); -	nfs_list_add_request(req, &hdr->pages); -	desc->pg_rpc_callops = &nfs_read_common_ops; -	return 0; -} - -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, -			  struct nfs_pgio_header *hdr) -{ -	struct nfs_page		*req; -	struct page		**pages; -	struct nfs_read_data    *data; -	struct list_head *head = 
&desc->pg_list; - -	data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base, -							  desc->pg_count)); -	if (!data) { -		nfs_pagein_error(desc, hdr); -		return -ENOMEM; -	} - -	pages = data->pages.pagevec; -	while (!list_empty(head)) { -		req = nfs_list_entry(head->next); -		nfs_list_remove_request(req); -		nfs_list_add_request(req, &hdr->pages); -		*pages++ = req->wb_page; -	} - -	nfs_read_rpcsetup(data, desc->pg_count, 0); -	list_add(&data->list, &hdr->rpc_list); -	desc->pg_rpc_callops = &nfs_read_common_ops; -	return 0; -} - -int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, -		       struct nfs_pgio_header *hdr) -{ -	if (desc->pg_bsize < PAGE_CACHE_SIZE) -		return nfs_pagein_multi(desc, hdr); -	return nfs_pagein_one(desc, hdr); -} -EXPORT_SYMBOL_GPL(nfs_generic_pagein); - -static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) -{ -	struct nfs_read_header *rhdr; -	struct nfs_pgio_header *hdr; -	int ret; - -	rhdr = nfs_readhdr_alloc(); -	if (!rhdr) { -		desc->pg_completion_ops->error_cleanup(&desc->pg_list); -		return -ENOMEM; -	} -	hdr = &rhdr->header; -	nfs_pgheader_init(desc, hdr, nfs_readhdr_free); -	atomic_inc(&hdr->refcnt); -	ret = nfs_generic_pagein(desc, hdr); -	if (ret == 0) -		ret = nfs_do_multiple_reads(&hdr->rpc_list, -					    desc->pg_rpc_callops); -	if (atomic_dec_and_test(&hdr->refcnt)) -		hdr->completion_ops->completion(hdr); -	return ret; -} - -static const struct nfs_pageio_ops nfs_pageio_read_ops = { -	.pg_test = nfs_generic_pg_test, -	.pg_doio = nfs_generic_pg_readpages, -}; -  /*   * This is the callback from RPC telling us whether a reply was   * received or some error occurred (timeout or socket shutdown).   
*/ -int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, +			     struct inode *inode)  { -	struct inode *inode = data->header->inode; -	int status; - -	dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, -			task->tk_status); - -	status = NFS_PROTO(inode)->read_done(task, data); +	int status = NFS_PROTO(inode)->read_done(task, data);  	if (status != 0)  		return status; @@ -460,10 +219,10 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)  	return 0;  } -static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)  { -	struct nfs_readargs *argp = &data->args; -	struct nfs_readres *resp = &data->res; +	struct nfs_pgio_args *argp = &data->args; +	struct nfs_pgio_res  *resp = &data->res;  	/* This is a short read! */  	nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); @@ -480,17 +239,11 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data  	rpc_restart_call_prepare(task);  } -static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) +static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)  { -	struct nfs_read_data *data = calldata;  	struct nfs_pgio_header *hdr = data->header; -	/* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */ -	if (nfs_readpage_result(task, data) != 0) -		return; -	if (task->tk_status < 0) -		nfs_set_pgio_error(hdr, task->tk_status, data->args.offset); -	else if (data->res.eof) { +	if (data->res.eof) {  		loff_t bound;  		bound = data->args.offset + data->res.count; @@ -505,26 +258,6 @@ static void nfs_readpage_result_common(struct rpc_task *task, void *calldata)  		nfs_readpage_retry(task, data);  } -static void nfs_readpage_release_common(void *calldata) -{ -	nfs_readdata_release(calldata); -} - -void 
nfs_read_prepare(struct rpc_task *task, void *calldata) -{ -	struct nfs_read_data *data = calldata; -	int err; -	err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); -	if (err) -		rpc_exit(task, err); -} - -static const struct rpc_call_ops nfs_read_common_ops = { -	.rpc_call_prepare = nfs_read_prepare, -	.rpc_call_done = nfs_readpage_result_common, -	.rpc_release = nfs_readpage_release_common, -}; -  /*   * Read a page over NFS.   * We read the page synchronously in the following case: @@ -592,7 +325,6 @@ static int  readpage_async_filler(void *data, struct page *page)  {  	struct nfs_readdesc *desc = (struct nfs_readdesc *)data; -	struct inode *inode = page_file_mapping(page)->host;  	struct nfs_page *new;  	unsigned int len;  	int error; @@ -601,7 +333,7 @@ readpage_async_filler(void *data, struct page *page)  	if (len == 0)  		return nfs_return_empty_page(page); -	new = nfs_create_request(desc->ctx, inode, page, 0, len); +	new = nfs_create_request(desc->ctx, page, NULL, 0, len);  	if (IS_ERR(new))  		goto out_error; @@ -630,9 +362,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,  	unsigned long npages;  	int ret = -ESTALE; -	dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", +	dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",  			inode->i_sb->s_id, -			(long long)NFS_FILEID(inode), +			(unsigned long long)NFS_FILEID(inode),  			nr_pages);  	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); @@ -654,7 +386,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,  	if (ret == 0)  		goto read_complete; /* all pages were read */ -	NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); +	nfs_pageio_init_read(&pgio, inode, false, +			     &nfs_async_read_completion_ops);  	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); @@ -671,7 +404,7 @@ out:  int __init nfs_init_readpagecache(void)  {  	nfs_rdata_cachep = kmem_cache_create("nfs_read_data", -					     sizeof(struct nfs_read_header), +	
				     sizeof(struct nfs_rw_header),  					     0, SLAB_HWCACHE_ALIGN,  					     NULL);  	if (nfs_rdata_cachep == NULL) @@ -684,3 +417,12 @@ void nfs_destroy_readpagecache(void)  {  	kmem_cache_destroy(nfs_rdata_cachep);  } + +static const struct nfs_rw_ops nfs_rw_read_ops = { +	.rw_mode		= FMODE_READ, +	.rw_alloc_header	= nfs_readhdr_alloc, +	.rw_free_header		= nfs_readhdr_free, +	.rw_done		= nfs_readpage_done, +	.rw_result		= nfs_readpage_result, +	.rw_initiate		= nfs_initiate_read, +}; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a03b9c6f948..084af1060d7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -497,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)  	static const struct {  		rpc_authflavor_t flavour;  		const char *str; -	} sec_flavours[] = { +	} sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = { +		/* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */  		{ RPC_AUTH_NULL, "null" },  		{ RPC_AUTH_UNIX, "sys" },  		{ RPC_AUTH_GSS_KRB5, "krb5" }, @@ -923,8 +924,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)  		data->mount_server.port	= NFS_UNSPEC_PORT;  		data->nfs_server.port	= NFS_UNSPEC_PORT;  		data->nfs_server.protocol = XPRT_TRANSPORT_TCP; -		data->auth_flavors[0]	= RPC_AUTH_MAXFLAVOR; -		data->auth_flavor_len	= 0; +		data->selected_flavor	= RPC_AUTH_MAXFLAVOR;  		data->minorversion	= 0;  		data->need_mount	= true;  		data->net		= current->nsproxy->net_ns; @@ -1019,14 +1019,53 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)  	}  } -static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data, -		rpc_authflavor_t pseudoflavor) +/* + * Add 'flavor' to 'auth_info' if not already present. 
+ * Returns true if 'flavor' ends up in the list, false otherwise + */ +static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, +			      rpc_authflavor_t flavor)  { -	data->auth_flavors[0] = pseudoflavor; -	data->auth_flavor_len = 1; +	unsigned int i; +	unsigned int max_flavor_len = (sizeof(auth_info->flavors) / +				       sizeof(auth_info->flavors[0])); + +	/* make sure this flavor isn't already in the list */ +	for (i = 0; i < auth_info->flavor_len; i++) { +		if (flavor == auth_info->flavors[i]) +			return true; +	} + +	if (auth_info->flavor_len + 1 >= max_flavor_len) { +		dfprintk(MOUNT, "NFS: too many sec= flavors\n"); +		return false; +	} + +	auth_info->flavors[auth_info->flavor_len++] = flavor; +	return true;  }  /* + * Return true if 'match' is in auth_info or auth_info is empty. + * Return false otherwise. + */ +bool nfs_auth_info_match(const struct nfs_auth_info *auth_info, +			 rpc_authflavor_t match) +{ +	int i; + +	if (!auth_info->flavor_len) +		return true; + +	for (i = 0; i < auth_info->flavor_len; i++) { +		if (auth_info->flavors[i] == match) +			return true; +	} +	return false; +} +EXPORT_SYMBOL_GPL(nfs_auth_info_match); + +/*   * Parse the value of the 'sec=' option.   
*/  static int nfs_parse_security_flavors(char *value, @@ -1034,49 +1073,55 @@ static int nfs_parse_security_flavors(char *value,  {  	substring_t args[MAX_OPT_ARGS];  	rpc_authflavor_t pseudoflavor; +	char *p;  	dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); -	switch (match_token(value, nfs_secflavor_tokens, args)) { -	case Opt_sec_none: -		pseudoflavor = RPC_AUTH_NULL; -		break; -	case Opt_sec_sys: -		pseudoflavor = RPC_AUTH_UNIX; -		break; -	case Opt_sec_krb5: -		pseudoflavor = RPC_AUTH_GSS_KRB5; -		break; -	case Opt_sec_krb5i: -		pseudoflavor = RPC_AUTH_GSS_KRB5I; -		break; -	case Opt_sec_krb5p: -		pseudoflavor = RPC_AUTH_GSS_KRB5P; -		break; -	case Opt_sec_lkey: -		pseudoflavor = RPC_AUTH_GSS_LKEY; -		break; -	case Opt_sec_lkeyi: -		pseudoflavor = RPC_AUTH_GSS_LKEYI; -		break; -	case Opt_sec_lkeyp: -		pseudoflavor = RPC_AUTH_GSS_LKEYP; -		break; -	case Opt_sec_spkm: -		pseudoflavor = RPC_AUTH_GSS_SPKM; -		break; -	case Opt_sec_spkmi: -		pseudoflavor = RPC_AUTH_GSS_SPKMI; -		break; -	case Opt_sec_spkmp: -		pseudoflavor = RPC_AUTH_GSS_SPKMP; -		break; -	default: -		return 0; +	while ((p = strsep(&value, ":")) != NULL) { +		switch (match_token(p, nfs_secflavor_tokens, args)) { +		case Opt_sec_none: +			pseudoflavor = RPC_AUTH_NULL; +			break; +		case Opt_sec_sys: +			pseudoflavor = RPC_AUTH_UNIX; +			break; +		case Opt_sec_krb5: +			pseudoflavor = RPC_AUTH_GSS_KRB5; +			break; +		case Opt_sec_krb5i: +			pseudoflavor = RPC_AUTH_GSS_KRB5I; +			break; +		case Opt_sec_krb5p: +			pseudoflavor = RPC_AUTH_GSS_KRB5P; +			break; +		case Opt_sec_lkey: +			pseudoflavor = RPC_AUTH_GSS_LKEY; +			break; +		case Opt_sec_lkeyi: +			pseudoflavor = RPC_AUTH_GSS_LKEYI; +			break; +		case Opt_sec_lkeyp: +			pseudoflavor = RPC_AUTH_GSS_LKEYP; +			break; +		case Opt_sec_spkm: +			pseudoflavor = RPC_AUTH_GSS_SPKM; +			break; +		case Opt_sec_spkmi: +			pseudoflavor = RPC_AUTH_GSS_SPKMI; +			break; +		case Opt_sec_spkmp: +			pseudoflavor = RPC_AUTH_GSS_SPKMP; +			break; +		
default: +			dfprintk(MOUNT, +				 "NFS: sec= option '%s' not recognized\n", p); +			return 0; +		} + +		if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor)) +			return 0;  	} -	mnt->flags |= NFS_MOUNT_SECFLAVOUR; -	nfs_set_auth_parsed_mount_data(mnt, pseudoflavor);  	return 1;  } @@ -1569,7 +1614,7 @@ static int nfs_parse_mount_options(char *raw,  		goto out_minorversion_mismatch;  	if (mnt->options & NFS_OPTION_MIGRATION && -	    mnt->version != 4 && mnt->minorversion != 0) +	    (mnt->version != 4 || mnt->minorversion != 0))  		goto out_migration_misuse;  	/* @@ -1623,12 +1668,14 @@ out_security_failure:  }  /* - * Ensure that the specified authtype in args->auth_flavors[0] is supported by - * the server. Returns 0 if it's ok, and -EACCES if not. + * Ensure that a specified authtype in args->auth_info is supported by + * the server. Returns 0 and sets args->selected_flavor if it's ok, and + * -EACCES if not.   */ -static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, +static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,  			rpc_authflavor_t *server_authlist, unsigned int count)  { +	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;  	unsigned int i;  	/* @@ -1640,17 +1687,20 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,  	 * can be used.  	 
*/  	for (i = 0; i < count; i++) { -		if (args->auth_flavors[0] == server_authlist[i] || -		    server_authlist[i] == RPC_AUTH_NULL) +		flavor = server_authlist[i]; + +		if (nfs_auth_info_match(&args->auth_info, flavor) || +		    flavor == RPC_AUTH_NULL)  			goto out;  	} -	dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n", -		args->auth_flavors[0]); +	dfprintk(MOUNT, +		 "NFS: specified auth flavors not supported by server\n");  	return -EACCES;  out: -	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); +	args->selected_flavor = flavor; +	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor);  	return 0;  } @@ -1738,9 +1788,10 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf  	 * Was a sec= authflavor specified in the options? First, verify  	 * whether the server supports it, and then just try to use it if so.  	 */ -	if (args->auth_flavor_len > 0) { -		status = nfs_verify_authflavor(args, authlist, authlist_len); -		dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); +	if (args->auth_info.flavor_len > 0) { +		status = nfs_verify_authflavors(args, authlist, authlist_len); +		dfprintk(MOUNT, "NFS: using auth flavor %u\n", +			 args->selected_flavor);  		if (status)  			return ERR_PTR(status);  		return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); @@ -1769,7 +1820,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf  			/* Fallthrough */  		}  		dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); -		nfs_set_auth_parsed_mount_data(args, flavor); +		args->selected_flavor = flavor;  		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);  		if (!IS_ERR(server))  			return server; @@ -1785,7 +1836,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf  	/* Last chance! 
Try AUTH_UNIX */  	dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); -	nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); +	args->selected_flavor = RPC_AUTH_UNIX;  	return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);  } @@ -1972,9 +2023,9 @@ static int nfs23_validate_mount_data(void *options,  		args->bsize		= data->bsize;  		if (data->flags & NFS_MOUNT_SECFLAVOUR) -			nfs_set_auth_parsed_mount_data(args, data->pseudoflavor); +			args->selected_flavor = data->pseudoflavor;  		else -			nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); +			args->selected_flavor = RPC_AUTH_UNIX;  		if (!args->nfs_server.hostname)  			goto out_nomem; @@ -2108,9 +2159,6 @@ static int nfs_validate_text_mount_data(void *options,  	nfs_set_port(sap, &args->nfs_server.port, port); -	if (args->auth_flavor_len > 1) -		goto out_bad_auth; -  	return nfs_parse_devname(dev_name,  				   &args->nfs_server.hostname,  				   max_namelen, @@ -2130,21 +2178,31 @@ out_invalid_transport_udp:  out_no_address:  	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");  	return -EINVAL; - -out_bad_auth: -	dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n"); -	return -EINVAL;  } +#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ +		| NFS_MOUNT_SECURE \ +		| NFS_MOUNT_TCP \ +		| NFS_MOUNT_VER3 \ +		| NFS_MOUNT_KERBEROS \ +		| NFS_MOUNT_NONLM \ +		| NFS_MOUNT_BROKEN_SUID \ +		| NFS_MOUNT_STRICTLOCK \ +		| NFS_MOUNT_UNSHARED \ +		| NFS_MOUNT_NORESVPORT \ +		| NFS_MOUNT_LEGACY_INTERFACE) +  static int  nfs_compare_remount_data(struct nfs_server *nfss,  			 struct nfs_parsed_mount_data *data)  { -	if (data->flags != nfss->flags || +	if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||  	    data->rsize != nfss->rsize ||  	    data->wsize != nfss->wsize || +	    data->version != nfss->nfs_client->rpc_ops->version || +	    data->minorversion != nfss->nfs_client->cl_minorversion ||  	    data->retrans != nfss->client->cl_timeout->to_retries || -	   
 data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || +	    data->selected_flavor != nfss->client->cl_auth->au_flavor ||  	    data->acregmin != nfss->acregmin / HZ ||  	    data->acregmax != nfss->acregmax / HZ ||  	    data->acdirmin != nfss->acdirmin / HZ || @@ -2169,6 +2227,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)  	struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;  	u32 nfsvers = nfss->nfs_client->rpc_ops->version; +	sync_filesystem(sb); +  	/*  	 * Userspace mount programs that send binary options generally send  	 * them populated with default values. We have no way to know which @@ -2189,7 +2249,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)  	data->rsize = nfss->rsize;  	data->wsize = nfss->wsize;  	data->retrans = nfss->client->cl_timeout->to_retries; -	nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor); +	data->selected_flavor = nfss->client->cl_auth->au_flavor; +	data->auth_info = nfss->auth_info;  	data->acregmin = nfss->acregmin / HZ;  	data->acregmax = nfss->acregmax / HZ;  	data->acdirmin = nfss->acdirmin / HZ; @@ -2197,12 +2258,15 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)  	data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;  	data->nfs_server.port = nfss->port;  	data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; +	data->version = nfsvers; +	data->minorversion = nfss->nfs_client->cl_minorversion; +	data->net = current->nsproxy->net_ns;  	memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,  		data->nfs_server.addrlen);  	/* overwrite those values with any that were specified */ -	error = nfs_parse_mount_options((char *)options, data); -	if (error < 0) +	error = -EINVAL; +	if (!nfs_parse_mount_options((char *)options, data))  		goto out;  	/* @@ -2296,18 +2360,6 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)   	nfs_initialise_sb(sb);  } -#define 
NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ -		| NFS_MOUNT_SECURE \ -		| NFS_MOUNT_TCP \ -		| NFS_MOUNT_VER3 \ -		| NFS_MOUNT_KERBEROS \ -		| NFS_MOUNT_NONLM \ -		| NFS_MOUNT_BROKEN_SUID \ -		| NFS_MOUNT_STRICTLOCK \ -		| NFS_MOUNT_UNSHARED \ -		| NFS_MOUNT_NORESVPORT \ -		| NFS_MOUNT_LEGACY_INTERFACE) -  static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)  {  	const struct nfs_server *a = s->s_fs_info; @@ -2332,7 +2384,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n  		goto Ebusy;  	if (a->acdirmax != b->acdirmax)  		goto Ebusy; -	if (b->flags & NFS_MOUNT_SECFLAVOUR && +	if (b->auth_info.flavor_len > 0 &&  	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)  		goto Ebusy;  	return 1; @@ -2530,6 +2582,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,  			mntroot = ERR_PTR(error);  			goto error_splat_bdi;  		} +		server->super = s;  	}  	if (!s->s_root) { @@ -2713,9 +2766,9 @@ static int nfs4_validate_mount_data(void *options,  					   data->auth_flavours,  					   sizeof(pseudoflavor)))  				return -EFAULT; -			nfs_set_auth_parsed_mount_data(args, pseudoflavor); +			args->selected_flavor = pseudoflavor;  		} else -			nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX); +			args->selected_flavor = RPC_AUTH_UNIX;  		c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);  		if (IS_ERR(c)) diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 6b3f2535a3e..bb6ed810fa6 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -13,7 +13,7 @@  static struct ctl_table_header *nfs_callback_sysctl_table; -static ctl_table nfs_cb_sysctls[] = { +static struct ctl_table nfs_cb_sysctls[] = {  	{  		.procname	= "nfs_mountpoint_timeout",  		.data		= &nfs_mountpoint_expiry_timeout, @@ -31,7 +31,7 @@ static ctl_table nfs_cb_sysctls[] = {  	{ }  }; -static ctl_table nfs_cb_sysctl_dir[] = { +static struct ctl_table nfs_cb_sysctl_dir[] = {  	{  		.procname = "nfs",  		.mode = 
0555, @@ -40,7 +40,7 @@ static ctl_table nfs_cb_sysctl_dir[] = {  	{ }  }; -static ctl_table nfs_cb_sysctl_root[] = { +static struct ctl_table nfs_cb_sysctl_root[] = {  	{  		.procname = "fs",  		.mode = 0555, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index bb939edd4c9..de54129336c 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -14,6 +14,7 @@  #include <linux/sched.h>  #include <linux/wait.h>  #include <linux/namei.h> +#include <linux/fsnotify.h>  #include "internal.h"  #include "nfs4_fs.h" @@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)  		return;  	} -	if (task->tk_status != 0) -		nfs_cancel_async_unlink(old_dentry); +	if (data->complete) +		data->complete(task, data);  }  /** @@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = {   *   * It's expected that valid references to the dentries and inodes are held   */ -static struct rpc_task * +struct rpc_task *  nfs_async_rename(struct inode *old_dir, struct inode *new_dir, -		 struct dentry *old_dentry, struct dentry *new_dentry) +		 struct dentry *old_dentry, struct dentry *new_dentry, +		 void (*complete)(struct rpc_task *, struct nfs_renamedata *))  {  	struct nfs_renamedata *data;  	struct rpc_message msg = { }; @@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,  	data->new_dentry = dget(new_dentry);  	nfs_fattr_init(&data->old_fattr);  	nfs_fattr_init(&data->new_fattr); +	data->complete = complete;  	/* set up nfs_renameargs */  	data->args.old_dir = NFS_FH(old_dir); @@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,  	return rpc_run_task(&task_setup_data);  } +/* + * Perform tasks needed when a sillyrename is done such as cancelling the + * queued async unlink if it failed. 
+ */ +static void +nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data) +{ +	struct dentry *dentry = data->old_dentry; + +	if (task->tk_status != 0) { +		nfs_cancel_async_unlink(dentry); +		return; +	} + +	/* +	 * vfs_unlink and the like do not issue this when a file is +	 * sillyrenamed, so do it here. +	 */ +	fsnotify_nameremove(dentry, 0); +} +  #define SILLYNAME_PREFIX ".nfs"  #define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)  #define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) @@ -493,17 +517,15 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)  	unsigned long long fileid;  	struct dentry *sdentry;  	struct rpc_task *task; -	int            error = -EIO; +	int            error = -EBUSY; -	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		d_count(dentry)); +	dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n", +		dentry, d_count(dentry));  	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);  	/*  	 * We don't allow a dentry to be silly-renamed twice.  	 
*/ -	error = -EBUSY;  	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)  		goto out; @@ -522,8 +544,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)  				SILLYNAME_FILEID_LEN, fileid,  				SILLYNAME_COUNTER_LEN, sillycounter); -		dfprintk(VFS, "NFS: trying to rename %s to %s\n", -				dentry->d_name.name, silly); +		dfprintk(VFS, "NFS: trying to rename %pd to %s\n", +				dentry, silly);  		sdentry = lookup_one_len(silly, dentry->d_parent, slen);  		/* @@ -550,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)  	}  	/* run the rename task, undo unlink if it fails */ -	task = nfs_async_rename(dir, dir, dentry, sdentry); +	task = nfs_async_rename(dir, dir, dentry, sdentry, +					nfs_complete_sillyrename);  	if (IS_ERR(task)) {  		error = -EBUSY;  		nfs_cancel_async_unlink(dentry); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ac1dc331ba3..5e2f1030454 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -42,10 +42,11 @@   * Local function declarations   */  static void nfs_redirty_request(struct nfs_page *req); -static const struct rpc_call_ops nfs_write_common_ops;  static const struct rpc_call_ops nfs_commit_ops;  static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;  static const struct nfs_commit_completion_ops nfs_commit_completion_ops; +static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req);  static struct kmem_cache *nfs_wdata_cachep;  static mempool_t *nfs_wdata_mempool; @@ -70,76 +71,19 @@ void nfs_commit_free(struct nfs_commit_data *p)  }  EXPORT_SYMBOL_GPL(nfs_commit_free); -struct nfs_write_header *nfs_writehdr_alloc(void) +static struct nfs_rw_header *nfs_writehdr_alloc(void)  { -	struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - -	if (p) { -		struct nfs_pgio_header *hdr = &p->header; +	struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); +	if (p)  		memset(p, 0, sizeof(*p)); -		INIT_LIST_HEAD(&hdr->pages); -		
INIT_LIST_HEAD(&hdr->rpc_list); -		spin_lock_init(&hdr->lock); -		atomic_set(&hdr->refcnt, 0); -		hdr->verf = &p->verf; -	}  	return p;  } -EXPORT_SYMBOL_GPL(nfs_writehdr_alloc); -static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, -						  unsigned int pagecount) +static void nfs_writehdr_free(struct nfs_rw_header *whdr)  { -	struct nfs_write_data *data, *prealloc; - -	prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data; -	if (prealloc->header == NULL) -		data = prealloc; -	else -		data = kzalloc(sizeof(*data), GFP_KERNEL); -	if (!data) -		goto out; - -	if (nfs_pgarray_set(&data->pages, pagecount)) { -		data->header = hdr; -		atomic_inc(&hdr->refcnt); -	} else { -		if (data != prealloc) -			kfree(data); -		data = NULL; -	} -out: -	return data; -} - -void nfs_writehdr_free(struct nfs_pgio_header *hdr) -{ -	struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);  	mempool_free(whdr, nfs_wdata_mempool);  } -EXPORT_SYMBOL_GPL(nfs_writehdr_free); - -void nfs_writedata_release(struct nfs_write_data *wdata) -{ -	struct nfs_pgio_header *hdr = wdata->header; -	struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header); - -	put_nfs_open_context(wdata->args.context); -	if (wdata->pages.pagevec != wdata->pages.page_array) -		kfree(wdata->pages.pagevec); -	if (wdata == &write_header->rpc_data) { -		wdata->header = NULL; -		wdata = NULL; -	} -	if (atomic_dec_and_test(&hdr->refcnt)) -		hdr->completion_ops->completion(hdr); -	/* Note: we only free the rpc_task after callbacks are done. 
-	 * See the comment in rpc_free_task() for why -	 */ -	kfree(wdata); -} -EXPORT_SYMBOL_GPL(nfs_writedata_release);  static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)  { @@ -148,8 +92,15 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)  	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);  } +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found. + */  static struct nfs_page * -nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)  {  	struct nfs_page *req = NULL; @@ -161,25 +112,33 @@ nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)  		/* Linearly search the commit list for the correct req */  		list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {  			if (freq->wb_page == page) { -				req = freq; +				req = freq->wb_head;  				break;  			}  		}  	} -	if (req) +	if (req) { +		WARN_ON_ONCE(req->wb_head != req); +  		kref_get(&req->wb_kref); +	}  	return req;  } -static struct nfs_page *nfs_page_find_request(struct page *page) +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. 
+ */ +static struct nfs_page *nfs_page_find_head_request(struct page *page)  {  	struct inode *inode = page_file_mapping(page)->host;  	struct nfs_page *req = NULL;  	spin_lock(&inode->i_lock); -	req = nfs_page_find_request_locked(NFS_I(inode), page); +	req = nfs_page_find_head_request_locked(NFS_I(inode), page);  	spin_unlock(&inode->i_lock);  	return req;  } @@ -211,18 +170,78 @@ static void nfs_set_pageerror(struct page *page)  	nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));  } +/* + * nfs_page_group_search_locked + * @head - head request of page group + * @page_offset - offset into page + * + * Search page group with head @head to find a request that contains the + * page offset @page_offset. + * + * Returns a pointer to the first matching nfs request, or NULL if no + * match is found. + * + * Must be called with the page group lock held + */ +static struct nfs_page * +nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) +{ +	struct nfs_page *req; + +	WARN_ON_ONCE(head != head->wb_head); +	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); + +	req = head; +	do { +		if (page_offset >= req->wb_pgbase && +		    page_offset < (req->wb_pgbase + req->wb_bytes)) +			return req; + +		req = req->wb_this_page; +	} while (req != head); + +	return NULL; +} + +/* + * nfs_page_group_covers_page + * @head - head request of page group + * + * Return true if the page group with head @head covers the whole page, + * returns false otherwise + */ +static bool nfs_page_group_covers_page(struct nfs_page *req) +{ +	struct nfs_page *tmp; +	unsigned int pos = 0; +	unsigned int len = nfs_page_length(req->wb_page); + +	nfs_page_group_lock(req); + +	do { +		tmp = nfs_page_group_search_locked(req->wb_head, pos); +		if (tmp) { +			/* no way this should happen */ +			WARN_ON_ONCE(tmp->wb_pgbase != pos); +			pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); +		} +	} while (tmp && pos < len); + +	nfs_page_group_unlock(req); +	
WARN_ON_ONCE(pos > len); +	return pos == len; +} +  /* We can set the PG_uptodate flag if we see that a write request   * covers the full page.   */ -static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +static void nfs_mark_uptodate(struct nfs_page *req)  { -	if (PageUptodate(page)) -		return; -	if (base != 0) +	if (PageUptodate(req->wb_page))  		return; -	if (count != nfs_page_length(page)) +	if (!nfs_page_group_covers_page(req))  		return; -	SetPageUptodate(page); +	SetPageUptodate(req->wb_page);  }  static int wb_priority(struct writeback_control *wbc) @@ -258,46 +277,259 @@ static void nfs_set_page_writeback(struct page *page)  	}  } -static void nfs_end_page_writeback(struct page *page) +static void nfs_end_page_writeback(struct nfs_page *req)  { -	struct inode *inode = page_file_mapping(page)->host; +	struct inode *inode = page_file_mapping(req->wb_page)->host;  	struct nfs_server *nfss = NFS_SERVER(inode); -	end_page_writeback(page); +	if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) +		return; + +	end_page_writeback(req->wb_page);  	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)  		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);  } -static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) + +/* nfs_page_group_clear_bits + *   @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req) +{ +	clear_bit(PG_TEARDOWN, &req->wb_flags); +	clear_bit(PG_UNLOCKPAGE, &req->wb_flags); +	clear_bit(PG_UPTODATE, &req->wb_flags); +	clear_bit(PG_WB_END, &req->wb_flags); +	clear_bit(PG_REMOVE, &req->wb_flags); +} + + +/* + * nfs_unroll_locks_and_wait -  unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head  - head request of page group, must be 
holding head lock + * @req   - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + *       and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. + */ +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, +			  struct nfs_page *req, bool nonblock) +	__releases(&inode->i_lock) +{ +	struct nfs_page *tmp; +	int ret; + +	/* relinquish all the locks successfully grabbed this run */ +	for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) +		nfs_unlock_request(tmp); + +	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + +	/* grab a ref on the request that will be waited on */ +	kref_get(&req->wb_kref); + +	nfs_page_group_unlock(head); +	spin_unlock(&inode->i_lock); + +	/* release ref from nfs_page_find_head_request_locked */ +	nfs_release_request(head); + +	if (!nonblock) +		ret = nfs_wait_on_request(req); +	else +		ret = -EAGAIN; +	nfs_release_request(req); + +	return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. + */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, +				 struct nfs_page *old_head) +{ +	while (destroy_list) { +		struct nfs_page *subreq = destroy_list; + +		destroy_list = (subreq->wb_this_page == old_head) ? 
+				   NULL : subreq->wb_this_page; + +		WARN_ON_ONCE(old_head != subreq->wb_head); + +		/* make sure old group is not used */ +		subreq->wb_head = subreq; +		subreq->wb_this_page = subreq; + +		nfs_clear_request_commit(subreq); + +		/* subreq is now totally disconnected from page group or any +		 * write / commit lists. last chance to wake any waiters */ +		nfs_unlock_request(subreq); + +		if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { +			/* release ref on old head request */ +			nfs_release_request(old_head); + +			nfs_page_group_clear_bits(subreq); + +			/* release the PG_INODE_REF reference */ +			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) +				nfs_release_request(subreq); +			else +				WARN_ON_ONCE(1); +		} else { +			WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); +			/* zombie requests have already released the last +			 * reference and were waiting on the rest of the +			 * group to complete. Since it's no longer part of a +			 * group, simply free the request */ +			nfs_page_group_clear_bits(subreq); +			nfs_free_request(subreq); +		} +	} +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + *                              a locked reference, cancelling any pending + *                              operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group.  All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. 
+ * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock)  {  	struct inode *inode = page_file_mapping(page)->host; -	struct nfs_page *req; +	struct nfs_page *head, *subreq; +	struct nfs_page *destroy_list = NULL; +	unsigned int total_bytes;  	int ret; +try_again: +	total_bytes = 0; + +	WARN_ON_ONCE(destroy_list); +  	spin_lock(&inode->i_lock); -	for (;;) { -		req = nfs_page_find_request_locked(NFS_I(inode), page); -		if (req == NULL) -			break; -		if (nfs_lock_request(req)) -			break; -		/* Note: If we hold the page lock, as is the case in nfs_writepage, -		 *	 then the call to nfs_lock_request() will always -		 *	 succeed provided that someone hasn't already marked the -		 *	 request as dirty (in which case we don't care). -		 */ + +	/* +	 * A reference is taken only on the head request which acts as a +	 * reference to the whole page group - the group will not be destroyed +	 * until the head reference is released. +	 */ +	head = nfs_page_find_head_request_locked(NFS_I(inode), page); + +	if (!head) {  		spin_unlock(&inode->i_lock); -		if (!nonblock) -			ret = nfs_wait_on_request(req); -		else -			ret = -EAGAIN; -		nfs_release_request(req); -		if (ret != 0) +		return NULL; +	} + +	/* lock each request in the page group */ +	nfs_page_group_lock(head); +	subreq = head; +	do { +		/* +		 * Subrequests are always contiguous, non overlapping +		 * and in order. If not, it's a programming error. 
+		 */ +		WARN_ON_ONCE(subreq->wb_offset != +		     (head->wb_offset + total_bytes)); + +		/* keep track of how many bytes this group covers */ +		total_bytes += subreq->wb_bytes; + +		if (!nfs_lock_request(subreq)) { +			/* releases page group bit lock and +			 * inode spin lock and all references */ +			ret = nfs_unroll_locks_and_wait(inode, head, +				subreq, nonblock); + +			if (ret == 0) +				goto try_again; +  			return ERR_PTR(ret); -		spin_lock(&inode->i_lock); +		} + +		subreq = subreq->wb_this_page; +	} while (subreq != head); + +	/* Now that all requests are locked, make sure they aren't on any list. +	 * Commit list removal accounting is done after locks are dropped */ +	subreq = head; +	do { +		nfs_list_remove_request(subreq); +		subreq = subreq->wb_this_page; +	} while (subreq != head); + +	/* unlink subrequests from head, destroy them later */ +	if (head->wb_this_page != head) { +		/* destroy list will be terminated by head */ +		destroy_list = head->wb_this_page; +		head->wb_this_page = head; + +		/* change head request to cover whole range that +		 * the former page group covered */ +		head->wb_bytes = total_bytes;  	} + +	/* +	 * prepare head request to be added to new pgio descriptor +	 */ +	nfs_page_group_clear_bits(head); + +	/* +	 * some part of the group was still on the inode list - otherwise +	 * the group wouldn't be involved in async write. +	 * grab a reference for the head request, iff it needs one. 
+	 */ +	if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) +		kref_get(&head->wb_kref); + +	nfs_page_group_unlock(head); + +	/* drop lock to clear_request_commit the head req and clean up +	 * requests on destroy list */  	spin_unlock(&inode->i_lock); -	return req; + +	nfs_destroy_unlinked_subrequests(destroy_list, head); + +	/* clean up commit list state */ +	nfs_clear_request_commit(head); + +	/* still holds ref on head from nfs_page_find_head_request_locked +	 * and still has lock on head from lock loop */ +	return head;  }  /* @@ -310,7 +542,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,  	struct nfs_page *req;  	int ret = 0; -	req = nfs_find_and_lock_request(page, nonblock); +	req = nfs_lock_and_join_requests(page, nonblock);  	if (!req)  		goto out;  	ret = PTR_ERR(req); @@ -354,10 +586,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc  	struct nfs_pageio_descriptor pgio;  	int err; -	NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, -							  page->mapping->host, -							  wb_priority(wbc), -							  &nfs_async_write_completion_ops); +	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), +				false, &nfs_async_write_completion_ops);  	err = nfs_do_writepage(page, wbc, &pgio);  	nfs_pageio_complete(&pgio);  	if (err < 0) @@ -400,12 +630,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)  	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); -	NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops); +	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, +				&nfs_async_write_completion_ops);  	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);  	nfs_pageio_complete(&pgio);  	clear_bit_unlock(NFS_INO_FLUSHING, bitlock); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(bitlock, NFS_INO_FLUSHING);  	if (err < 0) @@ -425,6 +656,8 @@ static void 
nfs_inode_add_request(struct inode *inode, struct nfs_page *req)  {  	struct nfs_inode *nfsi = NFS_I(inode); +	WARN_ON_ONCE(req->wb_this_page != req); +  	/* Lock the request! */  	nfs_lock_request(req); @@ -441,6 +674,9 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)  		set_page_private(req->wb_page, (unsigned long)req);  	}  	nfsi->npages++; +	/* this a head request for a page group - mark it as having an +	 * extra reference so sub groups can follow suit */ +	WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));  	kref_get(&req->wb_kref);  	spin_unlock(&inode->i_lock);  } @@ -452,16 +688,23 @@ static void nfs_inode_remove_request(struct nfs_page *req)  {  	struct inode *inode = req->wb_context->dentry->d_inode;  	struct nfs_inode *nfsi = NFS_I(inode); +	struct nfs_page *head; -	spin_lock(&inode->i_lock); -	if (likely(!PageSwapCache(req->wb_page))) { -		set_page_private(req->wb_page, 0); -		ClearPagePrivate(req->wb_page); -		clear_bit(PG_MAPPED, &req->wb_flags); +	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { +		head = req->wb_head; + +		spin_lock(&inode->i_lock); +		if (likely(!PageSwapCache(head->wb_page))) { +			set_page_private(head->wb_page, 0); +			ClearPagePrivate(head->wb_page); +			clear_bit(PG_MAPPED, &head->wb_flags); +		} +		nfsi->npages--; +		spin_unlock(&inode->i_lock);  	} -	nfsi->npages--; -	spin_unlock(&inode->i_lock); -	nfs_release_request(req); + +	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) +		nfs_release_request(req);  }  static void @@ -583,7 +826,7 @@ nfs_clear_request_commit(struct nfs_page *req)  }  static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data)  {  	if (data->verf.committed == NFS_DATA_SYNC)  		return data->header->lseg == NULL; @@ -614,7 +857,7 @@ nfs_clear_request_commit(struct nfs_page *req)  }  static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct 
nfs_pgio_data *data)  {  	return 0;  } @@ -645,7 +888,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)  			goto next;  		}  		if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { -			memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf)); +			memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));  			nfs_mark_request_commit(req, hdr->lseg, &cinfo);  			goto next;  		} @@ -653,7 +896,7 @@ remove_req:  		nfs_inode_remove_request(req);  next:  		nfs_unlock_request(req); -		nfs_end_page_writeback(req->wb_page); +		nfs_end_page_writeback(req);  		nfs_release_request(req);  	}  out: @@ -661,7 +904,7 @@ out:  }  #if  IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) -static unsigned long +unsigned long  nfs_reqs_to_commit(struct nfs_commit_info *cinfo)  {  	return cinfo->mds->ncommit; @@ -718,7 +961,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,  }  #else -static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)  {  	return 0;  } @@ -754,10 +997,14 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,  	spin_lock(&inode->i_lock);  	for (;;) { -		req = nfs_page_find_request_locked(NFS_I(inode), page); +		req = nfs_page_find_head_request_locked(NFS_I(inode), page);  		if (req == NULL)  			goto out_unlock; +		/* should be handled by nfs_flush_incompatible */ +		WARN_ON_ONCE(req->wb_head != req); +		WARN_ON_ONCE(req->wb_this_page != req); +  		rqend = req->wb_offset + req->wb_bytes;  		/*  		 * Tell the caller to flush out the request if @@ -819,7 +1066,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,  	req = nfs_try_to_update_request(inode, page, offset, bytes);  	if (req != NULL)  		goto out; -	req = nfs_create_request(ctx, inode, page, offset, bytes); +	req = nfs_create_request(ctx, page, NULL, offset, bytes);  	if (IS_ERR(req))  		goto out;  	nfs_inode_add_request(inode, req); @@ 
-837,7 +1084,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,  		return PTR_ERR(req);  	/* Update file length */  	nfs_grow_file(page, offset, count); -	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); +	nfs_mark_uptodate(req);  	nfs_mark_request_dirty(req);  	nfs_unlock_and_release_request(req);  	return 0; @@ -858,11 +1105,13 @@ int nfs_flush_incompatible(struct file *file, struct page *page)  	 * dropped page.  	 */  	do { -		req = nfs_page_find_request(page); +		req = nfs_page_find_head_request(page);  		if (req == NULL)  			return 0;  		l_ctx = req->wb_lock_context;  		do_flush = req->wb_page != page || req->wb_context != ctx; +		/* for now, flush if more than 1 request in page_group */ +		do_flush |= req->wb_this_page != req;  		if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {  			do_flush |= l_ctx->lockowner.l_owner != current->files  				|| l_ctx->lockowner.l_pid != current->tgid; @@ -909,11 +1158,18 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)   */  static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)  { +	struct nfs_inode *nfsi = NFS_I(inode); +  	if (nfs_have_delegated_attributes(inode))  		goto out; -	if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) +	if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) +		return false; +	smp_rmb(); +	if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))  		return false;  out: +	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) +		return false;  	return PageUptodate(page) != 0;  } @@ -922,19 +1178,20 @@ out:   * extend the write to cover the entire page in order to avoid fragmentation   * inefficiencies.   * - * If the file is opened for synchronous writes or if we have a write delegation - * from the server then we can just skip the rest of the checks. + * If the file is opened for synchronous writes then we can just skip the rest + * of the checks.   
*/  static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)  {  	if (file->f_flags & O_DSYNC)  		return 0; +	if (!nfs_write_pageuptodate(page, inode)) +		return 0;  	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))  		return 1; -	if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL || -			(inode->i_flock->fl_start == 0 && +	if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&  			inode->i_flock->fl_end == OFFSET_MAX && -			inode->i_flock->fl_type != F_RDLCK))) +			inode->i_flock->fl_type != F_RDLCK))  		return 1;  	return 0;  } @@ -954,10 +1211,8 @@ int nfs_updatepage(struct file *file, struct page *page,  	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); -	dprintk("NFS:       nfs_updatepage(%s/%s %d@%lld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, count, -		(long long)(page_file_offset(page) + offset)); +	dprintk("NFS:       nfs_updatepage(%pD2 %d@%lld)\n", +		file, count, (long long)(page_file_offset(page) + offset));  	if (nfs_can_extend_write(file, page, inode)) {  		count = max(count + offset, nfs_page_length(page)); @@ -986,126 +1241,17 @@ static int flush_task_priority(int how)  	return RPC_PRIORITY_NORMAL;  } -int nfs_initiate_write(struct rpc_clnt *clnt, -		       struct nfs_write_data *data, -		       const struct rpc_call_ops *call_ops, -		       int how, int flags) +static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, +			       struct rpc_task_setup *task_setup_data, int how)  {  	struct inode *inode = data->header->inode;  	int priority = flush_task_priority(how); -	struct rpc_task *task; -	struct rpc_message msg = { -		.rpc_argp = &data->args, -		.rpc_resp = &data->res, -		.rpc_cred = data->header->cred, -	}; -	struct rpc_task_setup task_setup_data = { -		.rpc_client = clnt, -		.task = &data->task, -		.rpc_message = &msg, -		.callback_ops = call_ops, -		.callback_data = data, -		.workqueue = nfsiod_workqueue, -		
.flags = RPC_TASK_ASYNC | flags, -		.priority = priority, -	}; -	int ret = 0; - -	/* Set up the initial task struct.  */ -	NFS_PROTO(inode)->write_setup(data, &msg); -	dprintk("NFS: %5u initiated write call " -		"(req %s/%lld, %u bytes @ offset %llu)\n", -		data->task.tk_pid, -		inode->i_sb->s_id, -		(long long)NFS_FILEID(inode), -		data->args.count, -		(unsigned long long)data->args.offset); +	task_setup_data->priority = priority; +	NFS_PROTO(inode)->write_setup(data, msg);  	nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, -				 &task_setup_data.rpc_client, &msg, data); - -	task = rpc_run_task(&task_setup_data); -	if (IS_ERR(task)) { -		ret = PTR_ERR(task); -		goto out; -	} -	if (how & FLUSH_SYNC) { -		ret = rpc_wait_for_completion_task(task); -		if (ret == 0) -			ret = task->tk_status; -	} -	rpc_put_task(task); -out: -	return ret; -} -EXPORT_SYMBOL_GPL(nfs_initiate_write); - -/* - * Set up the argument/result storage required for the RPC call. - */ -static void nfs_write_rpcsetup(struct nfs_write_data *data, -		unsigned int count, unsigned int offset, -		int how, struct nfs_commit_info *cinfo) -{ -	struct nfs_page *req = data->header->req; - -	/* Set up the RPC argument and reply structs -	 * NB: take care not to mess about with data->commit et al. 
*/ - -	data->args.fh     = NFS_FH(data->header->inode); -	data->args.offset = req_offset(req) + offset; -	/* pnfs_set_layoutcommit needs this */ -	data->mds_offset = data->args.offset; -	data->args.pgbase = req->wb_pgbase + offset; -	data->args.pages  = data->pages.pagevec; -	data->args.count  = count; -	data->args.context = get_nfs_open_context(req->wb_context); -	data->args.lock_context = req->wb_lock_context; -	data->args.stable  = NFS_UNSTABLE; -	switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { -	case 0: -		break; -	case FLUSH_COND_STABLE: -		if (nfs_reqs_to_commit(cinfo)) -			break; -	default: -		data->args.stable = NFS_FILE_SYNC; -	} - -	data->res.fattr   = &data->fattr; -	data->res.count   = count; -	data->res.verf    = &data->verf; -	nfs_fattr_init(&data->fattr); -} - -static int nfs_do_write(struct nfs_write_data *data, -		const struct rpc_call_ops *call_ops, -		int how) -{ -	struct inode *inode = data->header->inode; - -	return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0); -} - -static int nfs_do_multiple_writes(struct list_head *head, -		const struct rpc_call_ops *call_ops, -		int how) -{ -	struct nfs_write_data *data; -	int ret = 0; - -	while (!list_empty(head)) { -		int ret2; - -		data = list_first_entry(head, struct nfs_write_data, list); -		list_del_init(&data->list); -		 -		ret2 = nfs_do_write(data, call_ops, how); -		 if (ret == 0) -			 ret = ret2; -	} -	return ret; +				 &task_setup_data->rpc_client, msg, data);  }  /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -1116,7 +1262,7 @@ static void nfs_redirty_request(struct nfs_page *req)  {  	nfs_mark_request_dirty(req);  	nfs_unlock_request(req); -	nfs_end_page_writeback(req->wb_page); +	nfs_end_page_writeback(req);  	nfs_release_request(req);  } @@ -1136,173 +1282,30 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {  	.completion = nfs_write_completion,  }; -static void nfs_flush_error(struct nfs_pageio_descriptor 
*desc, -		struct nfs_pgio_header *hdr) -{ -	set_bit(NFS_IOHDR_REDO, &hdr->flags); -	while (!list_empty(&hdr->rpc_list)) { -		struct nfs_write_data *data = list_first_entry(&hdr->rpc_list, -				struct nfs_write_data, list); -		list_del(&data->list); -		nfs_writedata_release(data); -	} -	desc->pg_completion_ops->error_cleanup(&desc->pg_list); -} - -/* - * Generate multiple small requests to write out a single - * contiguous dirty area on one page. - */ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, -			   struct nfs_pgio_header *hdr) -{ -	struct nfs_page *req = hdr->req; -	struct page *page = req->wb_page; -	struct nfs_write_data *data; -	size_t wsize = desc->pg_bsize, nbytes; -	unsigned int offset; -	int requests = 0; -	struct nfs_commit_info cinfo; - -	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); - -	if ((desc->pg_ioflags & FLUSH_COND_STABLE) && -	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) || -	     desc->pg_count > wsize)) -		desc->pg_ioflags &= ~FLUSH_COND_STABLE; - - -	offset = 0; -	nbytes = desc->pg_count; -	do { -		size_t len = min(nbytes, wsize); - -		data = nfs_writedata_alloc(hdr, 1); -		if (!data) { -			nfs_flush_error(desc, hdr); -			return -ENOMEM; -		} -		data->pages.pagevec[0] = page; -		nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo); -		list_add(&data->list, &hdr->rpc_list); -		requests++; -		nbytes -= len; -		offset += len; -	} while (nbytes != 0); -	nfs_list_remove_request(req); -	nfs_list_add_request(req, &hdr->pages); -	desc->pg_rpc_callops = &nfs_write_common_ops; -	return 0; -} - -/* - * Create an RPC task for the given write request and kick it. - * The page must have been locked by the caller. - * - * It may happen that the page we're passed is not marked dirty. - * This is the case if nfs_updatepage detects a conflicting request - * that has been written but not committed. 
- */ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc, -			 struct nfs_pgio_header *hdr) -{ -	struct nfs_page		*req; -	struct page		**pages; -	struct nfs_write_data	*data; -	struct list_head *head = &desc->pg_list; -	struct nfs_commit_info cinfo; - -	data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base, -							   desc->pg_count)); -	if (!data) { -		nfs_flush_error(desc, hdr); -		return -ENOMEM; -	} - -	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); -	pages = data->pages.pagevec; -	while (!list_empty(head)) { -		req = nfs_list_entry(head->next); -		nfs_list_remove_request(req); -		nfs_list_add_request(req, &hdr->pages); -		*pages++ = req->wb_page; -	} - -	if ((desc->pg_ioflags & FLUSH_COND_STABLE) && -	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) -		desc->pg_ioflags &= ~FLUSH_COND_STABLE; - -	/* Set up the argument struct */ -	nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); -	list_add(&data->list, &hdr->rpc_list); -	desc->pg_rpc_callops = &nfs_write_common_ops; -	return 0; -} - -int nfs_generic_flush(struct nfs_pageio_descriptor *desc, -		      struct nfs_pgio_header *hdr) -{ -	if (desc->pg_bsize < PAGE_CACHE_SIZE) -		return nfs_flush_multi(desc, hdr); -	return nfs_flush_one(desc, hdr); -} -EXPORT_SYMBOL_GPL(nfs_generic_flush); - -static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) -{ -	struct nfs_write_header *whdr; -	struct nfs_pgio_header *hdr; -	int ret; - -	whdr = nfs_writehdr_alloc(); -	if (!whdr) { -		desc->pg_completion_ops->error_cleanup(&desc->pg_list); -		return -ENOMEM; -	} -	hdr = &whdr->header; -	nfs_pgheader_init(desc, hdr, nfs_writehdr_free); -	atomic_inc(&hdr->refcnt); -	ret = nfs_generic_flush(desc, hdr); -	if (ret == 0) -		ret = nfs_do_multiple_writes(&hdr->rpc_list, -					     desc->pg_rpc_callops, -					     desc->pg_ioflags); -	if (atomic_dec_and_test(&hdr->refcnt)) -		hdr->completion_ops->completion(hdr); -	return ret; -} - -static const struct nfs_pageio_ops 
nfs_pageio_write_ops = { -	.pg_test = nfs_generic_pg_test, -	.pg_doio = nfs_generic_pg_writepages, -}; -  void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, -			       struct inode *inode, int ioflags, +			       struct inode *inode, int ioflags, bool force_mds,  			       const struct nfs_pgio_completion_ops *compl_ops)  { -	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops, -				NFS_SERVER(inode)->wsize, ioflags); +	struct nfs_server *server = NFS_SERVER(inode); +	const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 +	if (server->pnfs_curr_ld && !force_mds) +		pg_ops = server->pnfs_curr_ld->pg_write_ops; +#endif +	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, +			server->wsize, ioflags);  }  EXPORT_SYMBOL_GPL(nfs_pageio_init_write);  void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)  { -	pgio->pg_ops = &nfs_pageio_write_ops; +	pgio->pg_ops = &nfs_pgio_rw_ops;  	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;  }  EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -void nfs_write_prepare(struct rpc_task *task, void *calldata) -{ -	struct nfs_write_data *data = calldata; -	int err; -	err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); -	if (err) -		rpc_exit(task, err); -} -  void nfs_commit_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs_commit_data *data = calldata; @@ -1310,23 +1313,8 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)  	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);  } -/* - * Handle a write reply that flushes a whole page. - * - * FIXME: There is an inherent race with invalidate_inode_pages and - *	  writebacks since the page->count is kept > 1 for as long - *	  as the page has a write request pending. 
- */ -static void nfs_writeback_done_common(struct rpc_task *task, void *calldata) -{ -	struct nfs_write_data	*data = calldata; - -	nfs_writeback_done(task, data); -} - -static void nfs_writeback_release_common(void *calldata) +static void nfs_writeback_release_common(struct nfs_pgio_data *data)  { -	struct nfs_write_data	*data = calldata;  	struct nfs_pgio_header *hdr = data->header;  	int status = data->task.tk_status; @@ -1335,34 +1323,46 @@ static void nfs_writeback_release_common(void *calldata)  		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))  			; /* Do nothing */  		else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) -			memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); -		else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) +			memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf)); +		else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))  			set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);  		spin_unlock(&hdr->lock);  	} -	nfs_writedata_release(data);  } -static const struct rpc_call_ops nfs_write_common_ops = { -	.rpc_call_prepare = nfs_write_prepare, -	.rpc_call_done = nfs_writeback_done_common, -	.rpc_release = nfs_writeback_release_common, -}; +/* + * Special version of should_remove_suid() that ignores capabilities. + */ +static int nfs_should_remove_suid(const struct inode *inode) +{ +	umode_t mode = inode->i_mode; +	int kill = 0; +	/* suid always must be killed */ +	if (unlikely(mode & S_ISUID)) +		kill = ATTR_KILL_SUID; + +	/* +	 * sgid without any exec bits is just a mandatory locking mark; leave +	 * it alone.  If some exec bits are set, it's a real sgid; kill it. +	 */ +	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) +		kill |= ATTR_KILL_SGID; + +	if (unlikely(kill && S_ISREG(mode))) +		return kill; + +	return 0; +}  /*   * This function is called when the WRITE call is complete.   
*/ -void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, +			      struct inode *inode)  { -	struct nfs_writeargs	*argp = &data->args; -	struct nfs_writeres	*resp = &data->res; -	struct inode		*inode = data->header->inode;  	int status; -	dprintk("NFS: %5u nfs_writeback_done (status %d)\n", -		task->tk_pid, task->tk_status); -  	/*  	 * ->write_done will attempt to use post-op attributes to detect  	 * conflicting writes by other clients.  A strict interpretation @@ -1372,11 +1372,11 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)  	 */  	status = NFS_PROTO(inode)->write_done(task, data);  	if (status != 0) -		return; -	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count); +		return status; +	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);  #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) -	if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +	if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {  		/* We tried a write call, but the server did not  		 * commit data to stable storage even though we  		 * requested it. @@ -1392,18 +1392,31 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)  			dprintk("NFS:       faulty NFS server %s:"  				" (committed = %d) != (stable = %d)\n",  				NFS_SERVER(inode)->nfs_client->cl_hostname, -				resp->verf->committed, argp->stable); +				data->res.verf->committed, data->args.stable);  			complain = jiffies + 300 * HZ;  		}  	}  #endif -	if (task->tk_status < 0) -		nfs_set_pgio_error(data->header, task->tk_status, argp->offset); -	else if (resp->count < argp->count) { + +	/* Deal with the suid/sgid bit corner case */ +	if (nfs_should_remove_suid(inode)) +		nfs_mark_for_revalidate(inode); +	return 0; +} + +/* + * This function is called when the WRITE call is complete. 
+ */ +static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) +{ +	struct nfs_pgio_args	*argp = &data->args; +	struct nfs_pgio_res	*resp = &data->res; + +	if (resp->count < argp->count) {  		static unsigned long    complain;  		/* This a short write! */ -		nfs_inc_stats(inode, NFSIOS_SHORTWRITE); +		nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);  		/* Has the server at least made some progress? */  		if (resp->count == 0) { @@ -1454,7 +1467,7 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)  static void nfs_commit_clear_lock(struct nfs_inode *nfsi)  {  	clear_bit(NFS_INO_COMMIT, &nfsi->flags); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);  } @@ -1608,9 +1621,9 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)  		nfs_list_remove_request(req);  		nfs_clear_page_commit(req->wb_page); -		dprintk("NFS:       commit (%s/%lld %d@%lld)", +		dprintk("NFS:       commit (%s/%llu %d@%lld)",  			req->wb_context->dentry->d_sb->s_id, -			(long long)NFS_FILEID(req->wb_context->dentry->d_inode), +			(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),  			req->wb_bytes,  			(long long)req_offset(req));  		if (status < 0) { @@ -1784,27 +1797,28 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)  	struct nfs_page *req;  	int ret = 0; -	for (;;) { -		wait_on_page_writeback(page); -		req = nfs_page_find_request(page); -		if (req == NULL) -			break; -		if (nfs_lock_request(req)) { -			nfs_clear_request_commit(req); -			nfs_inode_remove_request(req); -			/* -			 * In case nfs_inode_remove_request has marked the -			 * page as being dirty -			 */ -			cancel_dirty_page(page, PAGE_CACHE_SIZE); -			nfs_unlock_and_release_request(req); -			break; -		} -		ret = nfs_wait_on_request(req); -		nfs_release_request(req); -		if (ret < 0) -			break; +	wait_on_page_writeback(page); + +	/* blocking call to cancel all requests and join to a 
single (head) +	 * request */ +	req = nfs_lock_and_join_requests(page, false); + +	if (IS_ERR(req)) { +		ret = PTR_ERR(req); +	} else if (req) { +		/* all requests from this page have been cancelled by +		 * nfs_lock_and_join_requests, so just remove the head +		 * request from the inode / page_private pointer and +		 * release it */ +		nfs_inode_remove_request(req); +		/* +		 * In case nfs_inode_remove_request has marked the +		 * page as being dirty +		 */ +		cancel_dirty_page(page, PAGE_CACHE_SIZE); +		nfs_unlock_and_release_request(req);  	} +  	return ret;  } @@ -1870,7 +1884,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,  int __init nfs_init_writepagecache(void)  {  	nfs_wdata_cachep = kmem_cache_create("nfs_write_data", -					     sizeof(struct nfs_write_header), +					     sizeof(struct nfs_rw_header),  					     0, SLAB_HWCACHE_ALIGN,  					     NULL);  	if (nfs_wdata_cachep == NULL) @@ -1932,3 +1946,12 @@ void nfs_destroy_writepagecache(void)  	kmem_cache_destroy(nfs_wdata_cachep);  } +static const struct nfs_rw_ops nfs_rw_write_ops = { +	.rw_mode		= FMODE_WRITE, +	.rw_alloc_header	= nfs_writehdr_alloc, +	.rw_free_header		= nfs_writehdr_free, +	.rw_release		= nfs_writeback_release_common, +	.rw_done		= nfs_writeback_done, +	.rw_result		= nfs_writeback_result, +	.rw_initiate		= nfs_initiate_write, +};  | 
