diff options
Diffstat (limited to 'fs/nfs')
79 files changed, 35061 insertions, 13038 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index ba306658a6d..3dece03f2fc 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -29,9 +29,20 @@ config NFS_FS  	  If unsure, say N. +config NFS_V2 +	tristate "NFS client support for NFS version 2" +	depends on NFS_FS +	default y +	help +	  This option enables support for version 2 of the NFS protocol +	  (RFC 1094) in the kernel's NFS client. + +	  If unsure, say Y. +  config NFS_V3 -	bool "NFS client support for NFS version 3" +	tristate "NFS client support for NFS version 3"  	depends on NFS_FS +	default y  	help  	  This option enables support for version 3 of the NFS protocol  	  (RFC 1813) in the kernel's NFS client. @@ -61,9 +72,10 @@ config NFS_V3_ACL  	  If unsure, say N.  config NFS_V4 -	bool "NFS client support for NFS version 4" +	tristate "NFS client support for NFS version 4"  	depends on NFS_FS  	select SUNRPC_GSS +	select KEYS  	help  	  This option enables support for version 4 of the NFS protocol  	  (RFC 3530) in the kernel's NFS client. @@ -74,18 +86,75 @@ config NFS_V4  	  If unsure, say Y. +config NFS_SWAP +	bool "Provide swap over NFS support" +	default n +	depends on NFS_FS +	select SUNRPC_SWAP +	help +	  This option enables swapon to work on files located on NFS mounts. +  config NFS_V4_1 -	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" -	depends on NFS_FS && NFS_V4 && EXPERIMENTAL -	select PNFS_FILE_LAYOUT +	bool "NFS client support for NFSv4.1" +	depends on NFS_V4 +	select SUNRPC_BACKCHANNEL  	help  	  This option enables support for minor version 1 of the NFSv4 protocol  	  (RFC 5661) in the kernel's NFS client.  	  If unsure, say N. +config NFS_V4_2 +	bool "NFS client support for NFSv4.2" +	depends on NFS_V4_1 +	help +	  This option enables support for minor version 2 of the NFSv4 protocol +	  in the kernel's NFS client. + +	  If unsure, say N. 
+  config PNFS_FILE_LAYOUT  	tristate +	depends on NFS_V4_1 +	default NFS_V4 + +config PNFS_BLOCK +	tristate +	depends on NFS_V4_1 && BLK_DEV_DM +	default NFS_V4 + +config PNFS_OBJLAYOUT +	tristate +	depends on NFS_V4_1 && SCSI_OSD_ULD +	default NFS_V4 + +config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN +	string "NFSv4.1 Implementation ID Domain" +	depends on NFS_V4_1 +	default "kernel.org" +	help +	  This option defines the domain portion of the implementation ID that +	  may be sent in the NFS exchange_id operation.  The value must be in +	  the format of a DNS domain name and should be set to the DNS domain +	  name of the distribution. +	  If the NFS client is unchanged from the upstream kernel, this +	  option should be set to the default "kernel.org". + +config NFS_V4_1_MIGRATION +	bool "NFSv4.1 client support for migration" +	depends on NFS_V4_1 +	default n +	help +	  This option makes the NFS client advertise to NFSv4.1 servers that +          it can support NFSv4 migration. + +          The NFSv4.1 pieces of the Linux NFSv4 migration implementation are +          still experimental.  If you are not an NFSv4 developer, say N here. + +config NFS_V4_SECURITY_LABEL +	bool +	depends on NFS_V4_2 && SECURITY +	default y  config ROOT_NFS  	bool "Root file system on NFS" @@ -119,16 +188,10 @@ config NFS_USE_KERNEL_DNS  	bool  	depends on NFS_V4 && !NFS_USE_LEGACY_DNS  	select DNS_RESOLVER -	select KEYS  	default y -config NFS_USE_NEW_IDMAPPER -	bool "Use the new idmapper upcall routine" -	depends on NFS_V4 && KEYS -	help -	  Say Y here if you want NFS to use the new idmapper upcall functions. -	  You will need /sbin/request-key (usually provided by the keyutils -	  package).  For details, read -	  <file:Documentation/filesystems/nfs/idmapper.txt>. - -	  If you are unsure, say N. 
+config NFS_DEBUG +	bool +	depends on NFS_FS && SUNRPC_DEBUG +	select CRC32 +	default y diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 4776ff9e381..4782e0840dc 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -4,20 +4,31 @@  obj-$(CONFIG_NFS_FS) += nfs.o -nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ -			   direct.o pagelist.o proc.o read.o symlink.o unlink.o \ -			   write.o namespace.o mount_clnt.o \ -			   dns_resolve.o cache_lib.o +CFLAGS_nfstrace.o += -I$(src) +nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o \ +			   direct.o pagelist.o read.o symlink.o unlink.o \ +			   write.o namespace.o mount_clnt.o nfstrace.o  nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o -nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o -nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o -nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ -			   delegation.o idmap.o \ -			   callback.o callback_xdr.o callback_proc.o \ -			   nfs4namespace.o -nfs-$(CONFIG_NFS_V4_1)	+= pnfs.o -nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_SYSCTL)	+= sysctl.o  nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o -obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o -nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o +obj-$(CONFIG_NFS_V2) += nfsv2.o +nfsv2-y := nfs2super.o proc.o nfs2xdr.o + +obj-$(CONFIG_NFS_V3) += nfsv3.o +nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o +nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o + +obj-$(CONFIG_NFS_V4) += nfsv4.o +CFLAGS_nfs4trace.o += -I$(src) +nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ +	  delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ +	  nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ +	  dns_resolve.o nfs4trace.o +nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o +nfsv4-$(CONFIG_SYSCTL)	+= nfs4sysctl.o +nfsv4-$(CONFIG_NFS_V4_1)	+= pnfs.o pnfs_dev.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ +obj-$(CONFIG_PNFS_OBJLAYOUT) 
+= objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile new file mode 100644 index 00000000000..d5815505c02 --- /dev/null +++ b/fs/nfs/blocklayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS block layout driver kernel module +# +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o +blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c new file mode 100644 index 00000000000..9b431f44fad --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.c @@ -0,0 +1,1458 @@ +/* + *  linux/fs/nfs/blocklayout/blocklayout.c + * + *  Module for the NFSv4.1 pNFS block layout driver. + * + *  Copyright (c) 2006 The Regents of the University of Michigan. + *  All rights reserved. + * + *  Andy Adamson <andros@citi.umich.edu> + *  Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  
the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/bio.h>		/* struct bio */ +#include <linux/buffer_head.h>	/* various write calls */ +#include <linux/prefetch.h> +#include <linux/pagevec.h> + +#include "../pnfs.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "blocklayout.h" + +#define NFSDBG_FACILITY	NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); +MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); + +static void print_page(struct page *page) +{ +	dprintk("PRINTPAGE page %p\n", page); +	dprintk("	PagePrivate %d\n", PagePrivate(page)); +	dprintk("	PageUptodate %d\n", PageUptodate(page)); +	dprintk("	PageError %d\n", PageError(page)); +	dprintk("	PageDirty %d\n", PageDirty(page)); +	dprintk("	PageReferenced %d\n", PageReferenced(page)); +	dprintk("	PageLocked %d\n", PageLocked(page)); +	dprintk("	PageWriteback %d\n", PageWriteback(page)); +	dprintk("	PageMappedToDisk %d\n", PageMappedToDisk(page)); +	dprintk("\n"); +} + +/* Given the be associated with isect, determine if page data needs to be + * initialized. + */ +static int is_hole(struct pnfs_block_extent *be, sector_t isect) +{ +	if (be->be_state == PNFS_BLOCK_NONE_DATA) +		return 1; +	else if (be->be_state != PNFS_BLOCK_INVALID_DATA) +		return 0; +	else +		return !bl_is_sector_init(be->be_inval, isect); +} + +/* Given the be associated with isect, determine if page data can be + * written to disk. 
+ */ +static int is_writable(struct pnfs_block_extent *be, sector_t isect) +{ +	return (be->be_state == PNFS_BLOCK_READWRITE_DATA || +		be->be_state == PNFS_BLOCK_INVALID_DATA); +} + +/* The data we are handed might be spread across several bios.  We need + * to track when the last one is finished. + */ +struct parallel_io { +	struct kref refcnt; +	void (*pnfs_callback) (void *data, int num_se); +	void *data; +	int bse_count; +}; + +static inline struct parallel_io *alloc_parallel(void *data) +{ +	struct parallel_io *rv; + +	rv  = kmalloc(sizeof(*rv), GFP_NOFS); +	if (rv) { +		rv->data = data; +		kref_init(&rv->refcnt); +		rv->bse_count = 0; +	} +	return rv; +} + +static inline void get_parallel(struct parallel_io *p) +{ +	kref_get(&p->refcnt); +} + +static void destroy_parallel(struct kref *kref) +{ +	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); + +	dprintk("%s enter\n", __func__); +	p->pnfs_callback(p->data, p->bse_count); +	kfree(p); +} + +static inline void put_parallel(struct parallel_io *p) +{ +	kref_put(&p->refcnt, destroy_parallel); +} + +static struct bio * +bl_submit_bio(int rw, struct bio *bio) +{ +	if (bio) { +		get_parallel(bio->bi_private); +		dprintk("%s submitting %s bio %u@%llu\n", __func__, +			rw == READ ? 
"read" : "write", bio->bi_iter.bi_size, +			(unsigned long long)bio->bi_iter.bi_sector); +		submit_bio(rw, bio); +	} +	return NULL; +} + +static struct bio *bl_alloc_init_bio(int npg, sector_t isect, +				     struct pnfs_block_extent *be, +				     void (*end_io)(struct bio *, int err), +				     struct parallel_io *par) +{ +	struct bio *bio; + +	npg = min(npg, BIO_MAX_PAGES); +	bio = bio_alloc(GFP_NOIO, npg); +	if (!bio && (current->flags & PF_MEMALLOC)) { +		while (!bio && (npg /= 2)) +			bio = bio_alloc(GFP_NOIO, npg); +	} + +	if (bio) { +		bio->bi_iter.bi_sector = isect - be->be_f_offset + +			be->be_v_offset; +		bio->bi_bdev = be->be_mdev; +		bio->bi_end_io = end_io; +		bio->bi_private = par; +	} +	return bio; +} + +static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, +				      sector_t isect, struct page *page, +				      struct pnfs_block_extent *be, +				      void (*end_io)(struct bio *, int err), +				      struct parallel_io *par, +				      unsigned int offset, int len) +{ +	isect = isect + (offset >> SECTOR_SHIFT); +	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, +		npg, rw, (unsigned long long)isect, offset, len); +retry: +	if (!bio) { +		bio = bl_alloc_init_bio(npg, isect, be, end_io, par); +		if (!bio) +			return ERR_PTR(-ENOMEM); +	} +	if (bio_add_page(bio, page, len, offset) < len) { +		bio = bl_submit_bio(rw, bio); +		goto retry; +	} +	return bio; +} + +static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, +				      sector_t isect, struct page *page, +				      struct pnfs_block_extent *be, +				      void (*end_io)(struct bio *, int err), +				      struct parallel_io *par) +{ +	return do_add_page_to_bio(bio, npg, rw, isect, page, be, +				  end_io, par, 0, PAGE_CACHE_SIZE); +} + +/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_read(struct bio *bio, int err) +{ +	struct parallel_io *par = bio->bi_private; +	struct bio_vec *bvec; +	int i; + +	if (!err) +		
bio_for_each_segment_all(bvec, bio, i) +			SetPageUptodate(bvec->bv_page); + +	if (err) { +		struct nfs_pgio_data *rdata = par->data; +		struct nfs_pgio_header *header = rdata->header; + +		if (!header->pnfs_error) +			header->pnfs_error = -EIO; +		pnfs_set_lo_fail(header->lseg); +	} +	bio_put(bio); +	put_parallel(par); +} + +static void bl_read_cleanup(struct work_struct *work) +{ +	struct rpc_task *task; +	struct nfs_pgio_data *rdata; +	dprintk("%s enter\n", __func__); +	task = container_of(work, struct rpc_task, u.tk_work); +	rdata = container_of(task, struct nfs_pgio_data, task); +	pnfs_ld_read_done(rdata); +} + +static void +bl_end_par_io_read(void *data, int unused) +{ +	struct nfs_pgio_data *rdata = data; + +	rdata->task.tk_status = rdata->header->pnfs_error; +	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); +	schedule_work(&rdata->task.u.tk_work); +} + +static enum pnfs_try_status +bl_read_pagelist(struct nfs_pgio_data *rdata) +{ +	struct nfs_pgio_header *header = rdata->header; +	int i, hole; +	struct bio *bio = NULL; +	struct pnfs_block_extent *be = NULL, *cow_read = NULL; +	sector_t isect, extent_length = 0; +	struct parallel_io *par; +	loff_t f_offset = rdata->args.offset; +	size_t bytes_left = rdata->args.count; +	unsigned int pg_offset, pg_len; +	struct page **pages = rdata->args.pages; +	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; +	const bool is_dio = (header->dreq != NULL); + +	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, +	       rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); + +	par = alloc_parallel(rdata); +	if (!par) +		goto use_mds; +	par->pnfs_callback = bl_end_par_io_read; +	/* At this point, we can no longer jump to use_mds */ + +	isect = (sector_t) (f_offset >> SECTOR_SHIFT); +	/* Code assumes extents are page-aligned */ +	for (i = pg_index; i < rdata->pages.npages; i++) { +		if (!extent_length) { +			/* We've used up the previous extent */ +			bl_put_extent(be); +			
bl_put_extent(cow_read); +			bio = bl_submit_bio(READ, bio); +			/* Get the next one */ +			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), +					     isect, &cow_read); +			if (!be) { +				header->pnfs_error = -EIO; +				goto out; +			} +			extent_length = be->be_length - +				(isect - be->be_f_offset); +			if (cow_read) { +				sector_t cow_length = cow_read->be_length - +					(isect - cow_read->be_f_offset); +				extent_length = min(extent_length, cow_length); +			} +		} + +		if (is_dio) { +			pg_offset = f_offset & ~PAGE_CACHE_MASK; +			if (pg_offset + bytes_left > PAGE_CACHE_SIZE) +				pg_len = PAGE_CACHE_SIZE - pg_offset; +			else +				pg_len = bytes_left; + +			f_offset += pg_len; +			bytes_left -= pg_len; +			isect += (pg_offset >> SECTOR_SHIFT); +		} else { +			pg_offset = 0; +			pg_len = PAGE_CACHE_SIZE; +		} + +		hole = is_hole(be, isect); +		if (hole && !cow_read) { +			bio = bl_submit_bio(READ, bio); +			/* Fill hole w/ zeroes w/o accessing device */ +			dprintk("%s Zeroing page for hole\n", __func__); +			zero_user_segment(pages[i], pg_offset, pg_len); +			print_page(pages[i]); +			SetPageUptodate(pages[i]); +		} else { +			struct pnfs_block_extent *be_read; + +			be_read = (hole && cow_read) ? 
cow_read : be; +			bio = do_add_page_to_bio(bio, rdata->pages.npages - i, +						 READ, +						 isect, pages[i], be_read, +						 bl_end_io_read, par, +						 pg_offset, pg_len); +			if (IS_ERR(bio)) { +				header->pnfs_error = PTR_ERR(bio); +				bio = NULL; +				goto out; +			} +		} +		isect += (pg_len >> SECTOR_SHIFT); +		extent_length -= PAGE_CACHE_SECTORS; +	} +	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { +		rdata->res.eof = 1; +		rdata->res.count = header->inode->i_size - rdata->args.offset; +	} else { +		rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset; +	} +out: +	bl_put_extent(be); +	bl_put_extent(cow_read); +	bl_submit_bio(READ, bio); +	put_parallel(par); +	return PNFS_ATTEMPTED; + + use_mds: +	dprintk("Giving up and using normal NFS\n"); +	return PNFS_NOT_ATTEMPTED; +} + +static void mark_extents_written(struct pnfs_block_layout *bl, +				 __u64 offset, __u32 count) +{ +	sector_t isect, end; +	struct pnfs_block_extent *be; +	struct pnfs_block_short_extent *se; + +	dprintk("%s(%llu, %u)\n", __func__, offset, count); +	if (count == 0) +		return; +	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; +	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); +	end >>= SECTOR_SHIFT; +	while (isect < end) { +		sector_t len; +		be = bl_find_get_extent(bl, isect, NULL); +		BUG_ON(!be); /* FIXME */ +		len = min(end, be->be_f_offset + be->be_length) - isect; +		if (be->be_state == PNFS_BLOCK_INVALID_DATA) { +			se = bl_pop_one_short_extent(be->be_inval); +			BUG_ON(!se); +			bl_mark_for_commit(be, isect, len, se); +		} +		isect += len; +		bl_put_extent(be); +	} +} + +static void bl_end_io_write_zero(struct bio *bio, int err) +{ +	struct parallel_io *par = bio->bi_private; +	struct bio_vec *bvec; +	int i; + +	bio_for_each_segment_all(bvec, bio, i) { +		/* This is the zeroing page we added */ +		end_page_writeback(bvec->bv_page); +		page_cache_release(bvec->bv_page); +	} + +	if (unlikely(err)) { +		struct 
nfs_pgio_data *data = par->data; +		struct nfs_pgio_header *header = data->header; + +		if (!header->pnfs_error) +			header->pnfs_error = -EIO; +		pnfs_set_lo_fail(header->lseg); +	} +	bio_put(bio); +	put_parallel(par); +} + +static void bl_end_io_write(struct bio *bio, int err) +{ +	struct parallel_io *par = bio->bi_private; +	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); +	struct nfs_pgio_data *data = par->data; +	struct nfs_pgio_header *header = data->header; + +	if (!uptodate) { +		if (!header->pnfs_error) +			header->pnfs_error = -EIO; +		pnfs_set_lo_fail(header->lseg); +	} +	bio_put(bio); +	put_parallel(par); +} + +/* Function scheduled for call during bl_end_par_io_write, + * it marks sectors as written and extends the commitlist. + */ +static void bl_write_cleanup(struct work_struct *work) +{ +	struct rpc_task *task; +	struct nfs_pgio_data *wdata; +	dprintk("%s enter\n", __func__); +	task = container_of(work, struct rpc_task, u.tk_work); +	wdata = container_of(task, struct nfs_pgio_data, task); +	if (likely(!wdata->header->pnfs_error)) { +		/* Marks for LAYOUTCOMMIT */ +		mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), +				     wdata->args.offset, wdata->args.count); +	} +	pnfs_ld_write_done(wdata); +} + +/* Called when last of bios associated with a bl_write_pagelist call finishes */ +static void bl_end_par_io_write(void *data, int num_se) +{ +	struct nfs_pgio_data *wdata = data; + +	if (unlikely(wdata->header->pnfs_error)) { +		bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, +					num_se); +	} + +	wdata->task.tk_status = wdata->header->pnfs_error; +	wdata->verf.committed = NFS_FILE_SYNC; +	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); +	schedule_work(&wdata->task.u.tk_work); +} + +/* FIXME STUB - mark intersection of layout and page as bad, so is not + * used again. 
+ */
+static void mark_bad_read(void)
+{
+	return;
+}
+
+/*
+ * map_block:  map a requested I/O block (isect) into an offset in the LVM
+ * block_device
+ */
+static void
+map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
+{
+	dprintk("%s enter be=%p\n", __func__, be);
+
+	set_buffer_mapped(bh);
+	bh->b_bdev = be->be_mdev;
+	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
+	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
+
+	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
+		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
+		bh->b_size);
+	return;
+}
+
+/* bio completion handler: unlock the page carried by the bio's last vector. */
+static void
+bl_read_single_end_io(struct bio *bio, int error)
+{
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page = bvec->bv_page;
+
+	/* Only one page in bvec */
+	unlock_page(page);
+}
+
+/*
+ * Synchronously read one sector of @be into a temporary page and copy
+ * @len bytes at @offset from it into @page.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary page or the bio cannot be
+ * allocated, or -EIO if the page cannot be added to the bio or the read
+ * does not complete uptodate.  The temporary page and the bio are released
+ * on every exit path.
+ */
+static int
+bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
+		    unsigned int offset, unsigned int len)
+{
+	struct bio *bio;
+	struct page *shadow_page;
+	sector_t isect;
+	char *kaddr, *kshadow_addr;
+	int ret = 0;
+
+	dprintk("%s: offset %u len %u\n", __func__, offset, len);
+
+	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	if (shadow_page == NULL)
+		return -ENOMEM;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (bio == NULL) {
+		/* Do not leak the shadow page allocated above. */
+		ret = -ENOMEM;
+		goto out_free_page;
+	}
+
+	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
+		(offset / SECTOR_SIZE);
+
+	bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_bdev = be->be_mdev;
+	bio->bi_end_io = bl_read_single_end_io;
+
+	/* The end_io handler unlocks the page; we wait on that lock below. */
+	lock_page(shadow_page);
+	if (bio_add_page(bio, shadow_page,
+			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
+		unlock_page(shadow_page);
+		ret = -EIO;
+		goto out_put_bio;
+	}
+
+	submit_bio(READ, bio);
+	wait_on_page_locked(shadow_page);
+	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
+		ret = -EIO;
+	} else {
+		kaddr = kmap_atomic(page);
+		kshadow_addr = kmap_atomic(shadow_page);
+		memcpy(kaddr + offset, kshadow_addr + offset, len);
+		kunmap_atomic(kshadow_addr);
+		kunmap_atomic(kaddr);
+	}
+
+out_put_bio:
+	bio_put(bio);
+out_free_page:
+	__free_page(shadow_page);
+	return ret;
+}
+
+/*
+ * Fill in the parts of @page that surround the dirty range
+ * [@dirty_offset, @dirty_offset + @dirty_len).  The range is widened to the
+ * whole page when @full_page is set, otherwise to sector boundaries.  With
+ * no extent (@be is NULL) the surrounding bytes are zeroed; otherwise they
+ * are read from @be through bl_do_readpage_sync().
+ *
+ * Returns 0 on success or the error returned by bl_do_readpage_sync().
+ */
+static int
+bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
+			  unsigned int dirty_offset, unsigned int dirty_len,
+			  bool full_page)
+{
+	int ret = 0;
+	unsigned int start, end;
+
+	if (full_page) {
+		start = 0;
+		end = PAGE_CACHE_SIZE;
+	} else {
+		start = round_down(dirty_offset, SECTOR_SIZE);
+		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
+	}
+
+	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
+	if (!be) {
+		zero_user_segments(page, start, dirty_offset,
+				   dirty_offset + dirty_len, end);
+		if (start == 0 && end == PAGE_CACHE_SIZE &&
+		    trylock_page(page)) {
+			SetPageUptodate(page);
+			unlock_page(page);
+		}
+		return ret;
+	}
+
+	/* Leading partial sector(s) in front of the dirty range. */
+	if (start != dirty_offset)
+		ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
+
+	/* Trailing partial sector(s) behind the dirty range. */
+	if (!ret && (dirty_offset + dirty_len < end))
+		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
+					  end - dirty_offset - dirty_len);
+
+	return ret;
+}
+
+/* Given an unmapped page, zero it or read in page for COW, page is locked
+ * by caller.
+ */ +static int +init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) +{ +	struct buffer_head *bh = NULL; +	int ret = 0; +	sector_t isect; + +	dprintk("%s enter, %p\n", __func__, page); +	BUG_ON(PageUptodate(page)); +	if (!cow_read) { +		zero_user_segment(page, 0, PAGE_SIZE); +		SetPageUptodate(page); +		goto cleanup; +	} + +	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); +	if (!bh) { +		ret = -ENOMEM; +		goto cleanup; +	} + +	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; +	map_block(bh, isect, cow_read); +	if (!bh_uptodate_or_lock(bh)) +		ret = bh_submit_read(bh); +	if (ret) +		goto cleanup; +	SetPageUptodate(page); + +cleanup: +	if (bh) +		free_buffer_head(bh); +	if (ret) { +		/* Need to mark layout with bad read...should now +		 * just use nfs4 for reads and writes. +		 */ +		mark_bad_read(); +	} +	return ret; +} + +/* Find or create a zeroing page marked being writeback. + * Return ERR_PTR on error, NULL to indicate skip this page and page itself + * to indicate write out. 
+ */ +static struct page * +bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, +			struct pnfs_block_extent *cow_read) +{ +	struct page *page; +	int locked = 0; +	page = find_get_page(inode->i_mapping, index); +	if (page) +		goto check_page; + +	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); +	if (unlikely(!page)) { +		dprintk("%s oom\n", __func__); +		return ERR_PTR(-ENOMEM); +	} +	locked = 1; + +check_page: +	/* PageDirty: Other will write this out +	 * PageWriteback: Other is writing this out +	 * PageUptodate: It was read before +	 */ +	if (PageDirty(page) || PageWriteback(page)) { +		print_page(page); +		if (locked) +			unlock_page(page); +		page_cache_release(page); +		return NULL; +	} + +	if (!locked) { +		lock_page(page); +		locked = 1; +		goto check_page; +	} +	if (!PageUptodate(page)) { +		/* New page, readin or zero it */ +		init_page_for_write(page, cow_read); +	} +	set_page_writeback(page); +	unlock_page(page); + +	return page; +} + +static enum pnfs_try_status +bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) +{ +	struct nfs_pgio_header *header = wdata->header; +	int i, ret, npg_zero, pg_index, last = 0; +	struct bio *bio = NULL; +	struct pnfs_block_extent *be = NULL, *cow_read = NULL; +	sector_t isect, last_isect = 0, extent_length = 0; +	struct parallel_io *par = NULL; +	loff_t offset = wdata->args.offset; +	size_t count = wdata->args.count; +	unsigned int pg_offset, pg_len, saved_len; +	struct page **pages = wdata->args.pages; +	struct page *page; +	pgoff_t index; +	u64 temp; +	int npg_per_block = +	    NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + +	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + +	if (header->dreq != NULL && +	    (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) || +	     !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) { +		dprintk("pnfsblock nonblock aligned DIO writes. 
Resend MDS\n"); +		goto out_mds; +	} +	/* At this point, wdata->pages is a (sequential) list of nfs_pages. +	 * We want to write each, and if there is an error set pnfs_error +	 * to have it redone using nfs. +	 */ +	par = alloc_parallel(wdata); +	if (!par) +		goto out_mds; +	par->pnfs_callback = bl_end_par_io_write; +	/* At this point, have to be more careful with error handling */ + +	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); +	be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read); +	if (!be || !is_writable(be, isect)) { +		dprintk("%s no matching extents!\n", __func__); +		goto out_mds; +	} + +	/* First page inside INVALID extent */ +	if (be->be_state == PNFS_BLOCK_INVALID_DATA) { +		if (likely(!bl_push_one_short_extent(be->be_inval))) +			par->bse_count++; +		else +			goto out_mds; +		temp = offset >> PAGE_CACHE_SHIFT; +		npg_zero = do_div(temp, npg_per_block); +		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & +				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); +		extent_length = be->be_length - (isect - be->be_f_offset); + +fill_invalid_ext: +		dprintk("%s need to zero %d pages\n", __func__, npg_zero); +		for (;npg_zero > 0; npg_zero--) { +			if (bl_is_sector_init(be->be_inval, isect)) { +				dprintk("isect %llu already init\n", +					(unsigned long long)isect); +				goto next_page; +			} +			/* page ref released in bl_end_io_write_zero */ +			index = isect >> PAGE_CACHE_SECTOR_SHIFT; +			dprintk("%s zero %dth page: index %lu isect %llu\n", +				__func__, npg_zero, index, +				(unsigned long long)isect); +			page = bl_find_get_zeroing_page(header->inode, index, +							cow_read); +			if (unlikely(IS_ERR(page))) { +				header->pnfs_error = PTR_ERR(page); +				goto out; +			} else if (page == NULL) +				goto next_page; + +			ret = bl_mark_sectors_init(be->be_inval, isect, +						       PAGE_CACHE_SECTORS); +			if (unlikely(ret)) { +				dprintk("%s bl_mark_sectors_init fail %d\n", +					__func__, ret); +			
	end_page_writeback(page); +				page_cache_release(page); +				header->pnfs_error = ret; +				goto out; +			} +			if (likely(!bl_push_one_short_extent(be->be_inval))) +				par->bse_count++; +			else { +				end_page_writeback(page); +				page_cache_release(page); +				header->pnfs_error = -ENOMEM; +				goto out; +			} +			/* FIXME: This should be done in bi_end_io */ +			mark_extents_written(BLK_LSEG2EXT(header->lseg), +					     page->index << PAGE_CACHE_SHIFT, +					     PAGE_CACHE_SIZE); + +			bio = bl_add_page_to_bio(bio, npg_zero, WRITE, +						 isect, page, be, +						 bl_end_io_write_zero, par); +			if (IS_ERR(bio)) { +				header->pnfs_error = PTR_ERR(bio); +				bio = NULL; +				goto out; +			} +next_page: +			isect += PAGE_CACHE_SECTORS; +			extent_length -= PAGE_CACHE_SECTORS; +		} +		if (last) +			goto write_done; +	} +	bio = bl_submit_bio(WRITE, bio); + +	/* Middle pages */ +	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; +	for (i = pg_index; i < wdata->pages.npages; i++) { +		if (!extent_length) { +			/* We've used up the previous extent */ +			bl_put_extent(be); +			bl_put_extent(cow_read); +			bio = bl_submit_bio(WRITE, bio); +			/* Get the next one */ +			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), +					     isect, &cow_read); +			if (!be || !is_writable(be, isect)) { +				header->pnfs_error = -EINVAL; +				goto out; +			} +			if (be->be_state == PNFS_BLOCK_INVALID_DATA) { +				if (likely(!bl_push_one_short_extent( +								be->be_inval))) +					par->bse_count++; +				else { +					header->pnfs_error = -ENOMEM; +					goto out; +				} +			} +			extent_length = be->be_length - +			    (isect - be->be_f_offset); +		} + +		dprintk("%s offset %lld count %Zu\n", __func__, offset, count); +		pg_offset = offset & ~PAGE_CACHE_MASK; +		if (pg_offset + count > PAGE_CACHE_SIZE) +			pg_len = PAGE_CACHE_SIZE - pg_offset; +		else +			pg_len = count; + +		saved_len = pg_len; +		if (be->be_state == PNFS_BLOCK_INVALID_DATA && +		    
!bl_is_sector_init(be->be_inval, isect)) { +			ret = bl_read_partial_page_sync(pages[i], cow_read, +							pg_offset, pg_len, true); +			if (ret) { +				dprintk("%s bl_read_partial_page_sync fail %d\n", +					__func__, ret); +				header->pnfs_error = ret; +				goto out; +			} + +			ret = bl_mark_sectors_init(be->be_inval, isect, +						       PAGE_CACHE_SECTORS); +			if (unlikely(ret)) { +				dprintk("%s bl_mark_sectors_init fail %d\n", +					__func__, ret); +				header->pnfs_error = ret; +				goto out; +			} + +			/* Expand to full page write */ +			pg_offset = 0; +			pg_len = PAGE_CACHE_SIZE; +		} else if  ((pg_offset & (SECTOR_SIZE - 1)) || +			    (pg_len & (SECTOR_SIZE - 1))){ +			/* ahh, nasty case. We have to do sync full sector +			 * read-modify-write cycles. +			 */ +			unsigned int saved_offset = pg_offset; +			ret = bl_read_partial_page_sync(pages[i], be, pg_offset, +							pg_len, false); +			pg_offset = round_down(pg_offset, SECTOR_SIZE); +			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE) +				 - pg_offset; +		} + + +		bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, +					 isect, pages[i], be, +					 bl_end_io_write, par, +					 pg_offset, pg_len); +		if (IS_ERR(bio)) { +			header->pnfs_error = PTR_ERR(bio); +			bio = NULL; +			goto out; +		} +		offset += saved_len; +		count -= saved_len; +		isect += PAGE_CACHE_SECTORS; +		last_isect = isect; +		extent_length -= PAGE_CACHE_SECTORS; +	} + +	/* Last page inside INVALID extent */ +	if (be->be_state == PNFS_BLOCK_INVALID_DATA) { +		bio = bl_submit_bio(WRITE, bio); +		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; +		npg_zero = npg_per_block - do_div(temp, npg_per_block); +		if (npg_zero < npg_per_block) { +			last = 1; +			goto fill_invalid_ext; +		} +	} + +write_done: +	wdata->res.count = wdata->args.count; +out: +	bl_put_extent(be); +	bl_put_extent(cow_read); +	bl_submit_bio(WRITE, bio); +	put_parallel(par); +	return PNFS_ATTEMPTED; +out_mds: +	bl_put_extent(be); +	
bl_put_extent(cow_read); +	kfree(par); +	return PNFS_NOT_ATTEMPTED; +} + +/* FIXME - range ignored */ +static void +release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) +{ +	int i; +	struct pnfs_block_extent *be; + +	spin_lock(&bl->bl_ext_lock); +	for (i = 0; i < EXTENT_LISTS; i++) { +		while (!list_empty(&bl->bl_extents[i])) { +			be = list_first_entry(&bl->bl_extents[i], +					      struct pnfs_block_extent, +					      be_node); +			list_del(&be->be_node); +			bl_put_extent(be); +		} +	} +	spin_unlock(&bl->bl_ext_lock); +} + +static void +release_inval_marks(struct pnfs_inval_markings *marks) +{ +	struct pnfs_inval_tracking *pos, *temp; +	struct pnfs_block_short_extent *se, *stemp; + +	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { +		list_del(&pos->it_link); +		kfree(pos); +	} + +	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { +		list_del(&se->bse_node); +		kfree(se); +	} +	return; +} + +static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ +	struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + +	dprintk("%s enter\n", __func__); +	release_extents(bl, NULL); +	release_inval_marks(&bl->bl_inval); +	kfree(bl); +} + +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, +						   gfp_t gfp_flags) +{ +	struct pnfs_block_layout *bl; + +	dprintk("%s enter\n", __func__); +	bl = kzalloc(sizeof(*bl), gfp_flags); +	if (!bl) +		return NULL; +	spin_lock_init(&bl->bl_ext_lock); +	INIT_LIST_HEAD(&bl->bl_extents[0]); +	INIT_LIST_HEAD(&bl->bl_extents[1]); +	INIT_LIST_HEAD(&bl->bl_commit); +	INIT_LIST_HEAD(&bl->bl_committing); +	bl->bl_count = 0; +	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; +	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); +	return &bl->bl_layout; +} + +static void bl_free_lseg(struct pnfs_layout_segment *lseg) +{ +	dprintk("%s enter\n", __func__); +	kfree(lseg); +} + +/* We pretty much ignore lseg, and store all data layout wide, so we + 
* can correctly merge. + */ +static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, +						 struct nfs4_layoutget_res *lgr, +						 gfp_t gfp_flags) +{ +	struct pnfs_layout_segment *lseg; +	int status; + +	dprintk("%s enter\n", __func__); +	lseg = kzalloc(sizeof(*lseg), gfp_flags); +	if (!lseg) +		return ERR_PTR(-ENOMEM); +	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); +	if (status) { +		/* We don't want to call the full-blown bl_free_lseg, +		 * since on error extents were not touched. +		 */ +		kfree(lseg); +		return ERR_PTR(status); +	} +	return lseg; +} + +static void +bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, +		       const struct nfs4_layoutcommit_args *arg) +{ +	dprintk("%s enter\n", __func__); +	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); +} + +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +{ +	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + +	dprintk("%s enter\n", __func__); +	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); +} + +static void free_blk_mountid(struct block_mount_id *mid) +{ +	if (mid) { +		struct pnfs_block_dev *dev, *tmp; + +		/* No need to take bm_lock as we are last user freeing bm_devlist */ +		list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { +			list_del(&dev->bm_node); +			bl_free_block_dev(dev); +		} +		kfree(mid); +	} +} + +/* This is mostly copied from the filelayout_get_device_info function. + * It seems much of this should be at the generic pnfs level. 
+ */ +static struct pnfs_block_dev * +nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, +			struct nfs4_deviceid *d_id) +{ +	struct pnfs_device *dev; +	struct pnfs_block_dev *rv; +	u32 max_resp_sz; +	int max_pages; +	struct page **pages = NULL; +	int i, rc; + +	/* +	 * Use the session max response size as the basis for setting +	 * GETDEVICEINFO's maxcount +	 */ +	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; +	max_pages = nfs_page_array_len(0, max_resp_sz); +	dprintk("%s max_resp_sz %u max_pages %d\n", +		__func__, max_resp_sz, max_pages); + +	dev = kmalloc(sizeof(*dev), GFP_NOFS); +	if (!dev) { +		dprintk("%s kmalloc failed\n", __func__); +		return ERR_PTR(-ENOMEM); +	} + +	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); +	if (pages == NULL) { +		kfree(dev); +		return ERR_PTR(-ENOMEM); +	} +	for (i = 0; i < max_pages; i++) { +		pages[i] = alloc_page(GFP_NOFS); +		if (!pages[i]) { +			rv = ERR_PTR(-ENOMEM); +			goto out_free; +		} +	} + +	memcpy(&dev->dev_id, d_id, sizeof(*d_id)); +	dev->layout_type = LAYOUT_BLOCK_VOLUME; +	dev->pages = pages; +	dev->pgbase = 0; +	dev->pglen = PAGE_SIZE * max_pages; +	dev->mincount = 0; +	dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; + +	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); +	rc = nfs4_proc_getdeviceinfo(server, dev, NULL); +	dprintk("%s getdevice info returns %d\n", __func__, rc); +	if (rc) { +		rv = ERR_PTR(rc); +		goto out_free; +	} + +	rv = nfs4_blk_decode_device(server, dev); + out_free: +	for (i = 0; i < max_pages; i++) +		__free_page(pages[i]); +	kfree(pages); +	kfree(dev); +	return rv; +} + +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ +	struct block_mount_id *b_mt_id = NULL; +	struct pnfs_devicelist *dlist = NULL; +	struct pnfs_block_dev *bdev; +	LIST_HEAD(block_disklist); +	int status, i; + +	dprintk("%s enter\n", __func__); + +	if (server->pnfs_blksize == 0) { +		dprintk("%s Server 
did not return blksize\n", __func__); +		return -EINVAL; +	} +	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); +	if (!b_mt_id) { +		status = -ENOMEM; +		goto out_error; +	} +	/* Initialize nfs4 block layout mount id */ +	spin_lock_init(&b_mt_id->bm_lock); +	INIT_LIST_HEAD(&b_mt_id->bm_devlist); + +	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); +	if (!dlist) { +		status = -ENOMEM; +		goto out_error; +	} +	dlist->eof = 0; +	while (!dlist->eof) { +		status = nfs4_proc_getdevicelist(server, fh, dlist); +		if (status) +			goto out_error; +		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", +			__func__, dlist->num_devs, dlist->eof); +		for (i = 0; i < dlist->num_devs; i++) { +			bdev = nfs4_blk_get_deviceinfo(server, fh, +						       &dlist->dev_id[i]); +			if (IS_ERR(bdev)) { +				status = PTR_ERR(bdev); +				goto out_error; +			} +			spin_lock(&b_mt_id->bm_lock); +			list_add(&bdev->bm_node, &b_mt_id->bm_devlist); +			spin_unlock(&b_mt_id->bm_lock); +		} +	} +	dprintk("%s SUCCESS\n", __func__); +	server->pnfs_ld_data = b_mt_id; + + out_return: +	kfree(dlist); +	return status; + + out_error: +	free_blk_mountid(b_mt_id); +	goto out_return; +} + +static int +bl_clear_layoutdriver(struct nfs_server *server) +{ +	struct block_mount_id *b_mt_id = server->pnfs_ld_data; + +	dprintk("%s enter\n", __func__); +	free_blk_mountid(b_mt_id); +	dprintk("%s RETURNS\n", __func__); +	return 0; +} + +static bool +is_aligned_req(struct nfs_page *req, unsigned int alignment) +{ +	return IS_ALIGNED(req->wb_offset, alignment) && +	       IS_ALIGNED(req->wb_bytes, alignment); +} + +static void +bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ +	if (pgio->pg_dreq != NULL && +	    !is_aligned_req(req, SECTOR_SIZE)) +		nfs_pageio_reset_read_mds(pgio); +	else +		pnfs_generic_pg_init_read(pgio, req); +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be 
coalesced. + */ +static size_t +bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, +		struct nfs_page *req) +{ +	if (pgio->pg_dreq != NULL && +	    !is_aligned_req(req, SECTOR_SIZE)) +		return 0; + +	return pnfs_generic_pg_test(pgio, prev, req); +} + +/* + * Return the number of contiguous bytes for a given inode + * starting at page frame idx. + */ +static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) +{ +	struct address_space *mapping = inode->i_mapping; +	pgoff_t end; + +	/* Optimize common case that writes from 0 to end of file */ +	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); +	if (end != NFS_I(inode)->npages) { +		rcu_read_lock(); +		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); +		rcu_read_unlock(); +	} + +	if (!end) +		return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT); +	else +		return (end - idx) << PAGE_CACHE_SHIFT; +} + +static void +bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ +	if (pgio->pg_dreq != NULL && +	    !is_aligned_req(req, PAGE_CACHE_SIZE)) { +		nfs_pageio_reset_write_mds(pgio); +	} else { +		u64 wb_size; +		if (pgio->pg_dreq == NULL) +			wb_size = pnfs_num_cont_bytes(pgio->pg_inode, +						      req->wb_index); +		else +			wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + +		pnfs_generic_pg_init_write(pgio, req, wb_size); +	} +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
+ */ +static size_t +bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, +		 struct nfs_page *req) +{ +	if (pgio->pg_dreq != NULL && +	    !is_aligned_req(req, PAGE_CACHE_SIZE)) +		return 0; + +	return pnfs_generic_pg_test(pgio, prev, req); +} + +static const struct nfs_pageio_ops bl_pg_read_ops = { +	.pg_init = bl_pg_init_read, +	.pg_test = bl_pg_test_read, +	.pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops bl_pg_write_ops = { +	.pg_init = bl_pg_init_write, +	.pg_test = bl_pg_test_write, +	.pg_doio = pnfs_generic_pg_writepages, +}; + +static struct pnfs_layoutdriver_type blocklayout_type = { +	.id				= LAYOUT_BLOCK_VOLUME, +	.name				= "LAYOUT_BLOCK_VOLUME", +	.owner				= THIS_MODULE, +	.read_pagelist			= bl_read_pagelist, +	.write_pagelist			= bl_write_pagelist, +	.alloc_layout_hdr		= bl_alloc_layout_hdr, +	.free_layout_hdr		= bl_free_layout_hdr, +	.alloc_lseg			= bl_alloc_lseg, +	.free_lseg			= bl_free_lseg, +	.encode_layoutcommit		= bl_encode_layoutcommit, +	.cleanup_layoutcommit		= bl_cleanup_layoutcommit, +	.set_layoutdriver		= bl_set_layoutdriver, +	.clear_layoutdriver		= bl_clear_layoutdriver, +	.pg_read_ops			= &bl_pg_read_ops, +	.pg_write_ops			= &bl_pg_write_ops, +}; + +static const struct rpc_pipe_ops bl_upcall_ops = { +	.upcall		= rpc_pipe_generic_upcall, +	.downcall	= bl_pipe_downcall, +	.destroy_msg	= bl_pipe_destroy_msg, +}; + +static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, +					    struct rpc_pipe *pipe) +{ +	struct dentry *dir, *dentry; + +	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); +	if (dir == NULL) +		return ERR_PTR(-ENOENT); +	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); +	dput(dir); +	return dentry; +} + +static void nfs4blocklayout_unregister_sb(struct super_block *sb, +					  struct rpc_pipe *pipe) +{ +	if (pipe->dentry) +		rpc_unlink(pipe->dentry); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, +			   void 
*ptr) +{ +	struct super_block *sb = ptr; +	struct net *net = sb->s_fs_info; +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct dentry *dentry; +	int ret = 0; + +	if (!try_module_get(THIS_MODULE)) +		return 0; + +	if (nn->bl_device_pipe == NULL) { +		module_put(THIS_MODULE); +		return 0; +	} + +	switch (event) { +	case RPC_PIPEFS_MOUNT: +		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); +		if (IS_ERR(dentry)) { +			ret = PTR_ERR(dentry); +			break; +		} +		nn->bl_device_pipe->dentry = dentry; +		break; +	case RPC_PIPEFS_UMOUNT: +		if (nn->bl_device_pipe->dentry) +			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); +		break; +	default: +		ret = -ENOTSUPP; +		break; +	} +	module_put(THIS_MODULE); +	return ret; +} + +static struct notifier_block nfs4blocklayout_block = { +	.notifier_call = rpc_pipefs_event, +}; + +static struct dentry *nfs4blocklayout_register_net(struct net *net, +						   struct rpc_pipe *pipe) +{ +	struct super_block *pipefs_sb; +	struct dentry *dentry; + +	pipefs_sb = rpc_get_sb_net(net); +	if (!pipefs_sb) +		return NULL; +	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); +	rpc_put_sb_net(net); +	return dentry; +} + +static void nfs4blocklayout_unregister_net(struct net *net, +					   struct rpc_pipe *pipe) +{ +	struct super_block *pipefs_sb; + +	pipefs_sb = rpc_get_sb_net(net); +	if (pipefs_sb) { +		nfs4blocklayout_unregister_sb(pipefs_sb, pipe); +		rpc_put_sb_net(net); +	} +} + +static int nfs4blocklayout_net_init(struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct dentry *dentry; + +	init_waitqueue_head(&nn->bl_wq); +	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); +	if (IS_ERR(nn->bl_device_pipe)) +		return PTR_ERR(nn->bl_device_pipe); +	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); +	if (IS_ERR(dentry)) { +		rpc_destroy_pipe_data(nn->bl_device_pipe); +		return PTR_ERR(dentry); +	} +	nn->bl_device_pipe->dentry = dentry; +	return 0; +} + +static void 
nfs4blocklayout_net_exit(struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); +	rpc_destroy_pipe_data(nn->bl_device_pipe); +	nn->bl_device_pipe = NULL; +} + +static struct pernet_operations nfs4blocklayout_net_ops = { +	.init = nfs4blocklayout_net_init, +	.exit = nfs4blocklayout_net_exit, +}; + +static int __init nfs4blocklayout_init(void) +{ +	int ret; + +	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); + +	ret = pnfs_register_layoutdriver(&blocklayout_type); +	if (ret) +		goto out; + +	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); +	if (ret) +		goto out_remove; +	ret = register_pernet_subsys(&nfs4blocklayout_net_ops); +	if (ret) +		goto out_notifier; +out: +	return ret; + +out_notifier: +	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); +out_remove: +	pnfs_unregister_layoutdriver(&blocklayout_type); +	return ret; +} + +static void __exit nfs4blocklayout_exit(void) +{ +	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", +	       __func__); + +	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); +	unregister_pernet_subsys(&nfs4blocklayout_net_ops); +	pnfs_unregister_layoutdriver(&blocklayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-3"); + +module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h new file mode 100644 index 00000000000..9838fb02047 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.h @@ -0,0 +1,211 @@ +/* + *  linux/fs/nfs/blocklayout/blocklayout.h + * + *  Module for the NFSv4.1 pNFS block layout driver. + * + *  Copyright (c) 2006 The Regents of the University of Michigan. + *  All rights reserved. 
+ * + *  Andy Adamson <andros@citi.umich.edu> + *  Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */ +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H +#define FS_NFS_NFS4BLOCKLAYOUT_H + +#include <linux/device-mapper.h> +#include <linux/nfs_fs.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "../nfs4_fs.h" +#include "../pnfs.h" +#include "../netns.h" + +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) +#define SECTOR_SIZE (1 << SECTOR_SHIFT) + +struct block_mount_id { +	spinlock_t			bm_lock;    /* protects list */ +	struct list_head		bm_devlist; /* holds pnfs_block_dev */ +}; + +struct pnfs_block_dev { +	struct list_head		bm_node; +	struct nfs4_deviceid		bm_mdevid;    /* associated devid */ +	struct block_device		*bm_mdev;     /* meta device itself */ +	struct net			*net; +}; + +enum exstate4 { +	PNFS_BLOCK_READWRITE_DATA	= 0, +	PNFS_BLOCK_READ_DATA		= 1, +	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */ +	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */ +}; + +#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ + +struct my_tree { +	sector_t		mtt_step_size;	/* Internal sector alignment */ +	struct list_head	mtt_stub; /* Should be a radix tree */ +}; + +struct pnfs_inval_markings { +	spinlock_t	im_lock; +	struct my_tree	im_tree;	/* Sectors that need LAYOUTCOMMIT */ +	sector_t	im_block_size;	/* Server blocksize in sectors */ +	struct list_head im_extents;	/* Short extents for INVAL->RW conversion */ +}; + +struct pnfs_inval_tracking { +	struct list_head it_link; +	int		 it_sector; +	int		 it_tags; +}; + +/* sector_t fields are all in 512-byte sectors */ +struct pnfs_block_extent { +	struct kref	be_refcnt; +	struct list_head be_node;	/* link into lseg list */ +	struct nfs4_deviceid be_devid;  /* FIXME: could use device cache instead */ +	struct block_device *be_mdev; +	sector_t	be_f_offset;	/* the starting offset in the file */ +	sector_t	be_length;	/* the size of the extent */ +	sector_t	be_v_offset;	/* the starting offset in the volume */ +	enum 
exstate4	be_state;	/* the state of this extent */ +	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +}; + +/* Shortened extent used by LAYOUTCOMMIT */ +struct pnfs_block_short_extent { +	struct list_head bse_node; +	struct nfs4_deviceid bse_devid; +	struct block_device *bse_mdev; +	sector_t	bse_f_offset;	/* the starting offset in the file */ +	sector_t	bse_length;	/* the size of the extent */ +}; + +static inline void +BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) +{ +	spin_lock_init(&marks->im_lock); +	INIT_LIST_HEAD(&marks->im_tree.mtt_stub); +	INIT_LIST_HEAD(&marks->im_extents); +	marks->im_block_size = blocksize; +	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, +					   blocksize); +} + +enum extentclass4 { +	RW_EXTENT       = 0, /* READWRTE and INVAL */ +	RO_EXTENT       = 1, /* READ and NONE */ +	EXTENT_LISTS    = 2, +}; + +static inline int bl_choose_list(enum exstate4 state) +{ +	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) +		return RO_EXTENT; +	else +		return RW_EXTENT; +} + +struct pnfs_block_layout { +	struct pnfs_layout_hdr bl_layout; +	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ +	spinlock_t		bl_ext_lock;   /* Protects list manipulation */ +	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */ +	struct list_head	bl_commit;	/* Needs layout commit */ +	struct list_head	bl_committing;	/* Layout committing */ +	unsigned int		bl_count;	/* entries in bl_commit */ +	sector_t		bl_blocksize;  /* Server blocksize in sectors */ +}; + +#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) + +static inline struct pnfs_block_layout * +BLK_LO2EXT(struct pnfs_layout_hdr *lo) +{ +	return container_of(lo, struct pnfs_block_layout, bl_layout); +} + +static inline struct pnfs_block_layout * +BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) +{ +	return BLK_LO2EXT(lseg->pls_layout); +} + +struct bl_pipe_msg { 
+	struct rpc_pipe_msg msg; +	wait_queue_head_t *bl_wq; +}; + +struct bl_msg_hdr { +	u8  type; +	u16 totallen; /* length of entire message, including hdr itself */ +}; + +#define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */ +#define BL_DEVICE_MOUNT                0x1 /* Mount--create devices*/ +#define BL_DEVICE_REQUEST_INIT         0x0 /* Start request */ +#define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */ +#define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */ + +/* blocklayoutdev.c */ +ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); +void bl_pipe_destroy_msg(struct rpc_pipe_msg *); +void nfs4_blkdev_put(struct block_device *bdev); +struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, +						struct pnfs_device *dev); +int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, +				struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + +/* blocklayoutdm.c */ +void bl_free_block_dev(struct pnfs_block_dev *bdev); + +/* extents.c */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, +		struct pnfs_block_extent **cow_read); +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, +			     sector_t offset, sector_t length); +void bl_put_extent(struct pnfs_block_extent *be); +struct pnfs_block_extent *bl_alloc_extent(void); +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); +int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, +				   struct xdr_stream *xdr, +				   const struct nfs4_layoutcommit_args *arg); +void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, +				   const struct nfs4_layoutcommit_args *arg, +				   int status); +int bl_add_merge_extent(struct pnfs_block_layout *bl, +			 struct pnfs_block_extent *new); +int bl_mark_for_commit(struct pnfs_block_extent *be, +			sector_t offset, sector_t length, +			struct pnfs_block_short_extent *new); +int 
bl_push_one_short_extent(struct pnfs_inval_markings *marks); +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks); +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 00000000000..04303b5c936 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -0,0 +1,384 @@ +/* + *  linux/fs/nfs/blocklayout/blocklayoutdev.c + * + *  Device operations for the pnfs nfs4 file layout driver. + * + *  Copyright (c) 2006 The Regents of the University of Michigan. + *  All rights reserved. + * + *  Andy Adamson <andros@citi.umich.edu> + *  Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  
the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#include <linux/module.h> +#include <linux/buffer_head.h> /* __bread */ + +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD + +static int decode_sector_number(__be32 **rp, sector_t *sp) +{ +	uint64_t s; + +	*rp = xdr_decode_hyper(*rp, &s); +	if (s & 0x1ff) { +		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); +		return -1; +	} +	*sp = s >> SECTOR_SHIFT; +	return 0; +} + +/* + * Release the block device + */ +void nfs4_blkdev_put(struct block_device *bdev) +{ +	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), +			MINOR(bdev->bd_dev)); +	blkdev_put(bdev, FMODE_READ); +} + +ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, +			 size_t mlen) +{ +	struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, +					 nfs_net_id); + +	if (mlen != sizeof (struct bl_dev_msg)) +		return -EINVAL; + +	if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) +		return -EFAULT; + +	wake_up(&nn->bl_wq); + +	return mlen; +} + +void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ +	struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); + +	if (msg->errno >= 0) +		return; +	wake_up(bl_pipe_msg->bl_wq); +} + +/* + * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. 
+ */ +struct pnfs_block_dev * +nfs4_blk_decode_device(struct nfs_server *server, +		       struct pnfs_device *dev) +{ +	struct pnfs_block_dev *rv; +	struct block_device *bd = NULL; +	struct bl_pipe_msg bl_pipe_msg; +	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; +	struct bl_msg_hdr bl_msg = { +		.type = BL_DEVICE_MOUNT, +		.totallen = dev->mincount, +	}; +	uint8_t *dataptr; +	DECLARE_WAITQUEUE(wq, current); +	int offset, len, i, rc; +	struct net *net = server->nfs_client->cl_net; +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct bl_dev_msg *reply = &nn->bl_mount_reply; + +	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); +	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, +		dev->mincount); + +	bl_pipe_msg.bl_wq = &nn->bl_wq; +	memset(msg, 0, sizeof(*msg)); +	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); +	if (!msg->data) { +		rv = ERR_PTR(-ENOMEM); +		goto out; +	} + +	memcpy(msg->data, &bl_msg, sizeof(bl_msg)); +	dataptr = (uint8_t *) msg->data; +	len = dev->mincount; +	offset = sizeof(bl_msg); +	for (i = 0; len > 0; i++) { +		memcpy(&dataptr[offset], page_address(dev->pages[i]), +				len < PAGE_CACHE_SIZE ? 
len : PAGE_CACHE_SIZE); +		len -= PAGE_CACHE_SIZE; +		offset += PAGE_CACHE_SIZE; +	} +	msg->len = sizeof(bl_msg) + dev->mincount; + +	dprintk("%s CALLING USERSPACE DAEMON\n", __func__); +	add_wait_queue(&nn->bl_wq, &wq); +	rc = rpc_queue_upcall(nn->bl_device_pipe, msg); +	if (rc < 0) { +		remove_wait_queue(&nn->bl_wq, &wq); +		rv = ERR_PTR(rc); +		goto out; +	} + +	set_current_state(TASK_UNINTERRUPTIBLE); +	schedule(); +	__set_current_state(TASK_RUNNING); +	remove_wait_queue(&nn->bl_wq, &wq); + +	if (reply->status != BL_DEVICE_REQUEST_PROC) { +		dprintk("%s failed to open device: %d\n", +			__func__, reply->status); +		rv = ERR_PTR(-EINVAL); +		goto out; +	} + +	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), +			       FMODE_READ, NULL); +	if (IS_ERR(bd)) { +		dprintk("%s failed to open device : %ld\n", __func__, +			PTR_ERR(bd)); +		rv = ERR_CAST(bd); +		goto out; +	} + +	rv = kzalloc(sizeof(*rv), GFP_NOFS); +	if (!rv) { +		rv = ERR_PTR(-ENOMEM); +		goto out; +	} + +	rv->bm_mdev = bd; +	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); +	rv->net = net; +	dprintk("%s Created device %s with bd_block_size %u\n", +		__func__, +		bd->bd_disk->disk_name, +		bd->bd_block_size); + +out: +	kfree(msg->data); +	return rv; +} + +/* Map deviceid returned by the server to constructed block_device */ +static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, +					    struct nfs4_deviceid *id) +{ +	struct block_device *rv = NULL; +	struct block_mount_id *mid; +	struct pnfs_block_dev *dev; + +	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); +	mid = BLK_ID(lo); +	spin_lock(&mid->bm_lock); +	list_for_each_entry(dev, &mid->bm_devlist, bm_node) { +		if (memcmp(id->data, dev->bm_mdevid.data, +			   NFS4_DEVICEID4_SIZE) == 0) { +			rv = dev->bm_mdev; +			goto out; +		} +	} + out: +	spin_unlock(&mid->bm_lock); +	dprintk("%s returning %p\n", __func__, rv); +	return rv; +} + +/* Tracks info needed to ensure extents in layout obey 
constraints of spec */ +struct layout_verification { +	u32 mode;	/* R or RW */ +	u64 start;	/* Expected start of next non-COW extent */ +	u64 inval;	/* Start of INVAL coverage */ +	u64 cowread;	/* End of COW read coverage */ +}; + +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, +			 struct layout_verification *lv) +{ +	if (lv->mode == IOMODE_READ) { +		if (be->be_state == PNFS_BLOCK_READWRITE_DATA || +		    be->be_state == PNFS_BLOCK_INVALID_DATA) +			return -EIO; +		if (be->be_f_offset != lv->start) +			return -EIO; +		lv->start += be->be_length; +		return 0; +	} +	/* lv->mode == IOMODE_RW */ +	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { +		if (be->be_f_offset != lv->start) +			return -EIO; +		if (lv->cowread > lv->start) +			return -EIO; +		lv->start += be->be_length; +		lv->inval = lv->start; +		return 0; +	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { +		if (be->be_f_offset != lv->start) +			return -EIO; +		lv->start += be->be_length; +		return 0; +	} else if (be->be_state == PNFS_BLOCK_READ_DATA) { +		if (be->be_f_offset > lv->start) +			return -EIO; +		if (be->be_f_offset < lv->inval) +			return -EIO; +		if (be->be_f_offset < lv->cowread) +			return -EIO; +		/* It looks like you might want to min this with lv->start, +		 * but you really don't. 
+		 */ +		lv->inval = lv->inval + be->be_length; +		lv->cowread = be->be_f_offset + be->be_length; +		return 0; +	} else +		return -EIO; +} + +/* XDR decode pnfs_block_layout4 structure */ +int +nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, +			   struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) +{ +	struct pnfs_block_layout *bl = BLK_LO2EXT(lo); +	int i, status = -EIO; +	uint32_t count; +	struct pnfs_block_extent *be = NULL, *save; +	struct xdr_stream stream; +	struct xdr_buf buf; +	struct page *scratch; +	__be32 *p; +	struct layout_verification lv = { +		.mode = lgr->range.iomode, +		.start = lgr->range.offset >> SECTOR_SHIFT, +		.inval = lgr->range.offset >> SECTOR_SHIFT, +		.cowread = lgr->range.offset >> SECTOR_SHIFT, +	}; +	LIST_HEAD(extents); + +	dprintk("---> %s\n", __func__); + +	scratch = alloc_page(gfp_flags); +	if (!scratch) +		return -ENOMEM; + +	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); +	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + +	p = xdr_inline_decode(&stream, 4); +	if (unlikely(!p)) +		goto out_err; + +	count = be32_to_cpup(p++); + +	dprintk("%s enter, number of extents %i\n", __func__, count); +	p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); +	if (unlikely(!p)) +		goto out_err; + +	/* Decode individual extents, putting them in temporary +	 * staging area until whole layout is decoded to make error +	 * recovery easier. 
+	 */ +	for (i = 0; i < count; i++) { +		be = bl_alloc_extent(); +		if (!be) { +			status = -ENOMEM; +			goto out_err; +		} +		memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); +		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); +		be->be_mdev = translate_devid(lo, &be->be_devid); +		if (!be->be_mdev) +			goto out_err; + +		/* The next three values are read in as bytes, +		 * but stored as 512-byte sector lengths +		 */ +		if (decode_sector_number(&p, &be->be_f_offset) < 0) +			goto out_err; +		if (decode_sector_number(&p, &be->be_length) < 0) +			goto out_err; +		if (decode_sector_number(&p, &be->be_v_offset) < 0) +			goto out_err; +		be->be_state = be32_to_cpup(p++); +		if (be->be_state == PNFS_BLOCK_INVALID_DATA) +			be->be_inval = &bl->bl_inval; +		if (verify_extent(be, &lv)) { +			dprintk("%s verify failed\n", __func__); +			goto out_err; +		} +		list_add_tail(&be->be_node, &extents); +	} +	if (lgr->range.offset + lgr->range.length != +			lv.start << SECTOR_SHIFT) { +		dprintk("%s Final length mismatch\n", __func__); +		be = NULL; +		goto out_err; +	} +	if (lv.start < lv.cowread) { +		dprintk("%s Final uncovered COW extent\n", __func__); +		be = NULL; +		goto out_err; +	} +	/* Extents decoded properly, now try to merge them in to +	 * existing layout extents. +	 */ +	spin_lock(&bl->bl_ext_lock); +	list_for_each_entry_safe(be, save, &extents, be_node) { +		list_del(&be->be_node); +		status = bl_add_merge_extent(bl, be); +		if (status) { +			spin_unlock(&bl->bl_ext_lock); +			/* This is a fairly catastrophic error, as the +			 * entire layout extent lists are now corrupted. +			 * We should have some way to distinguish this. 
+			 */ +			be = NULL; +			goto out_err; +		} +	} +	spin_unlock(&bl->bl_ext_lock); +	status = 0; + out: +	__free_page(scratch); +	dprintk("%s returns %i\n", __func__, status); +	return status; + + out_err: +	bl_put_extent(be); +	while (!list_empty(&extents)) { +		be = list_first_entry(&extents, struct pnfs_block_extent, +				      be_node); +		list_del(&be->be_node); +		bl_put_extent(be); +	} +	goto out; +} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c new file mode 100644 index 00000000000..8999cfddd86 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -0,0 +1,108 @@ +/* + *  linux/fs/nfs/blocklayout/blocklayoutdm.c + * + *  Module for the NFSv4.1 pNFS block layout driver. + * + *  Copyright (c) 2007 The Regents of the University of Michigan. + *  All rights reserved. + * + *  Fred Isaman <iisaman@umich.edu> + *  Andy Adamson <andros@citi.umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization.  if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose.  
the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/genhd.h> /* gendisk - used in a dprintk*/ +#include <linux/sched.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD + +static void dev_remove(struct net *net, dev_t dev) +{ +	struct bl_pipe_msg bl_pipe_msg; +	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; +	struct bl_dev_msg bl_umount_request; +	struct bl_msg_hdr bl_msg = { +		.type = BL_DEVICE_UMOUNT, +		.totallen = sizeof(bl_umount_request), +	}; +	uint8_t *dataptr; +	DECLARE_WAITQUEUE(wq, current); +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	dprintk("Entering %s\n", __func__); + +	bl_pipe_msg.bl_wq = &nn->bl_wq; +	memset(msg, 0, sizeof(*msg)); +	msg->len = sizeof(bl_msg) + bl_msg.totallen; +	msg->data = kzalloc(msg->len, GFP_NOFS); +	if (!msg->data) +		goto out; + +	memset(&bl_umount_request, 0, sizeof(bl_umount_request)); +	bl_umount_request.major = MAJOR(dev); +	bl_umount_request.minor = MINOR(dev); + +	memcpy(msg->data, &bl_msg, sizeof(bl_msg)); +	dataptr = (uint8_t *) msg->data; +	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); + +	add_wait_queue(&nn->bl_wq, &wq); +	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { +		remove_wait_queue(&nn->bl_wq, &wq); +		goto out; +	} + +	set_current_state(TASK_UNINTERRUPTIBLE); +	schedule(); +	__set_current_state(TASK_RUNNING); +	remove_wait_queue(&nn->bl_wq, &wq); + +out: +	kfree(msg->data); +} + +/* + * Release meta device + */ +static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) +{ +	dprintk("%s Releasing\n", __func__); +	nfs4_blkdev_put(bdev->bm_mdev); +	dev_remove(bdev->net, bdev->bm_mdev->bd_dev); +} + 
+void bl_free_block_dev(struct pnfs_block_dev *bdev)
+{
+	if (bdev) {	/* a NULL bdev is accepted and ignored */
+		if (bdev->bm_mdev) {	/* release the meta device only if one is attached */
+			dprintk("%s Removing DM device: %d:%d\n",
+				__func__,
+				MAJOR(bdev->bm_mdev->bd_dev),
+				MINOR(bdev->bm_mdev->bd_dev));
+			nfs4_blk_metadev_release(bdev);
+		}
+		kfree(bdev);
+	}
+}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c new file mode 100644 index 00000000000..4d016144256 --- /dev/null +++ b/fs/nfs/blocklayout/extents.c @@ -0,0 +1,908 @@
+/*
+ *  linux/fs/nfs/blocklayout/extents.c
+ *
+ *  Module for the NFSv4.1 pNFS block layout driver.
+ *
+ *  Copyright (c) 2006 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@citi.umich.edu>
+ *  Fred Isaman <iisaman@umich.edu>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization.  if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.  
the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include "blocklayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+
+/* Bit numbers used in pnfs_inval_tracking.it_tags */
+#define EXTENT_INITIALIZED 0
+#define EXTENT_WRITTEN     1
+#define EXTENT_IN_COMMIT   2
+#define INTERNAL_EXISTS    MY_MAX_TAGS
+#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
+
+/* Rounds down: returns the largest t <= s such that t % base == 0 */
+static inline sector_t normalize(sector_t s, int base)
+{
+	sector_t tmp = s; /* work on a copy: the division helper modifies its argument */
+	return s - sector_div(tmp, base);
+}
+
+static inline sector_t normalize_up(sector_t s, int base)
+{
+	return normalize(s + base - 1, base); /* round up by biasing, then rounding down */
+}
+
+/* Stub implementation backed by a plain list until the wanted API is
+ * determined.
+ */
+
+/* Returns the tag bits (masked with INTERNAL_MASK) of the entry whose sector
+ * is exactly s, or -ENOENT if there is no such entry.  The list is walked
+ * from the tail and the walk stops at the first entry below s.
+ */
+static int32_t _find_entry(struct my_tree *tree, u64 s)
+{
+	struct pnfs_inval_tracking *pos;
+
+	dprintk("%s(%llu) enter\n", __func__, s);
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector > s)
+			continue;
+		else if (pos->it_sector == s)
+			return pos->it_tags & INTERNAL_MASK;
+		else
+			break;
+	}
+	return -ENOENT;
+}
+
+static inline
+int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
+{
+	int32_t tags;
+
+	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
+	s = normalize(s, tree->mtt_step_size); /* entries are looked up by step-aligned sector */
+	tags = _find_entry(tree, s);
+	if ((tags < 0) || !(tags & (1 << tag))) /* no entry, or entry lacks the tag bit */
+		return 0;
+	else
+		return 1;
+}
+
+/* Creates entry with tag, or if entry already exists, unions tag to it.
+ * If storage is not NULL, newly created entry will use it.
+ * Returns number of entries added, or negative on error.
+ */ +static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, +		      struct pnfs_inval_tracking *storage) +{ +	int found = 0; +	struct pnfs_inval_tracking *pos; + +	dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); +	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { +		if (pos->it_sector > s) +			continue; +		else if (pos->it_sector == s) { +			found = 1; +			break; +		} else +			break; +	} +	if (found) { +		pos->it_tags |= (1 << tag); +		return 0; +	} else { +		struct pnfs_inval_tracking *new; +		new = storage; +		new->it_sector = s; +		new->it_tags = (1 << tag); +		list_add(&new->it_link, &pos->it_link); +		return 1; +	} +} + +/* XXXX Really want option to not create */ +/* Over range, unions tag with existing entries, else creates entry with tag */ +static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) +{ +	u64 i; + +	dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); +	for (i = normalize(s, tree->mtt_step_size); i < s + length; +	     i += tree->mtt_step_size) +		if (_add_entry(tree, i, tag, NULL)) +			return -ENOMEM; +	return 0; +} + +/* Ensure that future operations on given range of tree will not malloc */ +static int _preload_range(struct pnfs_inval_markings *marks, +		u64 offset, u64 length) +{ +	u64 start, end, s; +	int count, i, used = 0, status = -ENOMEM; +	struct pnfs_inval_tracking **storage; +	struct my_tree  *tree = &marks->im_tree; + +	dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); +	start = normalize(offset, tree->mtt_step_size); +	end = normalize_up(offset + length, tree->mtt_step_size); +	count = (int)(end - start) / (int)tree->mtt_step_size; + +	/* Pre-malloc what memory we might need */ +	storage = kcalloc(count, sizeof(*storage), GFP_NOFS); +	if (!storage) +		return -ENOMEM; +	for (i = 0; i < count; i++) { +		storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), +				     GFP_NOFS); +		if (!storage[i]) +			goto out_cleanup; +	} + +	
spin_lock_bh(&marks->im_lock); +	for (s = start; s < end; s += tree->mtt_step_size) +		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); +	spin_unlock_bh(&marks->im_lock); + +	status = 0; + + out_cleanup: +	for (i = used; i < count; i++) { +		if (!storage[i]) +			break; +		kfree(storage[i]); +	} +	kfree(storage); +	return status; +} + +/* We are relying on page lock to serialize this */ +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) +{ +	int rv; + +	spin_lock_bh(&marks->im_lock); +	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); +	spin_unlock_bh(&marks->im_lock); +	return rv; +} + +/* Assume start, end already sector aligned */ +static int +_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) +{ +	struct pnfs_inval_tracking *pos; +	u64 expect = 0; + +	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); +	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { +		if (pos->it_sector >= end) +			continue; +		if (!expect) { +			if ((pos->it_sector == end - tree->mtt_step_size) && +			    (pos->it_tags & (1 << tag))) { +				expect = pos->it_sector - tree->mtt_step_size; +				if (pos->it_sector < tree->mtt_step_size || expect < start) +					return 1; +				continue; +			} else { +				return 0; +			} +		} +		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) +			return 0; +		expect -= tree->mtt_step_size; +		if (expect < start) +			return 1; +	} +	return 0; +} + +static int is_range_written(struct pnfs_inval_markings *marks, +			    sector_t start, sector_t end) +{ +	int rv; + +	spin_lock_bh(&marks->im_lock); +	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); +	spin_unlock_bh(&marks->im_lock); +	return rv; +} + +/* Marks sectors in [offest, offset_length) as having been initialized. + * All lengths are step-aligned, where step is min(pagesize, blocksize). 
+ * Currently assumes offset is page-aligned
+ */
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+			     sector_t offset, sector_t length)
+{
+	sector_t start, end;
+
+	dprintk("%s(offset=%llu,len=%llu) enter\n",
+		__func__, (u64)offset, (u64)length);
+
+	start = normalize(offset, marks->im_block_size);
+	end = normalize_up(offset + length, marks->im_block_size);
+	if (_preload_range(marks, start, end - start)) /* allocate entries before taking the lock */
+		goto outerr;
+
+	spin_lock_bh(&marks->im_lock);
+	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
+		goto out_unlock;
+	spin_unlock_bh(&marks->im_lock);
+
+	return 0;
+
+out_unlock:
+	spin_unlock_bh(&marks->im_lock);
+outerr:
+	return -ENOMEM; /* both failure paths report -ENOMEM */
+}
+
+/* Marks sectors in [offset, offset+length) as having been written to disk.
+ * All lengths should be block aligned.
+ * Returns the status of _set_range: 0 on success, -ENOMEM otherwise.
+ */
+static int mark_written_sectors(struct pnfs_inval_markings *marks,
+				sector_t offset, sector_t length)
+{
+	int status;
+
+	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
+		(u64)offset, (u64)length);
+	spin_lock_bh(&marks->im_lock);
+	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
+	spin_unlock_bh(&marks->im_lock);
+	return status;
+}
+
+static void print_short_extent(struct pnfs_block_short_extent *be)
+{
+	dprintk("PRINT SHORT EXTENT extent %p\n", be);
+	if (be) { /* a NULL extent prints only the pointer line above */
+		dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset);
+		dprintk("        be_length   %llu\n", (u64)be->bse_length);
+	}
+}
+
+static void print_clist(struct list_head *list, unsigned int count)
+{
+	struct pnfs_block_short_extent *be;
+	unsigned int i = 0;
+
+	ifdebug(FACILITY) {
+		printk(KERN_DEBUG "****************\n");
+		printk(KERN_DEBUG "Extent list looks like:\n");
+		list_for_each_entry(be, list, bse_node) {
+			i++;
+			print_short_extent(be);
+		}
+		if (i != count) /* list length disagrees with the caller's count */
+			printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
+		printk(KERN_DEBUG "****************\n");
+	}
+}
+
+/* Note: In theory, we should do more checking that 
devid's match between + * old and new, but if they don't, the lists are too corrupt to salvage anyway. + */ +/* Note this is very similar to bl_add_merge_extent */ +static void add_to_commitlist(struct pnfs_block_layout *bl, +			      struct pnfs_block_short_extent *new) +{ +	struct list_head *clist = &bl->bl_commit; +	struct pnfs_block_short_extent *old, *save; +	sector_t end = new->bse_f_offset + new->bse_length; + +	dprintk("%s enter\n", __func__); +	print_short_extent(new); +	print_clist(clist, bl->bl_count); +	bl->bl_count++; +	/* Scan for proper place to insert, extending new to the left +	 * as much as possible. +	 */ +	list_for_each_entry_safe(old, save, clist, bse_node) { +		if (new->bse_f_offset < old->bse_f_offset) +			break; +		if (end <= old->bse_f_offset + old->bse_length) { +			/* Range is already in list */ +			bl->bl_count--; +			kfree(new); +			return; +		} else if (new->bse_f_offset <= +				old->bse_f_offset + old->bse_length) { +			/* new overlaps or abuts existing be */ +			if (new->bse_mdev == old->bse_mdev) { +				/* extend new to fully replace old */ +				new->bse_length += new->bse_f_offset - +						old->bse_f_offset; +				new->bse_f_offset = old->bse_f_offset; +				list_del(&old->bse_node); +				bl->bl_count--; +				kfree(old); +			} +		} +	} +	/* Note that if we never hit the above break, old will not point to a +	 * valid extent.  However, in that case &old->bse_node==list. +	 */ +	list_add_tail(&new->bse_node, &old->bse_node); +	/* Scan forward for overlaps.  If we find any, extend new and +	 * remove the overlapped extent. 
+	 */ +	old = list_prepare_entry(new, clist, bse_node); +	list_for_each_entry_safe_continue(old, save, clist, bse_node) { +		if (end < old->bse_f_offset) +			break; +		/* new overlaps or abuts old */ +		if (new->bse_mdev == old->bse_mdev) { +			if (end < old->bse_f_offset + old->bse_length) { +				/* extend new to fully cover old */ +				end = old->bse_f_offset + old->bse_length; +				new->bse_length = end - new->bse_f_offset; +			} +			list_del(&old->bse_node); +			bl->bl_count--; +			kfree(old); +		} +	} +	dprintk("%s: after merging\n", __func__); +	print_clist(clist, bl->bl_count); +} + +/* Note the range described by offset, length is guaranteed to be contained + * within be. + * new will be freed, either by this function or add_to_commitlist if they + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. + */ +int bl_mark_for_commit(struct pnfs_block_extent *be, +		    sector_t offset, sector_t length, +		    struct pnfs_block_short_extent *new) +{ +	sector_t new_end, end = offset + length; +	struct pnfs_block_layout *bl = container_of(be->be_inval, +						    struct pnfs_block_layout, +						    bl_inval); + +	mark_written_sectors(be->be_inval, offset, length); +	/* We want to add the range to commit list, but it must be +	 * block-normalized, and verified that the normalized range has +	 * been entirely written to disk. 
+	 */ +	new->bse_f_offset = offset; +	offset = normalize(offset, bl->bl_blocksize); +	if (offset < new->bse_f_offset) { +		if (is_range_written(be->be_inval, offset, new->bse_f_offset)) +			new->bse_f_offset = offset; +		else +			new->bse_f_offset = offset + bl->bl_blocksize; +	} +	new_end = normalize_up(end, bl->bl_blocksize); +	if (end < new_end) { +		if (is_range_written(be->be_inval, end, new_end)) +			end = new_end; +		else +			end = new_end - bl->bl_blocksize; +	} +	if (end <= new->bse_f_offset) { +		kfree(new); +		return 0; +	} +	new->bse_length = end - new->bse_f_offset; +	new->bse_devid = be->be_devid; +	new->bse_mdev = be->be_mdev; + +	spin_lock(&bl->bl_ext_lock); +	add_to_commitlist(bl, new); +	spin_unlock(&bl->bl_ext_lock); +	return 0; +} + +static void print_bl_extent(struct pnfs_block_extent *be) +{ +	dprintk("PRINT EXTENT extent %p\n", be); +	if (be) { +		dprintk("        be_f_offset %llu\n", (u64)be->be_f_offset); +		dprintk("        be_length   %llu\n", (u64)be->be_length); +		dprintk("        be_v_offset %llu\n", (u64)be->be_v_offset); +		dprintk("        be_state    %d\n", be->be_state); +	} +} + +static void +destroy_extent(struct kref *kref) +{ +	struct pnfs_block_extent *be; + +	be = container_of(kref, struct pnfs_block_extent, be_refcnt); +	dprintk("%s be=%p\n", __func__, be); +	kfree(be); +} + +void +bl_put_extent(struct pnfs_block_extent *be) +{ +	if (be) { +		dprintk("%s enter %p (%i)\n", __func__, be, +			atomic_read(&be->be_refcnt.refcount)); +		kref_put(&be->be_refcnt, destroy_extent); +	} +} + +struct pnfs_block_extent *bl_alloc_extent(void) +{ +	struct pnfs_block_extent *be; + +	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); +	if (!be) +		return NULL; +	INIT_LIST_HEAD(&be->be_node); +	kref_init(&be->be_refcnt); +	be->be_inval = NULL; +	return be; +} + +static void print_elist(struct list_head *list) +{ +	struct pnfs_block_extent *be; +	dprintk("****************\n"); +	dprintk("Extent list looks like:\n"); +	
list_for_each_entry(be, list, be_node) { +		print_bl_extent(be); +	} +	dprintk("****************\n"); +} + +static inline int +extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) +{ +	/* Note this assumes new->be_f_offset >= old->be_f_offset */ +	return (new->be_state == old->be_state) && +		((new->be_state == PNFS_BLOCK_NONE_DATA) || +		 ((new->be_v_offset - old->be_v_offset == +		   new->be_f_offset - old->be_f_offset) && +		  new->be_mdev == old->be_mdev)); +} + +/* Adds new to appropriate list in bl, modifying new and removing existing + * extents as appropriate to deal with overlaps. + * + * See bl_find_get_extent for list constraints. + * + * Refcount on new is already set.  If end up not using it, or error out, + * need to put the reference. + * + * bl->bl_ext_lock is held by caller. + */ +int +bl_add_merge_extent(struct pnfs_block_layout *bl, +		     struct pnfs_block_extent *new) +{ +	struct pnfs_block_extent *be, *tmp; +	sector_t end = new->be_f_offset + new->be_length; +	struct list_head *list; + +	dprintk("%s enter with be=%p\n", __func__, new); +	print_bl_extent(new); +	list = &bl->bl_extents[bl_choose_list(new->be_state)]; +	print_elist(list); + +	/* Scan for proper place to insert, extending new to the left +	 * as much as possible. 
+	 */ +	list_for_each_entry_safe_reverse(be, tmp, list, be_node) { +		if (new->be_f_offset >= be->be_f_offset + be->be_length) +			break; +		if (new->be_f_offset >= be->be_f_offset) { +			if (end <= be->be_f_offset + be->be_length) { +				/* new is a subset of existing be*/ +				if (extents_consistent(be, new)) { +					dprintk("%s: new is subset, ignoring\n", +						__func__); +					bl_put_extent(new); +					return 0; +				} else { +					goto out_err; +				} +			} else { +				/* |<--   be   -->| +				 *          |<--   new   -->| */ +				if (extents_consistent(be, new)) { +					/* extend new to fully replace be */ +					new->be_length += new->be_f_offset - +						be->be_f_offset; +					new->be_f_offset = be->be_f_offset; +					new->be_v_offset = be->be_v_offset; +					dprintk("%s: removing %p\n", __func__, be); +					list_del(&be->be_node); +					bl_put_extent(be); +				} else { +					goto out_err; +				} +			} +		} else if (end >= be->be_f_offset + be->be_length) { +			/* new extent overlap existing be */ +			if (extents_consistent(be, new)) { +				/* extend new to fully replace be */ +				dprintk("%s: removing %p\n", __func__, be); +				list_del(&be->be_node); +				bl_put_extent(be); +			} else { +				goto out_err; +			} +		} else if (end > be->be_f_offset) { +			/*           |<--   be   -->| +			 *|<--   new   -->| */ +			if (extents_consistent(new, be)) { +				/* extend new to fully replace be */ +				new->be_length += be->be_f_offset + be->be_length - +					new->be_f_offset - new->be_length; +				dprintk("%s: removing %p\n", __func__, be); +				list_del(&be->be_node); +				bl_put_extent(be); +			} else { +				goto out_err; +			} +		} +	} +	/* Note that if we never hit the above break, be will not point to a +	 * valid extent.  However, in that case &be->be_node==list. 
+	 */ +	list_add(&new->be_node, &be->be_node); +	dprintk("%s: inserting new\n", __func__); +	print_elist(list); +	/* FIXME - The per-list consistency checks have all been done, +	 * should now check cross-list consistency. +	 */ +	return 0; + + out_err: +	bl_put_extent(new); +	return -EIO; +} + +/* Returns extent, or NULL.  If a second READ extent exists, it is returned + * in cow_read, if given. + * + * The extents are kept in two seperate ordered lists, one for READ and NONE, + * one for READWRITE and INVALID.  Within each list, we assume: + * 1. Extents are ordered by file offset. + * 2. For any given isect, there is at most one extents that matches. + */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, +	    struct pnfs_block_extent **cow_read) +{ +	struct pnfs_block_extent *be, *cow, *ret; +	int i; + +	dprintk("%s enter with isect %llu\n", __func__, (u64)isect); +	cow = ret = NULL; +	spin_lock(&bl->bl_ext_lock); +	for (i = 0; i < EXTENT_LISTS; i++) { +		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { +			if (isect >= be->be_f_offset + be->be_length) +				break; +			if (isect >= be->be_f_offset) { +				/* We have found an extent */ +				dprintk("%s Get %p (%i)\n", __func__, be, +					atomic_read(&be->be_refcnt.refcount)); +				kref_get(&be->be_refcnt); +				if (!ret) +					ret = be; +				else if (be->be_state != PNFS_BLOCK_READ_DATA) +					bl_put_extent(be); +				else +					cow = be; +				break; +			} +		} +		if (ret && +		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) +			break; +	} +	spin_unlock(&bl->bl_ext_lock); +	if (cow_read) +		*cow_read = cow; +	print_bl_extent(ret); +	return ret; +} + +/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ +static struct pnfs_block_extent * +bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) +{ +	struct pnfs_block_extent *be, *ret = NULL; +	int i; + +	dprintk("%s enter with isect %llu\n", __func__, 
(u64)isect); +	for (i = 0; i < EXTENT_LISTS; i++) { +		if (ret) +			break; +		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { +			if (isect >= be->be_f_offset + be->be_length) +				break; +			if (isect >= be->be_f_offset) { +				/* We have found an extent */ +				dprintk("%s Get %p (%i)\n", __func__, be, +					atomic_read(&be->be_refcnt.refcount)); +				kref_get(&be->be_refcnt); +				ret = be; +				break; +			} +		} +	} +	print_bl_extent(ret); +	return ret; +} + +int +encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, +			       struct xdr_stream *xdr, +			       const struct nfs4_layoutcommit_args *arg) +{ +	struct pnfs_block_short_extent *lce, *save; +	unsigned int count = 0; +	__be32 *p, *xdr_start; + +	dprintk("%s enter\n", __func__); +	/* BUG - creation of bl_commit is buggy - need to wait for +	 * entire block to be marked WRITTEN before it can be added. +	 */ +	spin_lock(&bl->bl_ext_lock); +	/* Want to adjust for possible truncate */ +	/* We now want to adjust argument range */ + +	/* XDR encode the ranges found */ +	xdr_start = xdr_reserve_space(xdr, 8); +	if (!xdr_start) +		goto out; +	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { +		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); +		if (!p) +			break; +		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); +		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); +		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); +		p = xdr_encode_hyper(p, 0LL); +		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); +		list_move_tail(&lce->bse_node, &bl->bl_committing); +		bl->bl_count--; +		count++; +	} +	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); +	xdr_start[1] = cpu_to_be32(count); +out: +	spin_unlock(&bl->bl_ext_lock); +	dprintk("%s found %i ranges\n", __func__, count); +	return 0; +} + +/* Helper function to set_to_rw that initialize a new extent */ +static void +_prep_new_extent(struct pnfs_block_extent *new, +		
 struct pnfs_block_extent *orig, +		 sector_t offset, sector_t length, int state) +{ +	kref_init(&new->be_refcnt); +	/* don't need to INIT_LIST_HEAD(&new->be_node) */ +	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); +	new->be_mdev = orig->be_mdev; +	new->be_f_offset = offset; +	new->be_length = length; +	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; +	new->be_state = state; +	new->be_inval = orig->be_inval; +} + +/* Tries to merge be with extent in front of it in list. + * Frees storage if not used. + */ +static struct pnfs_block_extent * +_front_merge(struct pnfs_block_extent *be, struct list_head *head, +	     struct pnfs_block_extent *storage) +{ +	struct pnfs_block_extent *prev; + +	if (!storage) +		goto no_merge; +	if (&be->be_node == head || be->be_node.prev == head) +		goto no_merge; +	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); +	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || +	    !extents_consistent(prev, be)) +		goto no_merge; +	_prep_new_extent(storage, prev, prev->be_f_offset, +			 prev->be_length + be->be_length, prev->be_state); +	list_replace(&prev->be_node, &storage->be_node); +	bl_put_extent(prev); +	list_del(&be->be_node); +	bl_put_extent(be); +	return storage; + + no_merge: +	kfree(storage); +	return be; +} + +static u64 +set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) +{ +	u64 rv = offset + length; +	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; +	struct pnfs_block_extent *children[3]; +	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; +	int i = 0, j; + +	dprintk("%s(%llu, %llu)\n", __func__, offset, length); +	/* Create storage for up to three new extents e1, e2, e3 */ +	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); +	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); +	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); +	/* BUG - we are ignoring any failure */ +	if (!e1 || !e2 || !e3) +		goto out_nosplit; + +	spin_lock(&bl->bl_ext_lock); +	be = 
bl_find_get_extent_locked(bl, offset); +	rv = be->be_f_offset + be->be_length; +	if (be->be_state != PNFS_BLOCK_INVALID_DATA) { +		spin_unlock(&bl->bl_ext_lock); +		goto out_nosplit; +	} +	/* Add e* to children, bumping e*'s krefs */ +	if (be->be_f_offset != offset) { +		_prep_new_extent(e1, be, be->be_f_offset, +				 offset - be->be_f_offset, +				 PNFS_BLOCK_INVALID_DATA); +		children[i++] = e1; +		print_bl_extent(e1); +	} else +		merge1 = e1; +	_prep_new_extent(e2, be, offset, +			 min(length, be->be_f_offset + be->be_length - offset), +			 PNFS_BLOCK_READWRITE_DATA); +	children[i++] = e2; +	print_bl_extent(e2); +	if (offset + length < be->be_f_offset + be->be_length) { +		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, +				 be->be_f_offset + be->be_length - +				 offset - length, +				 PNFS_BLOCK_INVALID_DATA); +		children[i++] = e3; +		print_bl_extent(e3); +	} else +		merge2 = e3; + +	/* Remove be from list, and insert the e* */ +	/* We don't get refs on e*, since this list is the base reference +	 * set when init'ed. +	 */ +	if (i < 3) +		children[i] = NULL; +	new = children[0]; +	list_replace(&be->be_node, &new->be_node); +	bl_put_extent(be); +	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); +	for (j = 1; j < i; j++) { +		old = new; +		new = children[j]; +		list_add(&new->be_node, &old->be_node); +	} +	if (merge2) { +		/* This is a HACK, should just create a _back_merge function */ +		new = list_entry(new->be_node.next, +				 struct pnfs_block_extent, be_node); +		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); +	} +	spin_unlock(&bl->bl_ext_lock); + +	/* Since we removed the base reference above, be is now scheduled for +	 * destruction. 
+	 */ +	bl_put_extent(be); +	dprintk("%s returns %llu after split\n", __func__, rv); +	return rv; + + out_nosplit: +	kfree(e1); +	kfree(e2); +	kfree(e3); +	dprintk("%s returns %llu without splitting\n", __func__, rv); +	return rv; +} + +void +clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, +			      const struct nfs4_layoutcommit_args *arg, +			      int status) +{ +	struct pnfs_block_short_extent *lce, *save; + +	dprintk("%s status %d\n", __func__, status); +	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { +		if (likely(!status)) { +			u64 offset = lce->bse_f_offset; +			u64 end = offset + lce->bse_length; + +			do { +				offset = set_to_rw(bl, offset, end - offset); +			} while (offset < end); +			list_del(&lce->bse_node); + +			kfree(lce); +		} else { +			list_del(&lce->bse_node); +			spin_lock(&bl->bl_ext_lock); +			add_to_commitlist(bl, lce); +			spin_unlock(&bl->bl_ext_lock); +		} +	} +} + +int bl_push_one_short_extent(struct pnfs_inval_markings *marks) +{ +	struct pnfs_block_short_extent *new; + +	new = kmalloc(sizeof(*new), GFP_NOFS); +	if (unlikely(!new)) +		return -ENOMEM; + +	spin_lock_bh(&marks->im_lock); +	list_add(&new->bse_node, &marks->im_extents); +	spin_unlock_bh(&marks->im_lock); + +	return 0; +} + +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks) +{ +	struct pnfs_block_short_extent *rv = NULL; + +	spin_lock_bh(&marks->im_lock); +	if (!list_empty(&marks->im_extents)) { +		rv = list_entry((&marks->im_extents)->next, +				struct pnfs_block_short_extent, bse_node); +		list_del_init(&rv->bse_node); +	} +	spin_unlock_bh(&marks->im_lock); + +	return rv; +} + +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) +{ +	struct pnfs_block_short_extent *se = NULL, *tmp; + +	if (num_to_free <= 0) +		return; + +	spin_lock(&marks->im_lock); +	list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { +		list_del(&se->bse_node); +		kfree(se); +		if 
(--num_to_free == 0) +			break; +	} +	spin_unlock(&marks->im_lock); + +	BUG_ON(num_to_free > 0); +} diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 84690319e62..5f7b053720e 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -13,6 +13,7 @@  #include <linux/slab.h>  #include <linux/sunrpc/cache.h>  #include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/net_namespace.h>  #include "cache_lib.h" @@ -111,31 +112,47 @@ int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)  	return 0;  } -int nfs_cache_register(struct cache_detail *cd) +int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)  { -	struct nameidata nd; -	struct vfsmount *mnt;  	int ret; +	struct dentry *dir; -	mnt = rpc_get_mount(); -	if (IS_ERR(mnt)) -		return PTR_ERR(mnt); -	ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); -	if (ret) -		goto err; -	ret = sunrpc_cache_register_pipefs(nd.path.dentry, -			cd->name, 0600, cd); -	path_put(&nd.path); -	if (!ret) -		return ret; -err: -	rpc_put_mount(); +	dir = rpc_d_lookup_sb(sb, "cache"); +	ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); +	dput(dir);  	return ret;  } -void nfs_cache_unregister(struct cache_detail *cd) +int nfs_cache_register_net(struct net *net, struct cache_detail *cd)  { -	sunrpc_cache_unregister_pipefs(cd); -	rpc_put_mount(); +	struct super_block *pipefs_sb; +	int ret = 0; + +	sunrpc_init_cache_detail(cd); +	pipefs_sb = rpc_get_sb_net(net); +	if (pipefs_sb) { +		ret = nfs_cache_register_sb(pipefs_sb, cd); +		rpc_put_sb_net(net); +		if (ret) +			sunrpc_destroy_cache_detail(cd); +	} +	return ret; +} + +void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd) +{ +	if (cd->u.pipefs.dir) +		sunrpc_cache_unregister_pipefs(cd);  } +void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) +{ +	struct super_block *pipefs_sb; + +	pipefs_sb = rpc_get_sb_net(net); +	if (pipefs_sb) { +		nfs_cache_unregister_sb(pipefs_sb, cd); +		rpc_put_sb_net(net); 
+	} +	sunrpc_destroy_cache_detail(cd); +} diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 76f856e284e..4116d2c3f52 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -6,7 +6,7 @@  #include <linux/completion.h>  #include <linux/sunrpc/cache.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  /*   * Deferred request handling @@ -23,5 +23,9 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);  extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);  extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); -extern int nfs_cache_register(struct cache_detail *cd); -extern void nfs_cache_unregister(struct cache_detail *cd); +extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd); +extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd); +extern int nfs_cache_register_sb(struct super_block *sb, +				 struct cache_detail *cd); +extern void nfs_cache_unregister_sb(struct super_block *sb, +				    struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index aeec017fe81..073b4cf67ed 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -9,23 +9,22 @@  #include <linux/completion.h>  #include <linux/ip.h>  #include <linux/module.h> -#include <linux/smp_lock.h>  #include <linux/sunrpc/svc.h>  #include <linux/sunrpc/svcsock.h>  #include <linux/nfs_fs.h> +#include <linux/errno.h>  #include <linux/mutex.h>  #include <linux/freezer.h>  #include <linux/kthread.h>  #include <linux/sunrpc/svcauth_gss.h> -#if defined(CONFIG_NFS_V4_1)  #include <linux/sunrpc/bc_xprt.h> -#endif  #include <net/inet_sock.h>  #include "nfs4_fs.h"  #include "callback.h"  #include "internal.h" +#include "netns.h"  #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -40,31 +39,32 @@ static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];  static DEFINE_MUTEX(nfs_callback_mutex);  static struct svc_program nfs4_callback_program; -unsigned int 
nfs_callback_set_tcpport; -unsigned short nfs_callback_tcpport; -unsigned short nfs_callback_tcpport6; -#define NFS_CALLBACK_MAXPORTNR (65535U) - -static int param_set_portnr(const char *val, const struct kernel_param *kp) +static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)  { -	unsigned long num;  	int ret; +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	ret = svc_create_xprt(serv, "tcp", net, PF_INET, +				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); +	if (ret <= 0) +		goto out_err; +	nn->nfs_callback_tcpport = ret; +	dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", +			nn->nfs_callback_tcpport, PF_INET, net); -	if (!val) -		return -EINVAL; -	ret = strict_strtoul(val, 0, &num); -	if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) -		return -EINVAL; -	*((unsigned int *)kp->arg) = num; +	ret = svc_create_xprt(serv, "tcp", net, PF_INET6, +				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); +	if (ret > 0) { +		nn->nfs_callback_tcpport6 = ret; +		dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", +				nn->nfs_callback_tcpport6, PF_INET6, net); +	} else if (ret != -EAFNOSUPPORT) +		goto out_err;  	return 0; -} -static struct kernel_param_ops param_ops_portnr = { -	.set = param_set_portnr, -	.get = param_get_uint, -}; -#define param_check_portnr(name, p) __param_check(name, p, unsigned int); -module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +out_err: +	return (ret) ? ret : -ENOMEM; +}  /*   * This is the NFSv4 callback kernel thread. 
@@ -72,7 +72,7 @@ module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);  static int  nfs4_callback_svc(void *vrqstp)  { -	int err, preverr = 0; +	int err;  	struct svc_rqst *rqstp = vrqstp;  	set_freezable(); @@ -82,20 +82,8 @@ nfs4_callback_svc(void *vrqstp)  		 * Listen for a request on the socket  		 */  		err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); -		if (err == -EAGAIN || err == -EINTR) { -			preverr = err; -			continue; -		} -		if (err < 0) { -			if (err != preverr) { -				printk(KERN_WARNING "%s: unexpected error " -					"from svc_recv (%d)\n", __func__, err); -				preverr = err; -			} -			schedule_timeout_uninterruptible(HZ); +		if (err == -EAGAIN || err == -EINTR)  			continue; -		} -		preverr = err;  		svc_process(rqstp);  	}  	return 0; @@ -104,39 +92,24 @@ nfs4_callback_svc(void *vrqstp)  /*   * Prepare to bring up the NFSv4 callback service   */ -struct svc_rqst * +static struct svc_rqst *  nfs4_callback_up(struct svc_serv *serv)  { -	int ret; - -	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, -				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); -	if (ret <= 0) -		goto out_err; -	nfs_callback_tcpport = ret; -	dprintk("NFS: Callback listener port = %u (af %u)\n", -			nfs_callback_tcpport, PF_INET); - -	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, -				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); -	if (ret > 0) { -		nfs_callback_tcpport6 = ret; -		dprintk("NFS: Callback listener port = %u (af %u)\n", -				nfs_callback_tcpport6, PF_INET6); -	} else if (ret == -EAFNOSUPPORT) -		ret = 0; -	else -		goto out_err; - -	return svc_prepare_thread(serv, &serv->sv_pools[0]); - -out_err: -	if (ret == 0) -		ret = -ENOMEM; -	return ERR_PTR(ret); +	return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);  }  #if defined(CONFIG_NFS_V4_1) +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) +{ +	/* +	 * Create an svc_sock for the back channel service that shares the +	 * fore channel connection. 
+	 * Returns the input port (0) and sets the svc_serv bc_xprt on success +	 */ +	return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, +			      SVC_SOCK_ANONYMOUS); +} +  /*   * The callback service for NFSv4.1 callbacks   */ @@ -152,6 +125,9 @@ nfs41_callback_svc(void *vrqstp)  	set_freezable();  	while (!kthread_should_stop()) { +		if (try_to_freeze()) +			continue; +  		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);  		spin_lock_bh(&serv->sv_cb_lock);  		if (!list_empty(&serv->sv_cb_list)) { @@ -175,147 +151,257 @@ nfs41_callback_svc(void *vrqstp)  /*   * Bring up the NFSv4.1 callback service   */ -struct svc_rqst * -nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) +static struct svc_rqst * +nfs41_callback_up(struct svc_serv *serv)  { -	struct svc_xprt *bc_xprt; -	struct svc_rqst *rqstp = ERR_PTR(-ENOMEM); - -	dprintk("--> %s\n", __func__); -	/* Create a svc_sock for the service */ -	bc_xprt = svc_sock_create(serv, xprt->prot); -	if (!bc_xprt) -		goto out; - -	/* -	 * Save the svc_serv in the transport so that it can -	 * be referenced when the session backchannel is initialized -	 */ -	serv->bc_xprt = bc_xprt; -	xprt->bc_serv = serv; +	struct svc_rqst *rqstp;  	INIT_LIST_HEAD(&serv->sv_cb_list);  	spin_lock_init(&serv->sv_cb_lock);  	init_waitqueue_head(&serv->sv_cb_waitq); -	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); -	if (IS_ERR(rqstp)) -		svc_sock_destroy(bc_xprt); -out: -	dprintk("--> %s return %p\n", __func__, rqstp); +	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); +	if (IS_ERR(rqstp)) { +		svc_xprt_put(serv->sv_bc_xprt); +		serv->sv_bc_xprt = NULL; +	} +	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));  	return rqstp;  } -static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, -		struct svc_serv *serv, struct rpc_xprt *xprt, +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,  		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) 
 { -	if (minorversion) { -		*rqstpp = nfs41_callback_up(serv, xprt); -		*callback_svc = nfs41_callback_svc; -	} -	return minorversion; +	*rqstpp = nfs41_callback_up(serv); +	*callback_svc = nfs41_callback_svc;  }  static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, -		struct nfs_callback_data *cb_info) +		struct svc_serv *serv)  {  	if (minorversion) -		xprt->bc_serv = cb_info->serv; +		/* +		 * Save the svc_serv in the transport so that it can +		 * be referenced when the session backchannel is initialized +		 */ +		xprt->bc_serv = serv;  }  #else -static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, -		struct svc_serv *serv, struct rpc_xprt *xprt, -		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)  {  	return 0;  } +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, +		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ +	*rqstpp = ERR_PTR(-ENOTSUPP); +	*callback_svc = ERR_PTR(-ENOTSUPP); +} +  static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, -		struct nfs_callback_data *cb_info) +		struct svc_serv *serv)  {  }  #endif /* CONFIG_NFS_V4_1 */ -/* - * Bring up the callback thread if it is not already up. 
- */ -int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, +				  struct svc_serv *serv)  { -	struct svc_serv *serv = NULL;  	struct svc_rqst *rqstp;  	int (*callback_svc)(void *vrqstp);  	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; -	char svc_name[12]; -	int ret = 0; -	int minorversion_setup; +	int ret; -	mutex_lock(&nfs_callback_mutex); -	if (cb_info->users++ || cb_info->task != NULL) { -		nfs_callback_bc_serv(minorversion, xprt, cb_info); -		goto out; -	} -	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); -	if (!serv) { -		ret = -ENOMEM; -		goto out_err; -	} +	nfs_callback_bc_serv(minorversion, xprt, serv); + +	if (cb_info->task) +		return 0; -	minorversion_setup =  nfs_minorversion_callback_svc_setup(minorversion, -					serv, xprt, &rqstp, &callback_svc); -	if (!minorversion_setup) { +	switch (minorversion) { +	case 0:  		/* v4.0 callback setup */  		rqstp = nfs4_callback_up(serv);  		callback_svc = nfs4_callback_svc; +		break; +	default: +		nfs_minorversion_callback_svc_setup(serv, +				&rqstp, &callback_svc);  	} -	if (IS_ERR(rqstp)) { -		ret = PTR_ERR(rqstp); -		goto out_err; -	} +	if (IS_ERR(rqstp)) +		return PTR_ERR(rqstp);  	svc_sock_update_bufs(serv); -	sprintf(svc_name, "nfsv4.%u-svc", minorversion);  	cb_info->serv = serv;  	cb_info->rqst = rqstp; -	cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); +	cb_info->task = kthread_run(callback_svc, cb_info->rqst, +				    "nfsv4.%u-svc", minorversion);  	if (IS_ERR(cb_info->task)) {  		ret = PTR_ERR(cb_info->task);  		svc_exit_thread(cb_info->rqst);  		cb_info->rqst = NULL;  		cb_info->task = NULL; -		goto out_err; +		return ret;  	} -out: +	dprintk("nfs_callback_up: service started\n"); +	return 0; +} + +static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	if 
(--nn->cb_users[minorversion]) +		return; + +	dprintk("NFS: destroy per-net callback data; net=%p\n", net); +	svc_shutdown_net(serv, net); +} + +static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	int ret; + +	if (nn->cb_users[minorversion]++) +		return 0; + +	dprintk("NFS: create per-net callback data; net=%p\n", net); + +	ret = svc_bind(serv, net); +	if (ret < 0) { +		printk(KERN_WARNING "NFS: bind callback service failed\n"); +		goto err_bind; +	} + +	switch (minorversion) { +		case 0: +			ret = nfs4_callback_up_net(serv, net); +			break; +		case 1: +		case 2: +			ret = nfs41_callback_up_net(serv, net); +			break; +		default: +			printk(KERN_ERR "NFS: unknown callback version: %d\n", +					minorversion); +			ret = -EINVAL; +			break; +	} + +	if (ret < 0) { +		printk(KERN_ERR "NFS: callback service start failed\n"); +		goto err_socks; +	} +	return 0; + +err_socks: +	svc_rpcb_cleanup(serv, net); +err_bind: +	dprintk("NFS: Couldn't create callback socket: err = %d; " +			"net = %p\n", ret, net); +	return ret; +} + +static struct svc_serv *nfs_callback_create_svc(int minorversion) +{ +	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; +	struct svc_serv *serv; + +	/* +	 * Check whether we're already up and running. +	 */ +	if (cb_info->task) { +		/* +		 * Note: increase service usage, because later in case of error +		 * svc_destroy() will be called. +		 */ +		svc_get(cb_info->serv); +		return cb_info->serv; +	} + +	/* +	 * Sanity check: if there's no task, +	 * we should be the first user ... 
+	 */ +	if (cb_info->users) +		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", +			cb_info->users); + +	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); +	if (!serv) { +		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); +		return ERR_PTR(-ENOMEM); +	} +	/* As there is only one thread we need to over-ride the +	 * default maximum of 80 connections +	 */ +	serv->sv_maxconn = 1024; +	dprintk("nfs_callback_create_svc: service created\n"); +	return serv; +} + +/* + * Bring up the callback thread if it is not already up. + */ +int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +{ +	struct svc_serv *serv; +	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; +	int ret; +	struct net *net = xprt->xprt_net; + +	mutex_lock(&nfs_callback_mutex); + +	serv = nfs_callback_create_svc(minorversion); +	if (IS_ERR(serv)) { +		ret = PTR_ERR(serv); +		goto err_create; +	} + +	ret = nfs_callback_up_net(minorversion, serv, net); +	if (ret < 0) +		goto err_net; + +	ret = nfs_callback_start_svc(minorversion, xprt, serv); +	if (ret < 0) +		goto err_start; + +	cb_info->users++;  	/*  	 * svc_create creates the svc_serv with sv_nrthreads == 1, and then  	 * svc_prepare_thread increments that. So we need to call svc_destroy  	 * on both success and failure so that the refcount is 1 when the  	 * thread exits.  	 */ -	if (serv) -		svc_destroy(serv); +err_net: +	svc_destroy(serv); +err_create:  	mutex_unlock(&nfs_callback_mutex);  	return ret; -out_err: -	dprintk("NFS: Couldn't create callback socket or server thread; " -		"err = %d\n", ret); -	cb_info->users--; -	goto out; + +err_start: +	nfs_callback_down_net(minorversion, serv, net); +	dprintk("NFS: Couldn't create server thread; err = %d\n", ret); +	goto err_net;  }  /*   * Kill the callback thread if it's no longer being used.   
*/ -void nfs_callback_down(int minorversion) +void nfs_callback_down(int minorversion, struct net *net)  {  	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];  	mutex_lock(&nfs_callback_mutex); +	nfs_callback_down_net(minorversion, cb_info->serv, net);  	cb_info->users--;  	if (cb_info->users == 0 && cb_info->task != NULL) {  		kthread_stop(cb_info->task); +		dprintk("nfs_callback_down: service stopped\n");  		svc_exit_thread(cb_info->rqst); +		dprintk("nfs_callback_down: service destroyed\n");  		cb_info->serv = NULL;  		cb_info->rqst = NULL;  		cb_info->task = NULL; @@ -323,58 +409,57 @@ void nfs_callback_down(int minorversion)  	mutex_unlock(&nfs_callback_mutex);  } -static int check_gss_callback_principal(struct nfs_client *clp, -					struct svc_rqst *rqstp) +/* Boolean check of RPC_AUTH_GSS principal */ +int +check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)  { -	struct rpc_clnt *r = clp->cl_rpcclient; -	char *p = svc_gss_principal(rqstp); +	char *p = rqstp->rq_cred.cr_principal; +	if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) +		return 1; + +	/* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ +	if (clp->cl_minorversion != 0) +		return 0;  	/*  	 * It might just be a normal user principal, in which case  	 * userspace won't bother to tell us the name at all.  	 */  	if (p == NULL) -		return SVC_DENIED; +		return 0;  	/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */  	if (memcmp(p, "nfs@", 4) != 0) -		return SVC_DENIED; +		return 0;  	p += 4; -	if (strcmp(p, r->cl_server) != 0) -		return SVC_DENIED; -	return SVC_OK; +	if (strcmp(p, clp->cl_hostname) != 0) +		return 0; +	return 1;  } +/* + * pg_authenticate method for nfsv4 callback threads. + * + * The authflavor has been negotiated, so an incorrect flavor is a server + * bug. Drop packets with incorrect authflavor. 
+ * + * All other checking done after NFS decoding where the nfs_client can be + * found in nfs4_callback_compound + */  static int nfs_callback_authenticate(struct svc_rqst *rqstp)  { -	struct nfs_client *clp; -	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); -	int ret = SVC_OK; - -	/* Don't talk to strangers */ -	clp = nfs_find_client(svc_addr(rqstp), 4); -	if (clp == NULL) -		return SVC_DROP; - -	dprintk("%s: %s NFSv4 callback!\n", __func__, -			svc_print_addr(rqstp, buf, sizeof(buf))); -  	switch (rqstp->rq_authop->flavour) { -		case RPC_AUTH_NULL: -			if (rqstp->rq_proc != CB_NULL) -				ret = SVC_DENIED; -			break; -		case RPC_AUTH_UNIX: -			break; -		case RPC_AUTH_GSS: -			ret = check_gss_callback_principal(clp, rqstp); -			break; -		default: -			ret = SVC_DENIED; +	case RPC_AUTH_NULL: +		if (rqstp->rq_proc != CB_NULL) +			return SVC_DROP; +		break; +	case RPC_AUTH_GSS: +		/* No RPC_AUTH_GSS support yet in NFSv4.1 */ +		 if (svc_is_backchannel(rqstp)) +			return SVC_DROP;  	} -	nfs_put_client(clp); -	return ret; +	return SVC_OK;  }  /* diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 85a7cfd1b8d..84326e9fb47 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -7,6 +7,7 @@   */  #ifndef __LINUX_FS_NFS_CALLBACK_H  #define __LINUX_FS_NFS_CALLBACK_H +#include <linux/sunrpc/svc.h>  #define NFS4_CALLBACK 0x40000000  #define NFS4_CALLBACK_XDRSIZE 2048 @@ -31,13 +32,24 @@ enum nfs4_callback_opnum {  	OP_CB_WANTS_CANCELLED = 12,  	OP_CB_NOTIFY_LOCK   = 13,  	OP_CB_NOTIFY_DEVICEID = 14, +/* Callback operations new to NFSv4.2 */ +	OP_CB_OFFLOAD = 15,  	OP_CB_ILLEGAL = 10044,  }; +struct cb_process_state { +	__be32			drc_status; +	struct nfs_client	*clp; +	u32			slotid; +	u32			minorversion; +	struct net		*net; +}; +  struct cb_compound_hdr_arg {  	unsigned int taglen;  	const char *tag;  	unsigned int minorversion; +	unsigned int cb_ident; /* v4.0 callback identifier */  	unsigned nops;  }; @@ -103,14 +115,23 @@ struct cb_sequenceres {  	uint32_t			
csr_target_highestslotid;  }; -extern unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, -				       struct cb_sequenceres *res); +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, +				       struct cb_sequenceres *res, +				       struct cb_process_state *cps);  extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,  					     const nfs4_stateid *stateid);  #define RCA4_TYPE_MASK_RDATA_DLG	0  #define RCA4_TYPE_MASK_WDATA_DLG	1 +#define RCA4_TYPE_MASK_DIR_DLG         2 +#define RCA4_TYPE_MASK_FILE_LAYOUT     3 +#define RCA4_TYPE_MASK_BLK_LAYOUT      4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN  8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX  9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 +#define RCA4_TYPE_MASK_ALL 0xf31f  struct cb_recallanyargs {  	struct sockaddr	*craa_addr; @@ -118,25 +139,66 @@ struct cb_recallanyargs {  	uint32_t	craa_type_mask;  }; -extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, +					void *dummy, +					struct cb_process_state *cps);  struct cb_recallslotargs {  	struct sockaddr	*crsa_addr; -	uint32_t	crsa_target_max_slots; +	uint32_t	crsa_target_highest_slotid; +}; +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, +					 void *dummy, +					 struct cb_process_state *cps); + +struct cb_layoutrecallargs { +	struct sockaddr		*cbl_addr; +	uint32_t		cbl_recall_type; +	uint32_t		cbl_layout_type; +	uint32_t		cbl_layoutchanged; +	union { +		struct { +			struct nfs_fh		cbl_fh; +			struct pnfs_layout_range cbl_range; +			nfs4_stateid		cbl_stateid; +		}; +		struct nfs_fsid		cbl_fsid; +	};  }; -extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, -					  void *dummy); -#endif /* CONFIG_NFS_V4_1 */ +extern __be32 nfs4_callback_layoutrecall( +	struct cb_layoutrecallargs *args, +	void *dummy, struct cb_process_state *cps); 
-extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); -extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); +struct cb_devicenotifyitem { +	uint32_t		cbd_notify_type; +	uint32_t		cbd_layout_type; +	struct nfs4_deviceid	cbd_dev_id; +	uint32_t		cbd_immediate; +}; -#ifdef CONFIG_NFS_V4 +struct cb_devicenotifyargs { +	int				 ndevs; +	struct cb_devicenotifyitem	 *devs; +}; + +extern __be32 nfs4_callback_devicenotify( +	struct cb_devicenotifyargs *args, +	void *dummy, struct cb_process_state *cps); + +#endif /* CONFIG_NFS_V4_1 */ +extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, +				    struct cb_getattrres *res, +				    struct cb_process_state *cps); +extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, +				   struct cb_process_state *cps); +#if IS_ENABLED(CONFIG_NFS_V4)  extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); -extern void nfs_callback_down(int minorversion); +extern void nfs_callback_down(int minorversion, struct net *net);  extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,  					    const nfs4_stateid *stateid); +extern int nfs4_set_callback_sessionid(struct nfs_client *clp);  #endif /* CONFIG_NFS_V4 */  /*   * nfs41: Callbacks are expected to not cause substantial latency, @@ -148,6 +210,5 @@ extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,  extern unsigned int nfs_callback_set_tcpport;  extern unsigned short nfs_callback_tcpport; -extern unsigned short nfs_callback_tcpport6;  #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2950fca0c61..41db5258e7a 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -8,34 +8,40 @@  #include <linux/nfs4.h>  #include <linux/nfs_fs.h>  #include <linux/slab.h> +#include <linux/rcupdate.h>  
#include "nfs4_fs.h"  #include "callback.h"  #include "delegation.h"  #include "internal.h" +#include "pnfs.h" +#include "nfs4session.h" +#include "nfs4trace.h"  #ifdef NFS_DEBUG  #define NFSDBG_FACILITY NFSDBG_CALLBACK  #endif -  -__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, +			     struct cb_getattrres *res, +			     struct cb_process_state *cps)  { -	struct nfs_client *clp;  	struct nfs_delegation *delegation;  	struct nfs_inode *nfsi;  	struct inode *inode; +	res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); +	if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ +		goto out; +  	res->bitmap[0] = res->bitmap[1] = 0;  	res->status = htonl(NFS4ERR_BADHANDLE); -	clp = nfs_find_client(args->addr, 4); -	if (clp == NULL) -		goto out; -	dprintk("NFS: GETATTR callback request from %s\n", -		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); +	dprintk_rcu("NFS: GETATTR callback request from %s\n", +		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); -	inode = nfs_delegation_find_inode(clp, &args->fh); +	inode = nfs_delegation_find_inode(cps->clp, &args->fh);  	if (inode == NULL) -		goto out_putclient; +		goto out;  	nfsi = NFS_I(inode);  	rcu_read_lock();  	delegation = rcu_dereference(nfsi->delegation); @@ -55,77 +61,233 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *  out_iput:  	rcu_read_unlock();  	iput(inode); -out_putclient: -	nfs_put_client(clp);  out:  	dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));  	return res->status;  } -__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, +			    struct cb_process_state *cps)  { -	struct nfs_client *clp;  	struct inode *inode;  	__be32 res; +	res = htonl(NFS4ERR_OP_NOT_IN_SESSION); +	if (!cps->clp) /* Always set for v4.0. 
Set in cb_sequence for v4.1 */ +		goto out; + +	dprintk_rcu("NFS: RECALL callback request from %s\n", +		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); +  	res = htonl(NFS4ERR_BADHANDLE); -	clp = nfs_find_client(args->addr, 4); -	if (clp == NULL) +	inode = nfs_delegation_find_inode(cps->clp, &args->fh); +	if (inode == NULL)  		goto out; +	/* Set up a helper thread to actually return the delegation */ +	switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { +	case 0: +		res = 0; +		break; +	case -ENOENT: +		res = htonl(NFS4ERR_BAD_STATEID); +		break; +	default: +		res = htonl(NFS4ERR_RESOURCE); +	} +	trace_nfs4_recall_delegation(inode, -ntohl(res)); +	iput(inode); +out: +	dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); +	return res; +} -	dprintk("NFS: RECALL callback request from %s\n", -		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - -	do { -		struct nfs_client *prev = clp; - -		inode = nfs_delegation_find_inode(clp, &args->fh); -		if (inode != NULL) { -			/* Set up a helper thread to actually return the delegation */ -			switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { -				case 0: -					res = 0; -					break; -				case -ENOENT: -					if (res != 0) -						res = htonl(NFS4ERR_BAD_STATEID); -					break; -				default: -					res = htonl(NFS4ERR_RESOURCE); +#if defined(CONFIG_NFS_V4_1) + +/* + * Lookup a layout by filehandle. + * + * Note: gets a refcount on the layout hdr and on its respective inode. + * Caller must put the layout hdr and the inode. + * + * TODO: keep track of all layouts (and delegations) in a hash table + * hashed by filehandle. 
+ */ +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, +		struct nfs_fh *fh, nfs4_stateid *stateid) +{ +	struct nfs_server *server; +	struct inode *ino; +	struct pnfs_layout_hdr *lo; + +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		list_for_each_entry(lo, &server->layouts, plh_layouts) { +			if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) +				continue; +			if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) +				continue; +			ino = igrab(lo->plh_inode); +			if (!ino) +				break; +			spin_lock(&ino->i_lock); +			/* Is this layout in the process of being freed? */ +			if (NFS_I(ino)->layout != lo) { +				spin_unlock(&ino->i_lock); +				iput(ino); +				break;  			} -			iput(inode); +			pnfs_get_layout_hdr(lo); +			spin_unlock(&ino->i_lock); +			return lo;  		} -		clp = nfs_find_client_next(prev); -		nfs_put_client(prev); -	} while (clp != NULL); +	} + +	return NULL; +} + +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, +		struct nfs_fh *fh, nfs4_stateid *stateid) +{ +	struct pnfs_layout_hdr *lo; + +	spin_lock(&clp->cl_lock); +	rcu_read_lock(); +	lo = get_layout_by_fh_locked(clp, fh, stateid); +	rcu_read_unlock(); +	spin_unlock(&clp->cl_lock); + +	return lo; +} + +static u32 initiate_file_draining(struct nfs_client *clp, +				  struct cb_layoutrecallargs *args) +{ +	struct inode *ino; +	struct pnfs_layout_hdr *lo; +	u32 rv = NFS4ERR_NOMATCHING_LAYOUT; +	LIST_HEAD(free_me_list); + +	lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); +	if (!lo) +		goto out; + +	ino = lo->plh_inode; +	spin_lock(&ino->i_lock); +	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || +	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, +					&args->cbl_range)) +		rv = NFS4ERR_DELAY; +	else +		rv = NFS4ERR_NOMATCHING_LAYOUT; +	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); +	spin_unlock(&ino->i_lock); +	pnfs_free_lseg_list(&free_me_list); +	pnfs_put_layout_hdr(lo); +	
iput(ino);  out: -	dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); +	return rv; +} + +static u32 initiate_bulk_draining(struct nfs_client *clp, +				  struct cb_layoutrecallargs *args) +{ +	int stat; + +	if (args->cbl_recall_type == RETURN_FSID) +		stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); +	else +		stat = pnfs_destroy_layouts_byclid(clp, true); +	if (stat != 0) +		return NFS4ERR_DELAY; +	return NFS4ERR_NOMATCHING_LAYOUT; +} + +static u32 do_callback_layoutrecall(struct nfs_client *clp, +				    struct cb_layoutrecallargs *args) +{ +	u32 res; + +	dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); +	if (args->cbl_recall_type == RETURN_FILE) +		res = initiate_file_draining(clp, args); +	else +		res = initiate_bulk_draining(clp, args); +	dprintk("%s returning %i\n", __func__, res);  	return res; +  } -int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, +				  void *dummy, struct cb_process_state *cps)  { -	if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, -					 sizeof(delegation->stateid.data)) != 0) -		return 0; -	return 1; +	u32 res; + +	dprintk("%s: -->\n", __func__); + +	if (cps->clp) +		res = do_callback_layoutrecall(cps->clp, args); +	else +		res = NFS4ERR_OP_NOT_IN_SESSION; + +	dprintk("%s: exit with status = %d\n", __func__, res); +	return cpu_to_be32(res);  } -#if defined(CONFIG_NFS_V4_1) +static void pnfs_recall_all_layouts(struct nfs_client *clp) +{ +	struct cb_layoutrecallargs args; + +	/* Pretend we got a CB_LAYOUTRECALL(ALL) */ +	memset(&args, 0, sizeof(args)); +	args.cbl_recall_type = RETURN_ALL; +	/* FIXME we ignore errors, what should we do? 
*/ +	do_callback_layoutrecall(clp, &args); +} -int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, +				  void *dummy, struct cb_process_state *cps)  { -	if (delegation == NULL) -		return 0; +	int i; +	__be32 res = 0; +	struct nfs_client *clp = cps->clp; +	struct nfs_server *server = NULL; -	if (stateid->stateid.seqid != 0) -		return 0; -	if (memcmp(&delegation->stateid.stateid.other, -		   &stateid->stateid.other, -		   NFS4_STATEID_OTHER_SIZE)) -		return 0; +	dprintk("%s: -->\n", __func__); -	return 1; +	if (!clp) { +		res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); +		goto out; +	} + +	for (i = 0; i < args->ndevs; i++) { +		struct cb_devicenotifyitem *dev = &args->devs[i]; + +		if (!server || +		    server->pnfs_curr_ld->id != dev->cbd_layout_type) { +			rcu_read_lock(); +			list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) +				if (server->pnfs_curr_ld && +				    server->pnfs_curr_ld->id == dev->cbd_layout_type) { +					rcu_read_unlock(); +					goto found; +				} +			rcu_read_unlock(); +			dprintk("%s: layout type %u not found\n", +				__func__, dev->cbd_layout_type); +			continue; +		} + +	found: +		if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) +			dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " +				"deleting instead\n", __func__); +		nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); +	} + +out: +	kfree(args->devs); +	dprintk("%s: exit with status = %u\n", +		__func__, be32_to_cpu(res)); +	return res;  }  /* @@ -146,24 +308,24 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)  {  	struct nfs4_slot *slot; -	dprintk("%s enter. slotid %d seqid %d\n", +	dprintk("%s enter. 
slotid %u seqid %u\n",  		__func__, args->csa_slotid, args->csa_sequenceid); -	if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) +	if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)  		return htonl(NFS4ERR_BADSLOT);  	slot = tbl->slots + args->csa_slotid; -	dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); +	dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);  	/* Normal */  	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {  		slot->seq_nr++; -		return htonl(NFS4_OK); +		goto out_ok;  	}  	/* Replay */  	if (args->csa_sequenceid == slot->seq_nr) { -		dprintk("%s seqid %d is a replay\n", +		dprintk("%s seqid %u is a replay\n",  			__func__, args->csa_sequenceid);  		/* Signal process_op to set this error on next op */  		if (args->csa_cachethis == 0) @@ -177,47 +339,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)  	/* Wraparound */  	if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {  		slot->seq_nr = 1; -		return htonl(NFS4_OK); +		goto out_ok;  	}  	/* Misordered request */  	return htonl(NFS4ERR_SEQ_MISORDERED); -} - -/* - * Returns a pointer to a held 'struct nfs_client' that matches the server's - * address, major version number, and session ID.  It is the caller's - * responsibility to release the returned reference. - * - * Returns NULL if there are no connections with sessions, or if no session - * matches the one of interest. 
- */ - static struct nfs_client *find_client_with_session( -	const struct sockaddr *addr, u32 nfsversion, -	struct nfs4_sessionid *sessionid) -{ -	struct nfs_client *clp; - -	clp = nfs_find_client(addr, 4); -	if (clp == NULL) -		return NULL; - -	do { -		struct nfs_client *prev = clp; - -		if (clp->cl_session != NULL) { -			if (memcmp(clp->cl_session->sess_id.data, -					sessionid->data, -					NFS4_MAX_SESSIONID_LEN) == 0) { -				/* Returns a held reference to clp */ -				return clp; -			} -		} -		clp = nfs_find_client_next(prev); -		nfs_put_client(prev); -	} while (clp != NULL); - -	return NULL; +out_ok: +	tbl->highest_used_slotid = args->csa_slotid; +	return htonl(NFS4_OK);  }  /* @@ -276,20 +405,40 @@ out:  }  __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, -				struct cb_sequenceres *res) +			      struct cb_sequenceres *res, +			      struct cb_process_state *cps)  { +	struct nfs4_slot_table *tbl;  	struct nfs_client *clp;  	int i; -	__be32 status; +	__be32 status = htonl(NFS4ERR_BADSESSION); -	status = htonl(NFS4ERR_BADSESSION); -	clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); +	clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, +					 &args->csa_sessionid, cps->minorversion);  	if (clp == NULL)  		goto out; +	tbl = &clp->cl_session->bc_slot_table; + +	spin_lock(&tbl->slot_tbl_lock); +	/* state manager is resetting the session */ +	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { +		spin_unlock(&tbl->slot_tbl_lock); +		status = htonl(NFS4ERR_DELAY); +		/* Return NFS4ERR_BADSESSION if we're draining the session +		 * in order to reset it. +		 */ +		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) +			status = htonl(NFS4ERR_BADSESSION); +		goto out; +	} +  	status = validate_seqid(&clp->cl_session->bc_slot_table, args); +	spin_unlock(&tbl->slot_tbl_lock);  	if (status) -		goto out_putclient; +		goto out; + +	cps->slotid = args->csa_slotid;  	/*  	 * Check for pending referring calls.  
If a match is found, a @@ -298,7 +447,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,  	 */  	if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {  		status = htonl(NFS4ERR_DELAY); -		goto out_putclient; +		goto out;  	}  	memcpy(&res->csr_sessionid, &args->csa_sessionid, @@ -308,82 +457,85 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,  	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;  	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; -out_putclient: -	nfs_put_client(clp);  out: +	cps->clp = clp; /* put in nfs4_callback_compound */  	for (i = 0; i < args->csa_nrclists; i++)  		kfree(args->csa_rclists[i].rcl_refcalls);  	kfree(args->csa_rclists); -	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) -		res->csr_status = 0; -	else +	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { +		cps->drc_status = status; +		status = 0; +	} else  		res->csr_status = status; + +	trace_nfs4_cb_sequence(args, res, status);  	dprintk("%s: exit with status = %d res->csr_status %d\n", __func__,  		ntohl(status), ntohl(res->csr_status));  	return status;  } -__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) +static bool +validate_bitmap_values(unsigned long mask) +{ +	return (mask & ~RCA4_TYPE_MASK_ALL) == 0; +} + +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, +			       struct cb_process_state *cps)  { -	struct nfs_client *clp;  	__be32 status;  	fmode_t flags = 0; -	status = htonl(NFS4ERR_OP_NOT_IN_SESSION); -	clp = nfs_find_client(args->craa_addr, 4); -	if (clp == NULL) +	status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); +	if (!cps->clp) /* set in cb_sequence */  		goto out; -	dprintk("NFS: RECALL_ANY callback request from %s\n", -		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); +	dprintk_rcu("NFS: RECALL_ANY callback request from %s\n", +		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + +	status = cpu_to_be32(NFS4ERR_INVAL); +	if 
(!validate_bitmap_values(args->craa_type_mask)) +		goto out; +	status = cpu_to_be32(NFS4_OK);  	if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *)  		     &args->craa_type_mask))  		flags = FMODE_READ;  	if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *)  		     &args->craa_type_mask))  		flags |= FMODE_WRITE; - +	if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) +		     &args->craa_type_mask)) +		pnfs_recall_all_layouts(cps->clp);  	if (flags) -		nfs_expire_all_delegation_types(clp, flags); -	status = htonl(NFS4_OK); +		nfs_expire_unused_delegation_types(cps->clp, flags);  out:  	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));  	return status;  }  /* Reduce the fore channel's max_slots to the target value */ -__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, +				struct cb_process_state *cps)  { -	struct nfs_client *clp;  	struct nfs4_slot_table *fc_tbl;  	__be32 status;  	status = htonl(NFS4ERR_OP_NOT_IN_SESSION); -	clp = nfs_find_client(args->crsa_addr, 4); -	if (clp == NULL) +	if (!cps->clp) /* set in cb_sequence */  		goto out; -	dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", -		rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), -		args->crsa_target_max_slots); - -	fc_tbl = &clp->cl_session->fc_slot_table; +	dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n", +		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), +		args->crsa_target_highest_slotid); -	status = htonl(NFS4ERR_BAD_HIGH_SLOT); -	if (args->crsa_target_max_slots > fc_tbl->max_slots || -	    args->crsa_target_max_slots < 1) -		goto out_putclient; +	fc_tbl = &cps->clp->cl_session->fc_slot_table;  	status = htonl(NFS4_OK); -	if (args->crsa_target_max_slots == fc_tbl->max_slots) -		goto out_putclient; -	fc_tbl->target_max_slots = args->crsa_target_max_slots; -	
nfs41_handle_recall_slot(clp); -out_putclient: -	nfs_put_client(clp);	/* balance nfs_find_client */ +	nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); +	nfs41_server_notify_target_slotid_update(cps->clp);  out:  	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));  	return status; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 05af212f0ed..f4ccfe6521e 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -9,9 +9,14 @@  #include <linux/sunrpc/svc.h>  #include <linux/nfs4.h>  #include <linux/nfs_fs.h> +#include <linux/ratelimit.h> +#include <linux/printk.h>  #include <linux/slab.h> +#include <linux/sunrpc/bc_xprt.h>  #include "nfs4_fs.h"  #include "callback.h" +#include "internal.h" +#include "nfs4session.h"  #define CB_OP_TAGLEN_MAXSZ	(512)  #define CB_OP_HDR_RES_MAXSZ	(2 + CB_OP_TAGLEN_MAXSZ) @@ -22,6 +27,8 @@  #define CB_OP_RECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)  #if defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ) +#define CB_OP_DEVICENOTIFY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)  #define CB_OP_SEQUENCE_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ + \  					4 + 1 + 3)  #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ) @@ -33,7 +40,8 @@  /* Internal error code */  #define NFS4ERR_RESOURCE_HDR	11050 -typedef __be32 (*callback_process_op_t)(void *, void *); +typedef __be32 (*callback_process_op_t)(void *, void *, +					struct cb_process_state *);  typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);  typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); @@ -68,7 +76,7 @@ static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)  	p = xdr_inline_decode(xdr, nbytes);  	if (unlikely(p == NULL)) -		printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); +		printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");  	return p;  } @@ -133,10 +141,10 @@ static __be32 decode_stateid(struct xdr_stream 
*xdr, nfs4_stateid *stateid)  {  	__be32 *p; -	p = read_buf(xdr, 16); +	p = read_buf(xdr, NFS4_STATEID_SIZE);  	if (unlikely(p == NULL))  		return htonl(NFS4ERR_RESOURCE); -	memcpy(stateid->data, p, 16); +	memcpy(stateid, p, NFS4_STATEID_SIZE);  	return 0;  } @@ -150,7 +158,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound  		return status;  	/* We do not like overly long tags! */  	if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { -		printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", +		printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",  				__func__, hdr->taglen);  		return htonl(NFS4ERR_RESOURCE);  	} @@ -158,11 +166,11 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound  	if (unlikely(p == NULL))  		return htonl(NFS4ERR_RESOURCE);  	hdr->minorversion = ntohl(*p++); -	/* Check minor version is zero or one. */ -	if (hdr->minorversion <= 1) { -		p++;	/* skip callback_ident */ +	/* Check for minor version support */ +	if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) { +		hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */  	} else { -		printk(KERN_WARNING "%s: NFSv4 server callback with " +		pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "  			"illegal minor version %u!\n",  			__func__, hdr->minorversion);  		return htonl(NFS4ERR_MINOR_VERS_MISMATCH); @@ -220,6 +228,157 @@ out:  #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, +				       struct xdr_stream *xdr, +				       struct cb_layoutrecallargs *args) +{ +	__be32 *p; +	__be32 status = 0; +	uint32_t iomode; + +	args->cbl_addr = svc_addr(rqstp); +	p = read_buf(xdr, 4 * sizeof(uint32_t)); +	if (unlikely(p == NULL)) { +		status = htonl(NFS4ERR_BADXDR); +		goto out; +	} + +	args->cbl_layout_type = ntohl(*p++); +	/* Depite the spec's xdr, iomode really belongs in the FILE switch, +	 * as it is unusable and ignored with the other types. 
+	 */ +	iomode = ntohl(*p++); +	args->cbl_layoutchanged = ntohl(*p++); +	args->cbl_recall_type = ntohl(*p++); + +	if (args->cbl_recall_type == RETURN_FILE) { +		args->cbl_range.iomode = iomode; +		status = decode_fh(xdr, &args->cbl_fh); +		if (unlikely(status != 0)) +			goto out; + +		p = read_buf(xdr, 2 * sizeof(uint64_t)); +		if (unlikely(p == NULL)) { +			status = htonl(NFS4ERR_BADXDR); +			goto out; +		} +		p = xdr_decode_hyper(p, &args->cbl_range.offset); +		p = xdr_decode_hyper(p, &args->cbl_range.length); +		status = decode_stateid(xdr, &args->cbl_stateid); +		if (unlikely(status != 0)) +			goto out; +	} else if (args->cbl_recall_type == RETURN_FSID) { +		p = read_buf(xdr, 2 * sizeof(uint64_t)); +		if (unlikely(p == NULL)) { +			status = htonl(NFS4ERR_BADXDR); +			goto out; +		} +		p = xdr_decode_hyper(p, &args->cbl_fsid.major); +		p = xdr_decode_hyper(p, &args->cbl_fsid.minor); +	} else if (args->cbl_recall_type != RETURN_ALL) { +		status = htonl(NFS4ERR_BADXDR); +		goto out; +	} +	dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", +		__func__, +		args->cbl_layout_type, iomode, +		args->cbl_layoutchanged, args->cbl_recall_type); +out: +	dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); +	return status; +} + +static +__be32 decode_devicenotify_args(struct svc_rqst *rqstp, +				struct xdr_stream *xdr, +				struct cb_devicenotifyargs *args) +{ +	__be32 *p; +	__be32 status = 0; +	u32 tmp; +	int n, i; +	args->ndevs = 0; + +	/* Num of device notifications */ +	p = read_buf(xdr, sizeof(uint32_t)); +	if (unlikely(p == NULL)) { +		status = htonl(NFS4ERR_BADXDR); +		goto out; +	} +	n = ntohl(*p++); +	if (n <= 0) +		goto out; +	if (n > ULONG_MAX / sizeof(*args->devs)) { +		status = htonl(NFS4ERR_BADXDR); +		goto out; +	} + +	args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); +	if (!args->devs) { +		status = htonl(NFS4ERR_DELAY); +		goto out; +	} + +	/* Decode each dev notification */ +	for (i = 0; i < n; i++) { +		struct 
cb_devicenotifyitem *dev = &args->devs[i]; + +		p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); +		if (unlikely(p == NULL)) { +			status = htonl(NFS4ERR_BADXDR); +			goto err; +		} + +		tmp = ntohl(*p++);	/* bitmap size */ +		if (tmp != 1) { +			status = htonl(NFS4ERR_INVAL); +			goto err; +		} +		dev->cbd_notify_type = ntohl(*p++); +		if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && +		    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { +			status = htonl(NFS4ERR_INVAL); +			goto err; +		} + +		tmp = ntohl(*p++);	/* opaque size */ +		if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && +		     (tmp != NFS4_DEVICEID4_SIZE + 8)) || +		    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && +		     (tmp != NFS4_DEVICEID4_SIZE + 4))) { +			status = htonl(NFS4ERR_INVAL); +			goto err; +		} +		dev->cbd_layout_type = ntohl(*p++); +		memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); +		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + +		if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { +			p = read_buf(xdr, sizeof(uint32_t)); +			if (unlikely(p == NULL)) { +				status = htonl(NFS4ERR_BADXDR); +				goto err; +			} +			dev->cbd_immediate = ntohl(*p++); +		} else { +			dev->cbd_immediate = 0; +		} + +		args->ndevs++; + +		dprintk("%s: type %d layout 0x%x immediate %d\n", +			__func__, dev->cbd_notify_type, dev->cbd_layout_type, +			dev->cbd_immediate); +	} +out: +	dprintk("%s: status %d ndevs %d\n", +		__func__, ntohl(status), args->ndevs); +	return status; +err: +	kfree(args->devs); +	goto out; +} +  static __be32 decode_sessionid(struct xdr_stream *xdr,  				 struct nfs4_sessionid *sid)  { @@ -297,9 +456,9 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,  	args->csa_nrclists = ntohl(*p++);  	args->csa_rclists = NULL;  	if (args->csa_nrclists) { -		args->csa_rclists = kmalloc(args->csa_nrclists * -					    sizeof(*args->csa_rclists), -					    GFP_KERNEL); +		args->csa_rclists = kmalloc_array(args->csa_nrclists, +						  
sizeof(*args->csa_rclists), +						  GFP_KERNEL);  		if (unlikely(args->csa_rclists == NULL))  			goto out; @@ -336,17 +495,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,  				      struct xdr_stream *xdr,  				      struct cb_recallanyargs *args)  { -	__be32 *p; +	uint32_t bitmap[2]; +	__be32 *p, status;  	args->craa_addr = svc_addr(rqstp);  	p = read_buf(xdr, 4);  	if (unlikely(p == NULL))  		return htonl(NFS4ERR_BADXDR);  	args->craa_objs_to_keep = ntohl(*p++); -	p = read_buf(xdr, 4); -	if (unlikely(p == NULL)) -		return htonl(NFS4ERR_BADXDR); -	args->craa_type_mask = ntohl(*p); +	status = decode_bitmap(xdr, bitmap); +	if (unlikely(status)) +		return status; +	args->craa_type_mask = bitmap[0];  	return 0;  } @@ -361,7 +521,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,  	p = read_buf(xdr, 4);  	if (unlikely(p == NULL))  		return htonl(NFS4ERR_BADXDR); -	args->crsa_target_max_slots = ntohl(*p++); +	args->crsa_target_highest_slotid = ntohl(*p++);  	return 0;  } @@ -537,7 +697,7 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,  				       const struct cb_sequenceres *res)  {  	__be32 *p; -	unsigned status = res->csr_status; +	__be32 status = res->csr_status;  	if (unlikely(status != 0))  		goto out; @@ -574,11 +734,11 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)  	case OP_CB_SEQUENCE:  	case OP_CB_RECALL_ANY:  	case OP_CB_RECALL_SLOT: +	case OP_CB_LAYOUTRECALL: +	case OP_CB_NOTIFY_DEVICEID:  		*op = &callback_ops[op_nr];  		break; -	case OP_CB_LAYOUTRECALL: -	case OP_CB_NOTIFY_DEVICEID:  	case OP_CB_NOTIFY:  	case OP_CB_PUSH_DELEG:  	case OP_CB_RECALLABLE_OBJ_AVAIL: @@ -593,6 +753,26 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)  	return htonl(NFS_OK);  } +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ +	struct nfs4_slot_table *tbl = &session->bc_slot_table; + +	spin_lock(&tbl->slot_tbl_lock); +	/* +	 * Let the state manager know 
callback processing done. +	 * A single slot, so highest used slotid is either 0 or -1 +	 */ +	tbl->highest_used_slotid = NFS4_NO_SLOT; +	nfs4_slot_tbl_drain_complete(tbl); +	spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ +	if (cps->slotid != NFS4_NO_SLOT) +		nfs4_callback_free_slot(cps->clp->cl_session); +} +  #else /* CONFIG_NFS_V4_1 */  static __be32 @@ -601,8 +781,31 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)  	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);  } +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ +}  #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ +	__be32 status = preprocess_nfs41_op(nop, op_nr, op); +	if (status != htonl(NFS4ERR_OP_ILLEGAL)) +		return status; + +	if (op_nr == OP_CB_OFFLOAD) +		return htonl(NFS4ERR_NOTSUPP); +	return htonl(NFS4ERR_OP_ILLEGAL); +} +#else /* CONFIG_NFS_V4_2 */ +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ +	return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} +#endif /* CONFIG_NFS_V4_2 */ +  static __be32  preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)  { @@ -618,10 +821,10 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)  	return htonl(NFS_OK);  } -static __be32 process_op(uint32_t minorversion, int nop, -		struct svc_rqst *rqstp, +static __be32 process_op(int nop, struct svc_rqst *rqstp,  		struct xdr_stream *xdr_in, void *argp, -		struct xdr_stream *xdr_out, void *resp, int* drc_status) +		struct xdr_stream *xdr_out, void *resp, +		struct cb_process_state *cps)  {  	struct callback_op *op = &callback_ops[0];  	unsigned int op_nr; @@ -635,17 +838,29 @@ static __be32 process_op(uint32_t minorversion, int nop,  		return status;  	dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", -		__func__, minorversion, nop, op_nr); +		__func__, cps->minorversion, 
nop, op_nr); + +	switch (cps->minorversion) { +	case 0: +		status = preprocess_nfs4_op(op_nr, &op); +		break; +	case 1: +		status = preprocess_nfs41_op(nop, op_nr, &op); +		break; +	case 2: +		status = preprocess_nfs42_op(nop, op_nr, &op); +		break; +	default: +		status = htonl(NFS4ERR_MINOR_VERS_MISMATCH); +	} -	status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) : -				preprocess_nfs4_op(op_nr, &op);  	if (status == htonl(NFS4ERR_OP_ILLEGAL))  		op_nr = OP_CB_ILLEGAL;  	if (status)  		goto encode_hdr; -	if (*drc_status) { -		status = *drc_status; +	if (cps->drc_status) { +		status = cps->drc_status;  		goto encode_hdr;  	} @@ -653,16 +868,10 @@ static __be32 process_op(uint32_t minorversion, int nop,  	if (maxlen > 0 && maxlen < PAGE_SIZE) {  		status = op->decode_args(rqstp, xdr_in, argp);  		if (likely(status == 0)) -			status = op->process_op(argp, resp); +			status = op->process_op(argp, resp, cps);  	} else  		status = htonl(NFS4ERR_RESOURCE); -	/* Only set by OP_CB_SEQUENCE processing */ -	if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { -		*drc_status = status; -		status = 0; -	} -  encode_hdr:  	res = encode_op_hdr(xdr_out, op_nr, status);  	if (unlikely(res)) @@ -681,8 +890,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r  	struct cb_compound_hdr_arg hdr_arg = { 0 };  	struct cb_compound_hdr_res hdr_res = { NULL };  	struct xdr_stream xdr_in, xdr_out; -	__be32 *p; -	__be32 status, drc_status = 0; +	__be32 *p, status; +	struct cb_process_state cps = { +		.drc_status = 0, +		.clp = NULL, +		.slotid = NFS4_NO_SLOT, +		.net = SVC_NET(rqstp), +	};  	unsigned int nops = 0;  	dprintk("%s: start\n", __func__); @@ -696,14 +910,21 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r  	if (status == __constant_htonl(NFS4ERR_RESOURCE))  		return rpc_garbage_args; +	if (hdr_arg.minorversion == 0) { +		cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); +		if (!cps.clp || 
!check_gss_callback_principal(cps.clp, rqstp)) +			return rpc_drop_reply; +	} + +	cps.minorversion = hdr_arg.minorversion;  	hdr_res.taglen = hdr_arg.taglen;  	hdr_res.tag = hdr_arg.tag;  	if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)  		return rpc_system_err;  	while (status == 0 && nops != hdr_arg.nops) { -		status = process_op(hdr_arg.minorversion, nops, rqstp, -				    &xdr_in, argp, &xdr_out, resp, &drc_status); +		status = process_op(nops, rqstp, &xdr_in, +				    argp, &xdr_out, resp, &cps);  		nops++;  	} @@ -716,6 +937,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r  	*hdr_res.status = status;  	*hdr_res.nops = htonl(nops); +	nfs4_cb_free_slot(&cps); +	nfs_put_client(cps.clp);  	dprintk("%s: done, status = %u\n", __func__, ntohl(status));  	return rpc_success;  } @@ -739,6 +962,18 @@ static struct callback_op callback_ops[] = {  		.res_maxsize = CB_OP_RECALL_RES_MAXSZ,  	},  #if defined(CONFIG_NFS_V4_1) +	[OP_CB_LAYOUTRECALL] = { +		.process_op = (callback_process_op_t)nfs4_callback_layoutrecall, +		.decode_args = +			(callback_decode_arg_t)decode_layoutrecall_args, +		.res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, +	}, +	[OP_CB_NOTIFY_DEVICEID] = { +		.process_op = (callback_process_op_t)nfs4_callback_devicenotify, +		.decode_args = +			(callback_decode_arg_t)decode_devicenotify_args, +		.res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, +	},  	[OP_CB_SEQUENCE] = {  		.process_op = (callback_process_op_t)nfs4_callback_sequence,  		.decode_args = (callback_decode_arg_t)decode_cb_sequence_args, @@ -792,4 +1027,5 @@ struct svc_version nfs4_callback_version4 = {  	.vs_proc = nfs4_callback_procedures1,  	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,  	.vs_dispatch = NULL, +	.vs_hidden = 1,  }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0870d0d4efc..1d09289c8f0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -36,11 +36,13 @@  #include <linux/inet.h>  #include <linux/in6.h>  #include <linux/slab.h> +#include 
<linux/idr.h>  #include <net/ipv6.h>  #include <linux/nfs_xdr.h>  #include <linux/sunrpc/bc_xprt.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> -#include <asm/system.h>  #include "nfs4_fs.h"  #include "callback.h" @@ -49,64 +51,96 @@  #include "internal.h"  #include "fscache.h"  #include "pnfs.h" +#include "nfs.h" +#include "netns.h"  #define NFSDBG_FACILITY		NFSDBG_CLIENT -static DEFINE_SPINLOCK(nfs_client_lock); -static LIST_HEAD(nfs_client_list); -static LIST_HEAD(nfs_volume_list);  static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +static DEFINE_SPINLOCK(nfs_version_lock); +static DEFINE_MUTEX(nfs_version_mutex); +static LIST_HEAD(nfs_versions);  /*   * RPC cruft for NFS   */ -static struct rpc_version *nfs_version[5] = { -	[2]			= &nfs_version2, -#ifdef CONFIG_NFS_V3 -	[3]			= &nfs_version3, -#endif -#ifdef CONFIG_NFS_V4 -	[4]			= &nfs_version4, -#endif +static const struct rpc_version *nfs_version[5] = { +	[2] = NULL, +	[3] = NULL, +	[4] = NULL,  }; -struct rpc_program nfs_program = { +const struct rpc_program nfs_program = {  	.name			= "nfs",  	.number			= NFS_PROGRAM,  	.nrvers			= ARRAY_SIZE(nfs_version),  	.version		= nfs_version,  	.stats			= &nfs_rpcstat, -	.pipe_dir_name		= "/nfs", +	.pipe_dir_name		= NFS_PIPE_DIRNAME,  };  struct rpc_stat nfs_rpcstat = {  	.program		= &nfs_program  }; +static struct nfs_subversion *find_nfs_version(unsigned int version) +{ +	struct nfs_subversion *nfs; +	spin_lock(&nfs_version_lock); -#ifdef CONFIG_NFS_V3_ACL -static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program }; -static struct rpc_version *	nfsacl_version[] = { -	[3]			= &nfsacl_version3, -}; +	list_for_each_entry(nfs, &nfs_versions, list) { +		if (nfs->rpc_ops->version == version) { +			spin_unlock(&nfs_version_lock); +			return nfs; +		} +	} -struct rpc_program		nfsacl_program = { -	.name			= "nfsacl", -	.number			= NFS_ACL_PROGRAM, -	.nrvers			= ARRAY_SIZE(nfsacl_version), -	.version		= nfsacl_version, -	.stats			= &nfsacl_rpcstat, 
-}; -#endif  /* CONFIG_NFS_V3_ACL */ - -struct nfs_client_initdata { -	const char *hostname; -	const struct sockaddr *addr; -	size_t addrlen; -	const struct nfs_rpc_ops *rpc_ops; -	int proto; -	u32 minorversion; -}; +	spin_unlock(&nfs_version_lock); +	return ERR_PTR(-EPROTONOSUPPORT); +} + +struct nfs_subversion *get_nfs_version(unsigned int version) +{ +	struct nfs_subversion *nfs = find_nfs_version(version); + +	if (IS_ERR(nfs)) { +		mutex_lock(&nfs_version_mutex); +		request_module("nfsv%d", version); +		nfs = find_nfs_version(version); +		mutex_unlock(&nfs_version_mutex); +	} + +	if (!IS_ERR(nfs)) +		try_module_get(nfs->owner); +	return nfs; +} + +void put_nfs_version(struct nfs_subversion *nfs) +{ +	module_put(nfs->owner); +} + +void register_nfs_version(struct nfs_subversion *nfs) +{ +	spin_lock(&nfs_version_lock); + +	list_add(&nfs->list, &nfs_versions); +	nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; + +	spin_unlock(&nfs_version_lock); +} +EXPORT_SYMBOL_GPL(register_nfs_version); + +void unregister_nfs_version(struct nfs_subversion *nfs) +{ +	spin_lock(&nfs_version_lock); + +	nfs_version[nfs->rpc_ops->version] = NULL; +	list_del(&nfs->list); + +	spin_unlock(&nfs_version_lock); +} +EXPORT_SYMBOL_GPL(unregister_nfs_version);  /*   * Allocate a shared client record @@ -114,7 +148,7 @@ struct nfs_client_initdata {   * Since these are allocated/deallocated very rarely, we don't   * bother putting them in a slab cache...   
*/ -static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) +struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)  {  	struct nfs_client *clp;  	struct rpc_cred *cred; @@ -123,7 +157,10 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_  	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)  		goto error_0; -	clp->rpc_ops = cl_init->rpc_ops; +	clp->cl_nfs_mod = cl_init->nfs_mod; +	try_module_get(clp->cl_nfs_mod->owner); + +	clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;  	atomic_set(&clp->cl_count, 1);  	clp->cl_cons_state = NFS_CS_INITING; @@ -142,85 +179,67 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_  	clp->cl_rpcclient = ERR_PTR(-EINVAL);  	clp->cl_proto = cl_init->proto; +	clp->cl_net = get_net(cl_init->net); -#ifdef CONFIG_NFS_V4 -	INIT_LIST_HEAD(&clp->cl_delegations); -	spin_lock_init(&clp->cl_lock); -	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); -	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); -	clp->cl_boot_time = CURRENT_TIME; -	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; -	clp->cl_minorversion = cl_init->minorversion; -	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; -#endif -	cred = rpc_lookup_machine_cred(); +	cred = rpc_lookup_machine_cred("*");  	if (!IS_ERR(cred))  		clp->cl_machine_cred = cred; -#if defined(CONFIG_NFS_V4_1) -	INIT_LIST_HEAD(&clp->cl_layouts); -#endif  	nfs_fscache_get_client_cookie(clp);  	return clp;  error_cleanup: +	put_nfs_version(clp->cl_nfs_mod);  	kfree(clp);  error_0:  	return ERR_PTR(err);  } +EXPORT_SYMBOL_GPL(nfs_alloc_client); -#ifdef CONFIG_NFS_V4 -/* - * Clears/puts all minor version specific parts from an nfs_client struct - * reverting it to minorversion 0. 
- */ -static void nfs4_clear_client_minor_version(struct nfs_client *clp) +#if IS_ENABLED(CONFIG_NFS_V4) +void nfs_cleanup_cb_ident_idr(struct net *net)  { -#ifdef CONFIG_NFS_V4_1 -	if (nfs4_has_session(clp)) { -		nfs4_destroy_session(clp->cl_session); -		clp->cl_session = NULL; -	} +	struct nfs_net *nn = net_generic(net, nfs_net_id); -	clp->cl_mvops = nfs_v4_minor_ops[0]; -#endif /* CONFIG_NFS_V4_1 */ +	idr_destroy(&nn->cb_ident_idr);  } -/* - * Destroy the NFS4 callback service - */ -static void nfs4_destroy_callback(struct nfs_client *clp) +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp)  { -	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) -		nfs_callback_down(clp->cl_mvops->minor_version); +	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + +	if (clp->cl_cb_ident) +		idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);  } -static void nfs4_shutdown_client(struct nfs_client *clp) +static void pnfs_init_server(struct nfs_server *server)  { -	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) -		nfs4_kill_renewd(clp); -	nfs4_clear_client_minor_version(clp); -	nfs4_destroy_callback(clp); -	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) -		nfs_idmap_delete(clp); - -	rpc_destroy_wait_queue(&clp->cl_rpcwaitq); +	rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");  } +  #else -static void nfs4_shutdown_client(struct nfs_client *clp) +void nfs_cleanup_cb_ident_idr(struct net *net) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ +} + +static void pnfs_init_server(struct nfs_server *server)  {  } +  #endif /* CONFIG_NFS_V4 */  /*   * Destroy a shared client record   */ -static void nfs_free_client(struct nfs_client *clp) +void nfs_free_client(struct nfs_client *clp)  {  	dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); -	nfs4_shutdown_client(clp); -  	nfs_fscache_release_client_cookie(clp);  	/* -EIO all pending I/O */ @@ -230,29 +249,36 @@ static 
void nfs_free_client(struct nfs_client *clp)  	if (clp->cl_machine_cred != NULL)  		put_rpccred(clp->cl_machine_cred); +	put_net(clp->cl_net); +	put_nfs_version(clp->cl_nfs_mod);  	kfree(clp->cl_hostname);  	kfree(clp);  	dprintk("<-- nfs_free_client()\n");  } +EXPORT_SYMBOL_GPL(nfs_free_client);  /*   * Release a reference to a shared client record   */  void nfs_put_client(struct nfs_client *clp)  { +	struct nfs_net *nn; +  	if (!clp)  		return;  	dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); +	nn = net_generic(clp->cl_net, nfs_net_id); -	if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { +	if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {  		list_del(&clp->cl_share_link); -		spin_unlock(&nfs_client_lock); +		nfs_cb_idr_remove_locked(clp); +		spin_unlock(&nn->nfs_client_lock); -		BUG_ON(!list_empty(&clp->cl_superblocks)); +		WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); -		nfs_free_client(clp); +		clp->rpc_ops->free_client(clp);  	}  }  EXPORT_SYMBOL_GPL(nfs_put_client); @@ -275,11 +301,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,  	const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;  	const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; -	if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && -	    sin1->sin6_scope_id != sin2->sin6_scope_id) +	if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))  		return 0; +	else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) +		return sin1->sin6_scope_id == sin2->sin6_scope_id; -	return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); +	return 1;  }  #else	/* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */  static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, @@ -325,12 +352,13 @@ static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,  		(sin1->sin_port == sin2->sin_port);  } +#if defined(CONFIG_NFS_V4_1)  /*   * Test if two socket addresses represent the 
same actual socket,   * by comparing (only) relevant fields, excluding the port number.   */ -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, -				     const struct sockaddr *sa2) +int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, +			      const struct sockaddr *sa2)  {  	if (sa1->sa_family != sa2->sa_family)  		return 0; @@ -343,6 +371,8 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,  	}  	return 0;  } +EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr); +#endif /* CONFIG_NFS_V4_1 */  /*   * Test if two socket addresses represent the same actual socket, @@ -364,72 +394,6 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,  }  /* - * Find a client by IP address and protocol version - * - returns NULL if no such client - */ -struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) -{ -	struct nfs_client *clp; - -	spin_lock(&nfs_client_lock); -	list_for_each_entry(clp, &nfs_client_list, cl_share_link) { -		struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - -		/* Don't match clients that failed to initialise properly */ -		if (!(clp->cl_cons_state == NFS_CS_READY || -		      clp->cl_cons_state == NFS_CS_SESSION_INITING)) -			continue; - -		/* Different NFS versions cannot share the same nfs_client */ -		if (clp->rpc_ops->version != nfsversion) -			continue; - -		/* Match only the IP address, not the port number */ -		if (!nfs_sockaddr_match_ipaddr(addr, clap)) -			continue; - -		atomic_inc(&clp->cl_count); -		spin_unlock(&nfs_client_lock); -		return clp; -	} -	spin_unlock(&nfs_client_lock); -	return NULL; -} - -/* - * Find a client by IP address and protocol version - * - returns NULL if no such client - */ -struct nfs_client *nfs_find_client_next(struct nfs_client *clp) -{ -	struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; -	u32 nfsvers = clp->rpc_ops->version; - -	spin_lock(&nfs_client_lock); -	list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { -		struct 
sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - -		/* Don't match clients that failed to initialise properly */ -		if (clp->cl_cons_state != NFS_CS_READY) -			continue; - -		/* Different NFS versions cannot share the same nfs_client */ -		if (clp->rpc_ops->version != nfsvers) -			continue; - -		/* Match only the IP address, not the port number */ -		if (!nfs_sockaddr_match_ipaddr(sap, clap)) -			continue; - -		atomic_inc(&clp->cl_count); -		spin_unlock(&nfs_client_lock); -		return clp; -	} -	spin_unlock(&nfs_client_lock); -	return NULL; -} - -/*   * Find an nfs_client on the list that matches the initialisation data   * that is supplied.   */ @@ -437,15 +401,16 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat  {  	struct nfs_client *clp;  	const struct sockaddr *sap = data->addr; +	struct nfs_net *nn = net_generic(data->net, nfs_net_id); -	list_for_each_entry(clp, &nfs_client_list, cl_share_link) { +	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {  	        const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;  		/* Don't match clients that failed to initialise properly */  		if (clp->cl_cons_state < 0)  			continue;  		/* Different NFS versions cannot share the same nfs_client */ -		if (clp->rpc_ops != data->rpc_ops) +		if (clp->rpc_ops != data->nfs_mod->rpc_ops)  			continue;  		if (clp->cl_proto != data->proto) @@ -463,55 +428,28 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat  	return NULL;  } +static bool nfs_client_init_is_complete(const struct nfs_client *clp) +{ +	return clp->cl_cons_state != NFS_CS_INITING; +} + +int nfs_wait_client_init_complete(const struct nfs_client *clp) +{ +	return wait_event_killable(nfs_client_active_wq, +			nfs_client_init_is_complete(clp)); +} +EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete); +  /* - * Look up a client by IP address and protocol version - * - creates a new record if one doesn't yet exist + * Found an existing 
client.  Make sure it's ready before returning.   */ -static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) +static struct nfs_client * +nfs_found_client(const struct nfs_client_initdata *cl_init, +		 struct nfs_client *clp)  { -	struct nfs_client *clp, *new = NULL;  	int error; -	dprintk("--> nfs_get_client(%s,v%u)\n", -		cl_init->hostname ?: "", cl_init->rpc_ops->version); - -	/* see if the client already exists */ -	do { -		spin_lock(&nfs_client_lock); - -		clp = nfs_match_client(cl_init); -		if (clp) -			goto found_client; -		if (new) -			goto install_client; - -		spin_unlock(&nfs_client_lock); - -		new = nfs_alloc_client(cl_init); -	} while (!IS_ERR(new)); - -	dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new)); -	return new; - -	/* install a new client and return with it unready */ -install_client: -	clp = new; -	list_add(&clp->cl_share_link, &nfs_client_list); -	spin_unlock(&nfs_client_lock); -	dprintk("--> nfs_get_client() = %p [new]\n", clp); -	return clp; - -	/* found an existing client -	 * - make sure it's ready before returning -	 */ -found_client: -	spin_unlock(&nfs_client_lock); - -	if (new) -		nfs_free_client(new); - -	error = wait_event_killable(nfs_client_active_wq, -				clp->cl_cons_state < NFS_CS_INITING); +	error = nfs_wait_client_init_complete(clp);  	if (error < 0) {  		nfs_put_client(clp);  		return ERR_PTR(-ERESTARTSYS); @@ -523,41 +461,75 @@ found_client:  		return ERR_PTR(error);  	} -	BUG_ON(clp->cl_cons_state != NFS_CS_READY); +	smp_rmb(); -	dprintk("--> nfs_get_client() = %p [share]\n", clp); +	dprintk("<-- %s found nfs_client %p for %s\n", +		__func__, clp, cl_init->hostname ?: "");  	return clp;  }  /* - * Mark a server as ready or failed + * Look up a client by IP address and protocol version + * - creates a new record if one doesn't yet exist   */ -void nfs_mark_client_ready(struct nfs_client *clp, int state) +struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, +	  
     const struct rpc_timeout *timeparms, +	       const char *ip_addr, +	       rpc_authflavor_t authflavour)  { -	clp->cl_cons_state = state; -	wake_up_all(&nfs_client_active_wq); +	struct nfs_client *clp, *new = NULL; +	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); +	const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; + +	dprintk("--> nfs_get_client(%s,v%u)\n", +		cl_init->hostname ?: "", rpc_ops->version); + +	/* see if the client already exists */ +	do { +		spin_lock(&nn->nfs_client_lock); + +		clp = nfs_match_client(cl_init); +		if (clp) { +			spin_unlock(&nn->nfs_client_lock); +			if (new) +				new->rpc_ops->free_client(new); +			return nfs_found_client(cl_init, clp); +		} +		if (new) { +			list_add_tail(&new->cl_share_link, +					&nn->nfs_client_list); +			spin_unlock(&nn->nfs_client_lock); +			new->cl_flags = cl_init->init_flags; +			return rpc_ops->init_client(new, timeparms, ip_addr); +		} + +		spin_unlock(&nn->nfs_client_lock); + +		new = rpc_ops->alloc_client(cl_init); +	} while (!IS_ERR(new)); + +	dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", +		cl_init->hostname ?: "", PTR_ERR(new)); +	return new;  } +EXPORT_SYMBOL_GPL(nfs_get_client);  /* - * With sessions, the client is not marked ready until after a - * successful EXCHANGE_ID and CREATE_SESSION. - * - * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate - * other versions of NFS can be tried. 
+ * Mark a server as ready or failed   */ -int nfs4_check_client_ready(struct nfs_client *clp) +void nfs_mark_client_ready(struct nfs_client *clp, int state)  { -	if (!nfs4_has_session(clp)) -		return 0; -	if (clp->cl_cons_state < NFS_CS_READY) -		return -EPROTONOSUPPORT; -	return 0; +	smp_wmb(); +	clp->cl_cons_state = state; +	wake_up_all(&nfs_client_active_wq);  } +EXPORT_SYMBOL_GPL(nfs_mark_client_ready);  /*   * Initialise the timeout values for a connection   */ -static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, +void nfs_init_timeout_values(struct rpc_timeout *to, int proto,  				    unsigned int timeo, unsigned int retrans)  {  	to->to_initval = timeo * HZ / 10; @@ -594,18 +566,18 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,  		BUG();  	}  } +EXPORT_SYMBOL_GPL(nfs_init_timeout_values);  /*   * Create an RPC client handle   */ -static int nfs_create_rpc_client(struct nfs_client *clp, -				 const struct rpc_timeout *timeparms, -				 rpc_authflavor_t flavor, -				 int discrtry, int noresvport) +int nfs_create_rpc_client(struct nfs_client *clp, +			  const struct rpc_timeout *timeparms, +			  rpc_authflavor_t flavor)  {  	struct rpc_clnt		*clnt = NULL;  	struct rpc_create_args args = { -		.net		= &init_net, +		.net		= clp->cl_net,  		.protocol	= clp->cl_proto,  		.address	= (struct sockaddr *)&clp->cl_addr,  		.addrsize	= clp->cl_addrlen, @@ -616,10 +588,14 @@ static int nfs_create_rpc_client(struct nfs_client *clp,  		.authflavor	= flavor,  	}; -	if (discrtry) +	if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))  		args.flags |= RPC_CLNT_CREATE_DISCRTRY; -	if (noresvport) +	if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) +		args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; +	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))  		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; +	if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) +		args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS;  	if (!IS_ERR(clp->cl_rpcclient))  		
return 0; @@ -634,14 +610,14 @@ static int nfs_create_rpc_client(struct nfs_client *clp,  	clp->cl_rpcclient = clnt;  	return 0;  } +EXPORT_SYMBOL_GPL(nfs_create_rpc_client);  /*   * Version 2 or 3 client destruction   */  static void nfs_destroy_server(struct nfs_server *server)  { -	if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || -			!(server->flags & NFS_MOUNT_LOCAL_FCNTL)) +	if (server->nlm_host)  		nlmclnt_done(server->nlm_host);  } @@ -659,6 +635,7 @@ static int nfs_start_lockd(struct nfs_server *server)  		.nfs_version	= clp->rpc_ops->version,  		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?  					1 : 0, +		.net		= clp->cl_net,  	};  	if (nlm_init.nfs_version > 3) @@ -685,45 +662,16 @@ static int nfs_start_lockd(struct nfs_server *server)  }  /* - * Initialise an NFSv3 ACL client connection - */ -#ifdef CONFIG_NFS_V3_ACL -static void nfs_init_server_aclclient(struct nfs_server *server) -{ -	if (server->nfs_client->rpc_ops->version != 3) -		goto out_noacl; -	if (server->flags & NFS_MOUNT_NOACL) -		goto out_noacl; - -	server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); -	if (IS_ERR(server->client_acl)) -		goto out_noacl; - -	/* No errors! 
Assume that Sun nfsacls are supported */ -	server->caps |= NFS_CAP_ACLS; -	return; - -out_noacl: -	server->caps &= ~NFS_CAP_ACLS; -} -#else -static inline void nfs_init_server_aclclient(struct nfs_server *server) -{ -	server->flags &= ~NFS_MOUNT_NOACL; -	server->caps &= ~NFS_CAP_ACLS; -} -#endif - -/*   * Create a general RPC client   */ -static int nfs_init_server_rpcclient(struct nfs_server *server, +int nfs_init_server_rpcclient(struct nfs_server *server,  		const struct rpc_timeout *timeo,  		rpc_authflavor_t pseudoflavour)  {  	struct nfs_client *clp = server->nfs_client; -	server->client = rpc_clone_client(clp->cl_rpcclient); +	server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, +							pseudoflavour);  	if (IS_ERR(server->client)) {  		dprintk("%s: couldn't create rpc_client!\n", __func__);  		return PTR_ERR(server->client); @@ -733,67 +681,67 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,  			timeo,  			sizeof(server->client->cl_timeout_default));  	server->client->cl_timeout = &server->client->cl_timeout_default; - -	if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { -		struct rpc_auth *auth; - -		auth = rpcauth_create(pseudoflavour, server->client); -		if (IS_ERR(auth)) { -			dprintk("%s: couldn't create credcache!\n", __func__); -			return PTR_ERR(auth); -		} -	}  	server->client->cl_softrtry = 0;  	if (server->flags & NFS_MOUNT_SOFT)  		server->client->cl_softrtry = 1;  	return 0;  } +EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); -/* - * Initialise an NFS2 or NFS3 client +/** + * nfs_init_client - Initialise an NFS2 or NFS3 client + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: IP presentation address (not used) + * + * Returns pointer to an NFS client, or an ERR_PTR value.   
*/ -static int nfs_init_client(struct nfs_client *clp, -			   const struct rpc_timeout *timeparms, -			   const struct nfs_parsed_mount_data *data) +struct nfs_client *nfs_init_client(struct nfs_client *clp, +		    const struct rpc_timeout *timeparms, +		    const char *ip_addr)  {  	int error;  	if (clp->cl_cons_state == NFS_CS_READY) {  		/* the client is already initialised */  		dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); -		return 0; +		return clp;  	}  	/*  	 * Create a client RPC handle for doing FSSTAT with UNIX auth only  	 * - RFC 2623, sec 2.3.2  	 */ -	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, -				      0, data->flags & NFS_MOUNT_NORESVPORT); +	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);  	if (error < 0)  		goto error;  	nfs_mark_client_ready(clp, NFS_CS_READY); -	return 0; +	return clp;  error:  	nfs_mark_client_ready(clp, error); +	nfs_put_client(clp);  	dprintk("<-- nfs_init_client() = xerror %d\n", error); -	return error; +	return ERR_PTR(error);  } +EXPORT_SYMBOL_GPL(nfs_init_client);  /*   * Create a version 2 or 3 client   */  static int nfs_init_server(struct nfs_server *server, -			   const struct nfs_parsed_mount_data *data) +			   const struct nfs_parsed_mount_data *data, +			   struct nfs_subversion *nfs_mod)  {  	struct nfs_client_initdata cl_init = {  		.hostname = data->nfs_server.hostname,  		.addr = (const struct sockaddr *)&data->nfs_server.address,  		.addrlen = data->nfs_server.addrlen, -		.rpc_ops = &nfs_v2_clientops, +		.nfs_mod = nfs_mod,  		.proto = data->nfs_server.protocol, +		.net = data->net,  	};  	struct rpc_timeout timeparms;  	struct nfs_client *clp; @@ -801,24 +749,18 @@ static int nfs_init_server(struct nfs_server *server,  	dprintk("--> nfs_init_server()\n"); -#ifdef CONFIG_NFS_V3 -	if (data->version == 3) -		cl_init.rpc_ops = &nfs_v3_clientops; -#endif +	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, +			data->timeo, data->retrans); +	if (data->flags 
& NFS_MOUNT_NORESVPORT) +		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);  	/* Allocate or find a client reference we can use */ -	clp = nfs_get_client(&cl_init); +	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);  	if (IS_ERR(clp)) {  		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));  		return PTR_ERR(clp);  	} -	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, -			data->timeo, data->retrans); -	error = nfs_init_client(clp, &timeparms, data); -	if (error < 0) -		goto error; -  	server->nfs_client = clp;  	/* Initialise the client representation from the mount data */ @@ -826,7 +768,7 @@ static int nfs_init_server(struct nfs_server *server,  	server->options = data->options;  	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|  		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| -		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; +		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR;  	if (data->rsize)  		server->rsize = nfs_block_size(data->rsize, NULL); @@ -844,8 +786,10 @@ static int nfs_init_server(struct nfs_server *server,  		goto error;  	server->port = data->nfs_server.port; +	server->auth_info = data->auth_info; -	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); +	error = nfs_init_server_rpcclient(server, &timeparms, +					  data->selected_flavor);  	if (error < 0)  		goto error; @@ -860,8 +804,6 @@ static int nfs_init_server(struct nfs_server *server,  	server->mountd_protocol = data->mount_server.protocol;  	server->namelen  = data->namlen; -	/* Create a client RPC handle for the NFSv3 ACL management interface */ -	nfs_init_server_aclclient(server);  	dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);  	return 0; @@ -875,7 +817,9 @@ error:  /*   * Load up the server record from information gained in an fsinfo record   */ -static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) +static void 
nfs_server_set_fsinfo(struct nfs_server *server, +				  struct nfs_fh *mntfh, +				  struct nfs_fsinfo *fsinfo)  {  	unsigned long max_rpc_payload; @@ -905,7 +849,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *  	if (server->wsize > NFS_MAX_FILE_IO_SIZE)  		server->wsize = NFS_MAX_FILE_IO_SIZE;  	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -	set_pnfs_layoutdriver(server, fsinfo->layouttype);  	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); @@ -931,7 +874,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *  /*   * Probe filesystem information, including the FSID on v2/v3   */ -static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)  {  	struct nfs_fsinfo fsinfo;  	struct nfs_client *clp = server->nfs_client; @@ -951,7 +894,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str  	if (error < 0)  		goto out_error; -	nfs_server_set_fsinfo(server, &fsinfo); +	nfs_server_set_fsinfo(server, mntfh, &fsinfo);  	/* Get some general file system info */  	if (server->namelen == 0) { @@ -971,11 +914,12 @@ out_error:  	dprintk("nfs_probe_fsinfo: error = %d\n", -error);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_probe_fsinfo);  /*   * Copy useful information when duplicating a server record   */ -static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) +void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)  {  	target->flags = source->flags;  	target->rsize = source->rsize; @@ -986,12 +930,47 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve  	target->acdirmax = source->acdirmax;  	target->caps = source->caps;  	target->options = source->options; +	target->auth_info = source->auth_info;  } 
+EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); + +void nfs_server_insert_lists(struct nfs_server *server) +{ +	struct nfs_client *clp = server->nfs_client; +	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + +	spin_lock(&nn->nfs_client_lock); +	list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); +	list_add_tail(&server->master_link, &nn->nfs_volume_list); +	clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); +	spin_unlock(&nn->nfs_client_lock); + +} +EXPORT_SYMBOL_GPL(nfs_server_insert_lists); + +void nfs_server_remove_lists(struct nfs_server *server) +{ +	struct nfs_client *clp = server->nfs_client; +	struct nfs_net *nn; + +	if (clp == NULL) +		return; +	nn = net_generic(clp->cl_net, nfs_net_id); +	spin_lock(&nn->nfs_client_lock); +	list_del_rcu(&server->client_link); +	if (list_empty(&clp->cl_superblocks)) +		set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); +	list_del(&server->master_link); +	spin_unlock(&nn->nfs_client_lock); + +	synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nfs_server_remove_lists);  /*   * Allocate and initialise a server record   */ -static struct nfs_server *nfs_alloc_server(void) +struct nfs_server *nfs_alloc_server(void)  {  	struct nfs_server *server; @@ -1004,6 +983,9 @@ static struct nfs_server *nfs_alloc_server(void)  	/* Zero out the NFS state stuff */  	INIT_LIST_HEAD(&server->client_link);  	INIT_LIST_HEAD(&server->master_link); +	INIT_LIST_HEAD(&server->delegations); +	INIT_LIST_HEAD(&server->layouts); +	INIT_LIST_HEAD(&server->state_owners_lru);  	atomic_set(&server->active, 0); @@ -1019,8 +1001,13 @@ static struct nfs_server *nfs_alloc_server(void)  		return NULL;  	} +	ida_init(&server->openowner_id); +	ida_init(&server->lockowner_id); +	pnfs_init_server(server); +  	return server;  } +EXPORT_SYMBOL_GPL(nfs_alloc_server);  /*   * Free up a server record @@ -1029,11 +1016,7 @@ void nfs_free_server(struct nfs_server *server)  {  	dprintk("--> nfs_free_server()\n"); -	unset_pnfs_layoutdriver(server); -	
spin_lock(&nfs_client_lock); -	list_del(&server->client_link); -	list_del(&server->master_link); -	spin_unlock(&nfs_client_lock); +	nfs_server_remove_lists(server);  	if (server->destroy != NULL)  		server->destroy(server); @@ -1045,19 +1028,22 @@ void nfs_free_server(struct nfs_server *server)  	nfs_put_client(server->nfs_client); +	ida_destroy(&server->lockowner_id); +	ida_destroy(&server->openowner_id);  	nfs_free_iostats(server->io_stats);  	bdi_destroy(&server->backing_dev_info);  	kfree(server);  	nfs_release_automount_timer();  	dprintk("<-- nfs_free_server()\n");  } +EXPORT_SYMBOL_GPL(nfs_free_server);  /*   * Create a version 2 or 3 volume record   * - keyed on server and FSID   */ -struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, -				     struct nfs_fh *mntfh) +struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, +				     struct nfs_subversion *nfs_mod)  {  	struct nfs_server *server;  	struct nfs_fattr *fattr; @@ -1073,22 +1059,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,  		goto error;  	/* Get a client representation */ -	error = nfs_init_server(server, data); +	error = nfs_init_server(server, mount_info->parsed, nfs_mod);  	if (error < 0)  		goto error; -	BUG_ON(!server->nfs_client); -	BUG_ON(!server->nfs_client->rpc_ops); -	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); -  	/* Probe the root fh to retrieve its FSID */ -	error = nfs_probe_fsinfo(server, mntfh, fattr); +	error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr);  	if (error < 0)  		goto error;  	if (server->nfs_client->rpc_ops->version == 3) {  		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)  			server->namelen = NFS3_MAXNAMLEN; -		if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) +		if (!(mount_info->parsed->flags & NFS_MOUNT_NORDIRPLUS))  			server->caps |= NFS_CAP_READDIRPLUS;  	} else {  		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) @@ -1096,7 +1078,7 @@ 
struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,  	}  	if (!(fattr->valid & NFS_ATTR_FATTR)) { -		error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); +		error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);  		if (error < 0) {  			dprintk("nfs_create_server: getattr error = %d\n", -error);  			goto error; @@ -1108,11 +1090,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,  		(unsigned long long) server->fsid.major,  		(unsigned long long) server->fsid.minor); -	spin_lock(&nfs_client_lock); -	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); -	list_add_tail(&server->master_link, &nfs_volume_list); -	spin_unlock(&nfs_client_lock); - +	nfs_server_insert_lists(server);  	server->mount_time = jiffies;  	nfs_free_fattr(fattr);  	return server; @@ -1122,386 +1100,15 @@ error:  	nfs_free_server(server);  	return ERR_PTR(error);  } - -#ifdef CONFIG_NFS_V4 -/* - * Initialize the NFS4 callback service - */ -static int nfs4_init_callback(struct nfs_client *clp) -{ -	int error; - -	if (clp->rpc_ops->version == 4) { -		if (nfs4_has_session(clp)) { -			error = xprt_setup_backchannel( -						clp->cl_rpcclient->cl_xprt, -						NFS41_BC_MIN_CALLBACKS); -			if (error < 0) -				return error; -		} - -		error = nfs_callback_up(clp->cl_mvops->minor_version, -					clp->cl_rpcclient->cl_xprt); -		if (error < 0) { -			dprintk("%s: failed to start callback. Error = %d\n", -				__func__, error); -			return error; -		} -		__set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); -	} -	return 0; -} - -/* - * Initialize the minor version specific parts of an NFS4 client record - */ -static int nfs4_init_client_minor_version(struct nfs_client *clp) -{ -#if defined(CONFIG_NFS_V4_1) -	if (clp->cl_mvops->minor_version) { -		struct nfs4_session *session = NULL; -		/* -		 * Create the session and mark it expired. 
-		 * When a SEQUENCE operation encounters the expired session -		 * it will do session recovery to initialize it. -		 */ -		session = nfs4_alloc_session(clp); -		if (!session) -			return -ENOMEM; - -		clp->cl_session = session; -		/* -		 * The create session reply races with the server back -		 * channel probe. Mark the client NFS_CS_SESSION_INITING -		 * so that the client back channel can find the -		 * nfs_client struct -		 */ -		clp->cl_cons_state = NFS_CS_SESSION_INITING; -	} -#endif /* CONFIG_NFS_V4_1 */ - -	return nfs4_init_callback(clp); -} - -/* - * Initialise an NFS4 client record - */ -static int nfs4_init_client(struct nfs_client *clp, -		const struct rpc_timeout *timeparms, -		const char *ip_addr, -		rpc_authflavor_t authflavour, -		int flags) -{ -	int error; - -	if (clp->cl_cons_state == NFS_CS_READY) { -		/* the client is initialised already */ -		dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); -		return 0; -	} - -	/* Check NFS protocol revision and initialize RPC op vector */ -	clp->rpc_ops = &nfs_v4_clientops; - -	error = nfs_create_rpc_client(clp, timeparms, authflavour, -				      1, flags & NFS_MOUNT_NORESVPORT); -	if (error < 0) -		goto error; -	strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); - -	error = nfs_idmap_new(clp); -	if (error < 0) { -		dprintk("%s: failed to create idmapper. 
Error = %d\n", -			__func__, error); -		goto error; -	} -	__set_bit(NFS_CS_IDMAP, &clp->cl_res_state); - -	error = nfs4_init_client_minor_version(clp); -	if (error < 0) -		goto error; - -	if (!nfs4_has_session(clp)) -		nfs_mark_client_ready(clp, NFS_CS_READY); -	return 0; - -error: -	nfs_mark_client_ready(clp, error); -	dprintk("<-- nfs4_init_client() = xerror %d\n", error); -	return error; -} - -/* - * Set up an NFS4 client - */ -static int nfs4_set_client(struct nfs_server *server, -		const char *hostname, -		const struct sockaddr *addr, -		const size_t addrlen, -		const char *ip_addr, -		rpc_authflavor_t authflavour, -		int proto, const struct rpc_timeout *timeparms, -		u32 minorversion) -{ -	struct nfs_client_initdata cl_init = { -		.hostname = hostname, -		.addr = addr, -		.addrlen = addrlen, -		.rpc_ops = &nfs_v4_clientops, -		.proto = proto, -		.minorversion = minorversion, -	}; -	struct nfs_client *clp; -	int error; - -	dprintk("--> nfs4_set_client()\n"); - -	/* Allocate or find a client reference we can use */ -	clp = nfs_get_client(&cl_init); -	if (IS_ERR(clp)) { -		error = PTR_ERR(clp); -		goto error; -	} -	error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, -					server->flags); -	if (error < 0) -		goto error_put; - -	server->nfs_client = clp; -	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); -	return 0; - -error_put: -	nfs_put_client(clp); -error: -	dprintk("<-- nfs4_set_client() = xerror %d\n", error); -	return error; -} - - -/* - * Session has been established, and the client marked ready. - * Set the mount rsize and wsize with negotiated fore channel - * attributes which will be bound checked in nfs_server_set_fsinfo. 
- */ -static void nfs4_session_set_rwsize(struct nfs_server *server) -{ -#ifdef CONFIG_NFS_V4_1 -	struct nfs4_session *sess; -	u32 server_resp_sz; -	u32 server_rqst_sz; - -	if (!nfs4_has_session(server->nfs_client)) -		return; -	sess = server->nfs_client->cl_session; -	server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; -	server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - -	if (server->rsize > server_resp_sz) -		server->rsize = server_resp_sz; -	if (server->wsize > server_rqst_sz) -		server->wsize = server_rqst_sz; -#endif /* CONFIG_NFS_V4_1 */ -} - -static int nfs4_server_common_setup(struct nfs_server *server, -		struct nfs_fh *mntfh) -{ -	struct nfs_fattr *fattr; -	int error; - -	BUG_ON(!server->nfs_client); -	BUG_ON(!server->nfs_client->rpc_ops); -	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - -	fattr = nfs_alloc_fattr(); -	if (fattr == NULL) -		return -ENOMEM; - -	/* We must ensure the session is initialised first */ -	error = nfs4_init_session(server); -	if (error < 0) -		goto out; - -	/* Probe the root fh to retrieve its FSID and filehandle */ -	error = nfs4_get_rootfh(server, mntfh); -	if (error < 0) -		goto out; - -	dprintk("Server FSID: %llx:%llx\n", -			(unsigned long long) server->fsid.major, -			(unsigned long long) server->fsid.minor); -	dprintk("Mount FH: %d\n", mntfh->size); - -	nfs4_session_set_rwsize(server); - -	error = nfs_probe_fsinfo(server, mntfh, fattr); -	if (error < 0) -		goto out; - -	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) -		server->namelen = NFS4_MAXNAMLEN; - -	spin_lock(&nfs_client_lock); -	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); -	list_add_tail(&server->master_link, &nfs_volume_list); -	spin_unlock(&nfs_client_lock); - -	server->mount_time = jiffies; -out: -	nfs_free_fattr(fattr); -	return error; -} - -/* - * Create a version 4 volume record - */ -static int nfs4_init_server(struct nfs_server *server, -		const struct 
nfs_parsed_mount_data *data) -{ -	struct rpc_timeout timeparms; -	int error; - -	dprintk("--> nfs4_init_server()\n"); - -	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, -			data->timeo, data->retrans); - -	/* Initialise the client representation from the mount data */ -	server->flags = data->flags; -	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; -	if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) -			server->caps |= NFS_CAP_READDIRPLUS; -	server->options = data->options; - -	/* Get a client record */ -	error = nfs4_set_client(server, -			data->nfs_server.hostname, -			(const struct sockaddr *)&data->nfs_server.address, -			data->nfs_server.addrlen, -			data->client_address, -			data->auth_flavors[0], -			data->nfs_server.protocol, -			&timeparms, -			data->minorversion); -	if (error < 0) -		goto error; - -	if (data->rsize) -		server->rsize = nfs_block_size(data->rsize, NULL); -	if (data->wsize) -		server->wsize = nfs_block_size(data->wsize, NULL); - -	server->acregmin = data->acregmin * HZ; -	server->acregmax = data->acregmax * HZ; -	server->acdirmin = data->acdirmin * HZ; -	server->acdirmax = data->acdirmax * HZ; - -	server->port = data->nfs_server.port; - -	error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); - -error: -	/* Done */ -	dprintk("<-- nfs4_init_server() = %d\n", error); -	return error; -} - -/* - * Create a version 4 volume record - * - keyed on server and FSID - */ -struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, -				      struct nfs_fh *mntfh) -{ -	struct nfs_server *server; -	int error; - -	dprintk("--> nfs4_create_server()\n"); - -	server = nfs_alloc_server(); -	if (!server) -		return ERR_PTR(-ENOMEM); - -	/* set up the general RPC client */ -	error = nfs4_init_server(server, data); -	if (error < 0) -		goto error; - -	error = nfs4_server_common_setup(server, mntfh); -	if (error < 0) -		goto error; - -	dprintk("<-- nfs4_create_server() = %p\n", server); 
-	return server; - -error: -	nfs_free_server(server); -	dprintk("<-- nfs4_create_server() = error %d\n", error); -	return ERR_PTR(error); -} - -/* - * Create an NFS4 referral server record - */ -struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, -					       struct nfs_fh *mntfh) -{ -	struct nfs_client *parent_client; -	struct nfs_server *server, *parent_server; -	int error; - -	dprintk("--> nfs4_create_referral_server()\n"); - -	server = nfs_alloc_server(); -	if (!server) -		return ERR_PTR(-ENOMEM); - -	parent_server = NFS_SB(data->sb); -	parent_client = parent_server->nfs_client; - -	/* Initialise the client representation from the parent server */ -	nfs_server_copy_userdata(server, parent_server); -	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; - -	/* Get a client representation. -	 * Note: NFSv4 always uses TCP, */ -	error = nfs4_set_client(server, data->hostname, -				data->addr, -				data->addrlen, -				parent_client->cl_ipaddr, -				data->authflavor, -				parent_server->client->cl_xprt->prot, -				parent_server->client->cl_timeout, -				parent_client->cl_mvops->minor_version); -	if (error < 0) -		goto error; - -	error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); -	if (error < 0) -		goto error; - -	error = nfs4_server_common_setup(server, mntfh); -	if (error < 0) -		goto error; - -	dprintk("<-- nfs_create_referral_server() = %p\n", server); -	return server; - -error: -	nfs_free_server(server); -	dprintk("<-- nfs4_create_referral_server() = error %d\n", error); -	return ERR_PTR(error); -} - -#endif /* CONFIG_NFS_V4 */ +EXPORT_SYMBOL_GPL(nfs_create_server);  /*   * Clone an NFS2, NFS3 or NFS4 server record   */  struct nfs_server *nfs_clone_server(struct nfs_server *source,  				    struct nfs_fh *fh, -				    struct nfs_fattr *fattr) +				    struct nfs_fattr *fattr, +				    rpc_authflavor_t flavor)  {  	struct nfs_server *server;  	struct nfs_fattr *fattr_fsinfo; @@ -1522,6 
+1129,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,  	/* Copy data from the source */  	server->nfs_client = source->nfs_client; +	server->destroy = source->destroy;  	atomic_inc(&server->nfs_client->cl_count);  	nfs_server_copy_userdata(server, source); @@ -1529,11 +1137,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,  	error = nfs_init_server_rpcclient(server,  			source->client->cl_timeout, -			source->client->cl_auth->au_flavor); +			flavor);  	if (error < 0)  		goto out_free_server; -	if (!IS_ERR(source->client_acl)) -		nfs_init_server_aclclient(server);  	/* probe the filesystem info for this server filesystem */  	error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); @@ -1551,11 +1157,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,  	if (error < 0)  		goto out_free_server; -	spin_lock(&nfs_client_lock); -	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); -	list_add_tail(&server->master_link, &nfs_volume_list); -	spin_unlock(&nfs_client_lock); - +	nfs_server_insert_lists(server);  	server->mount_time = jiffies;  	nfs_free_fattr(fattr_fsinfo); @@ -1568,6 +1170,20 @@ out_free_server:  	dprintk("<-- nfs_clone_server() = error %d\n", error);  	return ERR_PTR(error);  } +EXPORT_SYMBOL_GPL(nfs_clone_server); + +void nfs_clients_init(struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	INIT_LIST_HEAD(&nn->nfs_client_list); +	INIT_LIST_HEAD(&nn->nfs_volume_list); +#if IS_ENABLED(CONFIG_NFS_V4) +	idr_init(&nn->cb_ident_idr); +#endif +	spin_lock_init(&nn->nfs_client_lock); +	nn->boot_time = CURRENT_TIME; +}  #ifdef CONFIG_PROC_FS  static struct proc_dir_entry *proc_fs_nfs; @@ -1622,13 +1238,15 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)  {  	struct seq_file *m;  	int ret; +	struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; +	struct net *net = pid_ns->child_reaper->nsproxy->net_ns;  	ret = seq_open(file, 
&nfs_server_list_ops);  	if (ret < 0)  		return ret;  	m = file->private_data; -	m->private = PDE(inode)->data; +	m->private = net;  	return 0;  } @@ -1638,9 +1256,11 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)   */  static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)  { +	struct nfs_net *nn = net_generic(m->private, nfs_net_id); +  	/* lock the list against modification */ -	spin_lock(&nfs_client_lock); -	return seq_list_start_head(&nfs_client_list, *_pos); +	spin_lock(&nn->nfs_client_lock); +	return seq_list_start_head(&nn->nfs_client_list, *_pos);  }  /* @@ -1648,7 +1268,9 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)   */  static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)  { -	return seq_list_next(v, &nfs_client_list, pos); +	struct nfs_net *nn = net_generic(p->private, nfs_net_id); + +	return seq_list_next(v, &nn->nfs_client_list, pos);  }  /* @@ -1656,7 +1278,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)   */  static void nfs_server_list_stop(struct seq_file *p, void *v)  { -	spin_unlock(&nfs_client_lock); +	struct nfs_net *nn = net_generic(p->private, nfs_net_id); + +	spin_unlock(&nn->nfs_client_lock);  }  /* @@ -1665,9 +1289,10 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)  static int nfs_server_list_show(struct seq_file *m, void *v)  {  	struct nfs_client *clp; +	struct nfs_net *nn = net_generic(m->private, nfs_net_id);  	/* display header on line 1 */ -	if (v == &nfs_client_list) { +	if (v == &nn->nfs_client_list) {  		seq_puts(m, "NV SERVER   PORT USE HOSTNAME\n");  		return 0;  	} @@ -1675,12 +1300,18 @@ static int nfs_server_list_show(struct seq_file *m, void *v)  	/* display one transport per line on subsequent lines */  	clp = list_entry(v, struct nfs_client, cl_share_link); +	/* Check if the client is initialized */ +	if (clp->cl_cons_state != NFS_CS_READY) +		return 0; + +	rcu_read_lock();  	
seq_printf(m, "v%u %s %s %3d %s\n",  		   clp->rpc_ops->version,  		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),  		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),  		   atomic_read(&clp->cl_count),  		   clp->cl_hostname); +	rcu_read_unlock();  	return 0;  } @@ -1692,13 +1323,15 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)  {  	struct seq_file *m;  	int ret; +	struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; +	struct net *net = pid_ns->child_reaper->nsproxy->net_ns;  	ret = seq_open(file, &nfs_volume_list_ops);  	if (ret < 0)  		return ret;  	m = file->private_data; -	m->private = PDE(inode)->data; +	m->private = net;  	return 0;  } @@ -1708,9 +1341,11 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)   */  static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)  { +	struct nfs_net *nn = net_generic(m->private, nfs_net_id); +  	/* lock the list against modification */ -	spin_lock(&nfs_client_lock); -	return seq_list_start_head(&nfs_volume_list, *_pos); +	spin_lock(&nn->nfs_client_lock); +	return seq_list_start_head(&nn->nfs_volume_list, *_pos);  }  /* @@ -1718,7 +1353,9 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)   */  static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)  { -	return seq_list_next(v, &nfs_volume_list, pos); +	struct nfs_net *nn = net_generic(p->private, nfs_net_id); + +	return seq_list_next(v, &nn->nfs_volume_list, pos);  }  /* @@ -1726,7 +1363,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)   */  static void nfs_volume_list_stop(struct seq_file *p, void *v)  { -	spin_unlock(&nfs_client_lock); +	struct nfs_net *nn = net_generic(p->private, nfs_net_id); + +	spin_unlock(&nn->nfs_client_lock);  }  /* @@ -1737,9 +1376,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)  	struct nfs_server *server;  	struct nfs_client *clp;  	char dev[8], 
fsid[17]; +	struct nfs_net *nn = net_generic(m->private, nfs_net_id);  	/* display header on line 1 */ -	if (v == &nfs_volume_list) { +	if (v == &nn->nfs_volume_list) {  		seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");  		return 0;  	} @@ -1754,6 +1394,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)  		 (unsigned long long) server->fsid.major,  		 (unsigned long long) server->fsid.minor); +	rcu_read_lock();  	seq_printf(m, "v%u %s %s %-7s %-17s %s\n",  		   clp->rpc_ops->version,  		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), @@ -1761,6 +1402,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)  		   dev,  		   fsid,  		   nfs_server_fscache_state(server)); +	rcu_read_unlock();  	return 0;  } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 232a7eead33..5d8ccecf5f5 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -11,7 +11,6 @@  #include <linux/module.h>  #include <linux/sched.h>  #include <linux/slab.h> -#include <linux/smp_lock.h>  #include <linux/spinlock.h>  #include <linux/nfs4.h> @@ -21,32 +20,35 @@  #include "nfs4_fs.h"  #include "delegation.h"  #include "internal.h" - -static void nfs_do_free_delegation(struct nfs_delegation *delegation) -{ -	if (delegation->cred) -		put_rpccred(delegation->cred); -	kfree(delegation); -} - -static void nfs_free_delegation_callback(struct rcu_head *head) -{ -	struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu); - -	nfs_do_free_delegation(delegation); -} +#include "nfs4trace.h"  static void nfs_free_delegation(struct nfs_delegation *delegation)  { -	call_rcu(&delegation->rcu, nfs_free_delegation_callback); +	if (delegation->cred) { +		put_rpccred(delegation->cred); +		delegation->cred = NULL; +	} +	kfree_rcu(delegation, rcu);  } +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation: delegation to process + * + */  void nfs_mark_delegation_referenced(struct 
nfs_delegation *delegation)  {  	set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);  } -int nfs_have_delegation(struct inode *inode, fmode_t flags) +/** + * nfs_have_delegation - check if inode has a delegation + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. + */ +int nfs4_have_delegation(struct inode *inode, fmode_t flags)  {  	struct nfs_delegation *delegation;  	int ret = 0; @@ -54,7 +56,8 @@ int nfs_have_delegation(struct inode *inode, fmode_t flags)  	flags &= FMODE_READ|FMODE_WRITE;  	rcu_read_lock();  	delegation = rcu_dereference(NFS_I(inode)->delegation); -	if (delegation != NULL && (delegation->type & flags) == flags) { +	if (delegation != NULL && (delegation->type & flags) == flags && +	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {  		nfs_mark_delegation_referenced(delegation);  		ret = 1;  	} @@ -62,7 +65,7 @@ int nfs_have_delegation(struct inode *inode, fmode_t flags)  	return ret;  } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)  {  	struct inode *inode = state->inode;  	struct file_lock *fl; @@ -71,20 +74,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_  	if (inode->i_flock == NULL)  		goto out; -	/* Protect inode->i_flock using the file locks lock */ -	lock_flocks(); +	/* Protect inode->i_flock using the i_lock */ +	spin_lock(&inode->i_lock);  	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {  		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))  			continue;  		if (nfs_file_open_context(fl->fl_file) != ctx)  			continue; -		unlock_flocks(); -		status = nfs4_lock_delegation_recall(state, fl); +		spin_unlock(&inode->i_lock); +		status = nfs4_lock_delegation_recall(fl, state, stateid);  		if (status < 0)  			goto out; -		
lock_flocks(); +		spin_lock(&inode->i_lock);  	} -	unlock_flocks(); +	spin_unlock(&inode->i_lock);  out:  	return status;  } @@ -93,7 +96,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *s  {  	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_open_context *ctx; +	struct nfs4_state_owner *sp;  	struct nfs4_state *state; +	unsigned int seq;  	int err;  again: @@ -104,13 +109,20 @@ again:  			continue;  		if (!test_bit(NFS_DELEGATED_STATE, &state->flags))  			continue; -		if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) +		if (!nfs4_stateid_match(&state->stateid, stateid))  			continue;  		get_nfs_open_context(ctx);  		spin_unlock(&inode->i_lock); +		sp = state->owner; +		/* Block nfs4_proc_unlck */ +		mutex_lock(&sp->so_delegreturn_mutex); +		seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);  		err = nfs4_open_delegation_recall(ctx, state, stateid); -		if (err >= 0) -			err = nfs_delegation_claim_locks(ctx, state); +		if (!err) +			err = nfs_delegation_claim_locks(ctx, state, stateid); +		if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) +			err = -EAGAIN; +		mutex_unlock(&sp->so_delegreturn_mutex);  		put_nfs_open_context(ctx);  		if (err != 0)  			return err; @@ -120,10 +132,15 @@ again:  	return 0;  } -/* - * Set up a delegation on an inode +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + *   */ -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, +				  struct nfs_openres *res)  {  	struct nfs_delegation *delegation;  	struct rpc_cred *oldcred = NULL; @@ -133,8 +150,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st  	if (delegation != NULL) {  		spin_lock(&delegation->lock); 
 		if (delegation->inode != NULL) { -			memcpy(delegation->stateid.data, res->delegation.data, -			       sizeof(delegation->stateid.data)); +			nfs4_stateid_copy(&delegation->stateid, &res->delegation);  			delegation->type = res->delegation_type;  			delegation->maxsize = res->maxsize;  			oldcred = delegation->cred; @@ -145,6 +161,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st  			spin_unlock(&delegation->lock);  			put_rpccred(oldcred);  			rcu_read_unlock(); +			trace_nfs4_reclaim_delegation(inode, res->delegation_type);  		} else {  			/* We appear to have raced with a delegation return. */  			spin_unlock(&delegation->lock); @@ -176,38 +193,104 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation  	return inode;  } -static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, -							   const nfs4_stateid *stateid, -							   struct nfs_client *clp) +static struct nfs_delegation * +nfs_start_delegation_return_locked(struct nfs_inode *nfsi)  { -	struct nfs_delegation *delegation = -		rcu_dereference_protected(nfsi->delegation, -					  lockdep_is_held(&clp->cl_lock)); +	struct nfs_delegation *ret = NULL; +	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);  	if (delegation == NULL) -		goto nomatch; +		goto out; +	spin_lock(&delegation->lock); +	if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) +		ret = delegation; +	spin_unlock(&delegation->lock); +out: +	return ret; +} + +static struct nfs_delegation * +nfs_start_delegation_return(struct nfs_inode *nfsi) +{ +	struct nfs_delegation *delegation; + +	rcu_read_lock(); +	delegation = nfs_start_delegation_return_locked(nfsi); +	rcu_read_unlock(); +	return delegation; +} + +static void +nfs_abort_delegation_return(struct nfs_delegation *delegation, +		struct nfs_client *clp) +{ +  	spin_lock(&delegation->lock); -	if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, -				
sizeof(delegation->stateid.data)) != 0) -		goto nomatch_unlock; +	clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); +	set_bit(NFS_DELEGATION_RETURN, &delegation->flags); +	spin_unlock(&delegation->lock); +	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + +static struct nfs_delegation * +nfs_detach_delegation_locked(struct nfs_inode *nfsi, +		struct nfs_delegation *delegation, +		struct nfs_client *clp) +{ +	struct nfs_delegation *deleg_cur = +		rcu_dereference_protected(nfsi->delegation, +				lockdep_is_held(&clp->cl_lock)); + +	if (deleg_cur == NULL || delegation != deleg_cur) +		return NULL; + +	spin_lock(&delegation->lock); +	set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);  	list_del_rcu(&delegation->super_list);  	delegation->inode = NULL;  	nfsi->delegation_state = 0;  	rcu_assign_pointer(nfsi->delegation, NULL);  	spin_unlock(&delegation->lock);  	return delegation; -nomatch_unlock: -	spin_unlock(&delegation->lock); -nomatch: -	return NULL;  } -/* - * Set up a delegation on an inode +static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, +		struct nfs_delegation *delegation, +		struct nfs_server *server) +{ +	struct nfs_client *clp = server->nfs_client; + +	spin_lock(&clp->cl_lock); +	delegation = nfs_detach_delegation_locked(nfsi, delegation, clp); +	spin_unlock(&clp->cl_lock); +	return delegation; +} + +static struct nfs_delegation * +nfs_inode_detach_delegation(struct inode *inode) +{ +	struct nfs_inode *nfsi = NFS_I(inode); +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_delegation *delegation; + +	delegation = nfs_start_delegation_return(nfsi); +	if (delegation == NULL) +		return NULL; +	return nfs_detach_delegation(nfsi, delegation, server); +} + +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new delegation state from server + * + * Returns zero on success, or a 
negative errno value.   */  int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)  { -	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = server->nfs_client;  	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_delegation *delegation, *old_delegation;  	struct nfs_delegation *freeme = NULL; @@ -216,11 +299,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct  	delegation = kmalloc(sizeof(*delegation), GFP_NOFS);  	if (delegation == NULL)  		return -ENOMEM; -	memcpy(delegation->stateid.data, res->delegation.data, -			sizeof(delegation->stateid.data)); +	nfs4_stateid_copy(&delegation->stateid, &res->delegation);  	delegation->type = res->delegation_type;  	delegation->maxsize = res->maxsize; -	delegation->change_attr = nfsi->change_attr; +	delegation->change_attr = inode->i_version;  	delegation->cred = get_rpccred(cred);  	delegation->inode = inode;  	delegation->flags = 1<<NFS_DELEGATION_REFERENCED; @@ -228,28 +310,34 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct  	spin_lock(&clp->cl_lock);  	old_delegation = rcu_dereference_protected(nfsi->delegation, -						   lockdep_is_held(&clp->cl_lock)); +					lockdep_is_held(&clp->cl_lock));  	if (old_delegation != NULL) { -		if (memcmp(&delegation->stateid, &old_delegation->stateid, -					sizeof(old_delegation->stateid)) == 0 && +		if (nfs4_stateid_match(&delegation->stateid, +					&old_delegation->stateid) &&  				delegation->type == old_delegation->type) {  			goto out;  		}  		/*  		 * Deal with broken servers that hand out two  		 * delegations for the same file. +		 * Allow for upgrades to a WRITE delegation, but +		 * nothing else.  		 
*/  		dfprintk(FILE, "%s: server %s handed out "  				"a duplicate delegation!\n",  				__func__, clp->cl_hostname); -		if (delegation->type <= old_delegation->type) { +		if (delegation->type == old_delegation->type || +		    !(delegation->type & FMODE_WRITE)) {  			freeme = delegation;  			delegation = NULL;  			goto out;  		} -		freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); +		freeme = nfs_detach_delegation_locked(nfsi,  +				old_delegation, clp); +		if (freeme == NULL) +			goto out;  	} -	list_add_rcu(&delegation->super_list, &clp->cl_delegations); +	list_add_rcu(&delegation->super_list, &server->delegations);  	nfsi->delegation_state = delegation->type;  	rcu_assign_pointer(nfsi->delegation, delegation);  	delegation = NULL; @@ -258,6 +346,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct  	spin_lock(&inode->i_lock);  	nfsi->cache_validity |= NFS_INO_REVAL_FORCED;  	spin_unlock(&inode->i_lock); +	trace_nfs4_set_delegation(inode, res->delegation_type);  out:  	spin_unlock(&clp->cl_lock); @@ -271,19 +360,29 @@ out:  /*   * Basic procedure for returning a delegation to the server   */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)  { +	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;  	struct nfs_inode *nfsi = NFS_I(inode);  	int err; -	/* -	 * Guard against new delegated open/lock/unlock calls and against -	 * state recovery -	 */ -	down_write(&nfsi->rwsem); -	err = nfs_delegation_claim_opens(inode, &delegation->stateid); -	up_write(&nfsi->rwsem); -	if (err) +	if (delegation == NULL) +		return 0; +	do { +		err = nfs_delegation_claim_opens(inode, &delegation->stateid); +		if (!issync || err != -EAGAIN) +			break; +		/* +		 * Guard against state recovery +		 */ +		err = nfs4_wait_clnt_recover(clp); +	} while (err == 0); + +	if (err) { +		
nfs_abort_delegation_return(delegation, clp); +		goto out; +	} +	if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode)))  		goto out;  	err = nfs_do_return_delegation(inode, delegation, issync); @@ -291,206 +390,298 @@ out:  	return err;  } -/* - * Return all delegations that have been marked for return +static bool nfs_delegation_need_return(struct nfs_delegation *delegation) +{ +	bool ret = false; + +	if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) +		ret = true; +	if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { +		struct inode *inode; + +		spin_lock(&delegation->lock); +		inode = delegation->inode; +		if (inode && list_empty(&NFS_I(inode)->open_files)) +			ret = true; +		spin_unlock(&delegation->lock); +	} +	return ret; +} + +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Note that this function is designed to be called by the state + * manager thread. For this reason, it cannot flush the dirty data, + * since that could deadlock in case of a state recovery error. + * + * Returns zero on success, or a negative errno value.   
*/  int nfs_client_return_marked_delegations(struct nfs_client *clp)  {  	struct nfs_delegation *delegation; +	struct nfs_server *server;  	struct inode *inode;  	int err = 0;  restart:  	rcu_read_lock(); -	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { -		if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) -			continue; -		inode = nfs_delegation_grab_inode(delegation); -		if (inode == NULL) -			continue; -		spin_lock(&clp->cl_lock); -		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); -		spin_unlock(&clp->cl_lock); -		rcu_read_unlock(); -		if (delegation != NULL) { -			filemap_flush(inode->i_mapping); -			err = __nfs_inode_return_delegation(inode, delegation, 0); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		list_for_each_entry_rcu(delegation, &server->delegations, +								super_list) { +			if (!nfs_delegation_need_return(delegation)) +				continue; +			inode = nfs_delegation_grab_inode(delegation); +			if (inode == NULL) +				continue; +			delegation = nfs_start_delegation_return_locked(NFS_I(inode)); +			rcu_read_unlock(); + +			err = nfs_end_delegation_return(inode, delegation, 0); +			iput(inode); +			if (!err) +				goto restart; +			set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +			return err;  		} -		iput(inode); -		if (!err) -			goto restart; -		set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); -		return err;  	}  	rcu_read_unlock();  	return 0;  } -/* - * This function returns the delegation without reclaiming opens - * or protecting against delegation reclaims. - * It is therefore really only safe to be called from - * nfs4_clear_inode() +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode().   
*/  void nfs_inode_return_delegation_noreclaim(struct inode *inode)  { -	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; -	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_delegation *delegation; -	if (rcu_access_pointer(nfsi->delegation) != NULL) { -		spin_lock(&clp->cl_lock); -		delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); -		spin_unlock(&clp->cl_lock); -		if (delegation != NULL) -			nfs_do_return_delegation(inode, delegation, 0); -	} +	delegation = nfs_inode_detach_delegation(inode); +	if (delegation != NULL) +		nfs_do_return_delegation(inode, delegation, 0);  } -int nfs_inode_return_delegation(struct inode *inode) +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * This routine will always flush any dirty data to disk on the + * assumption that if we need to return the delegation, then + * we should stop caching. + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_inode_return_delegation(struct inode *inode)  { -	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;  	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_delegation *delegation;  	int err = 0; -	if (rcu_access_pointer(nfsi->delegation) != NULL) { -		spin_lock(&clp->cl_lock); -		delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); -		spin_unlock(&clp->cl_lock); -		if (delegation != NULL) { -			nfs_wb_all(inode); -			err = __nfs_inode_return_delegation(inode, delegation, 1); -		} -	} +	nfs_wb_all(inode); +	delegation = nfs_start_delegation_return(nfsi); +	if (delegation != NULL) +		err = nfs_end_delegation_return(inode, delegation, 1);  	return err;  } -static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) +static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, +		struct nfs_delegation *delegation) +{ +	set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +	set_bit(NFS4CLNT_DELEGRETURN, 
&server->nfs_client->cl_state); +} + +static void nfs_mark_return_delegation(struct nfs_server *server, +		struct nfs_delegation *delegation)  {  	set_bit(NFS_DELEGATION_RETURN, &delegation->flags); -	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);  } -/* - * Return all delegations associated to a super block - */ -void nfs_super_return_all_delegations(struct super_block *sb) +static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)  { -	struct nfs_client *clp = NFS_SB(sb)->nfs_client;  	struct nfs_delegation *delegation; +	bool ret = false; + +	list_for_each_entry_rcu(delegation, &server->delegations, super_list) { +		nfs_mark_return_delegation(server, delegation); +		ret = true; +	} +	return ret; +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ +	struct nfs_server *server; + +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) +		nfs_server_mark_return_all_delegations(server); +	rcu_read_unlock(); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ +	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) +		nfs4_schedule_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ +	nfs_client_mark_return_all_delegations(clp); +	nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * + */ +void nfs_server_return_all_delegations(struct nfs_server *server) +{ +	struct nfs_client *clp = server->nfs_client; +	bool need_wait;  	if (clp == NULL)  		return; +  	rcu_read_lock(); -	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { -		spin_lock(&delegation->lock); -		if (delegation->inode != NULL && delegation->inode->i_sb == sb) -			set_bit(NFS_DELEGATION_RETURN, &delegation->flags); -		
spin_unlock(&delegation->lock); -	} +	need_wait = nfs_server_mark_return_all_delegations(server);  	rcu_read_unlock(); -	if (nfs_client_return_marked_delegations(clp) != 0) + +	if (need_wait) {  		nfs4_schedule_state_manager(clp); +		nfs4_wait_clnt_recover(clp); +	}  } -static -void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, fmode_t flags) +static void nfs_mark_return_unused_delegation_types(struct nfs_server *server, +						 fmode_t flags)  {  	struct nfs_delegation *delegation; -	rcu_read_lock(); -	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { +	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {  		if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))  			continue;  		if (delegation->type & flags) -			nfs_mark_return_delegation(clp, delegation); +			nfs_mark_return_if_closed_delegation(server, delegation);  	} -	rcu_read_unlock();  } -static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp, +							fmode_t flags)  { -	nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); -} +	struct nfs_server *server; -static void nfs_delegation_run_state_manager(struct nfs_client *clp) -{ -	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) -		nfs4_schedule_state_manager(clp); +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) +		nfs_mark_return_unused_delegation_types(server, flags); +	rcu_read_unlock();  } -void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) +void nfs_remove_bad_delegation(struct inode *inode)  { -	nfs_client_mark_return_all_delegation_types(clp, flags); -	nfs_delegation_run_state_manager(clp); -} +	struct nfs_delegation *delegation; -void nfs_expire_all_delegations(struct nfs_client *clp) -{ -	nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); +	delegation = 
nfs_inode_detach_delegation(inode); +	if (delegation) { +		nfs_inode_find_state_and_recover(inode, &delegation->stateid); +		nfs_free_delegation(delegation); +	}  } +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); -/* - * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. +/** + * nfs_expire_unused_delegation_types + * @clp: client to process + * @flags: delegation types to expire + *   */ -void nfs_handle_cb_pathdown(struct nfs_client *clp) +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags)  { -	if (clp == NULL) -		return; -	nfs_client_mark_return_all_delegations(clp); +	nfs_client_mark_return_unused_delegation_types(clp, flags); +	nfs_delegation_run_state_manager(clp);  } -static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)  {  	struct nfs_delegation *delegation; -	rcu_read_lock(); -	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { +	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {  		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))  			continue; -		nfs_mark_return_delegation(clp, delegation); +		nfs_mark_return_if_closed_delegation(server, delegation);  	} -	rcu_read_unlock();  } +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */  void nfs_expire_unreferenced_delegations(struct nfs_client *clp)  { -	nfs_client_mark_return_unreferenced_delegations(clp); +	struct nfs_server *server; + +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) +		nfs_mark_return_unreferenced_delegations(server); +	rcu_read_unlock(); +  	nfs_delegation_run_state_manager(clp);  } -/* - * Asynchronous delegation recall! 
+/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information + * + * Returns zero on success, or a negative errno value.   */ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) +int nfs_async_inode_return_delegation(struct inode *inode, +				      const nfs4_stateid *stateid)  { -	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = server->nfs_client;  	struct nfs_delegation *delegation; +	filemap_flush(inode->i_mapping); +  	rcu_read_lock();  	delegation = rcu_dereference(NFS_I(inode)->delegation); +	if (delegation == NULL) +		goto out_enoent; -	if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { -		rcu_read_unlock(); -		return -ENOENT; -	} - -	nfs_mark_return_delegation(clp, delegation); +	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) +		goto out_enoent; +	nfs_mark_return_delegation(server, delegation);  	rcu_read_unlock(); +  	nfs_delegation_run_state_manager(clp);  	return 0; +out_enoent: +	rcu_read_unlock(); +	return -ENOENT;  } -/* - * Retrieve the inode associated with a delegation - */ -struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle) +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, +				 const struct nfs_fh *fhandle)  {  	struct nfs_delegation *delegation;  	struct inode *res = NULL; -	rcu_read_lock(); -	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + +	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {  		spin_lock(&delegation->lock);  		if (delegation->inode != NULL &&  		    nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { @@ -500,60 +691,136 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs  		if (res != NULL)  			break;  	} +	return res; +} + +/** + * 
nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. + */ +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, +					const struct nfs_fh *fhandle) +{ +	struct nfs_server *server; +	struct inode *res = NULL; + +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		res = nfs_delegation_find_inode_server(server, fhandle); +		if (res != NULL) +			break; +	}  	rcu_read_unlock();  	return res;  } -/* - * Mark all delegations as needing to be reclaimed +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ +	struct nfs_delegation *delegation; + +	list_for_each_entry_rcu(delegation, &server->delegations, super_list) +		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + *   */  void nfs_delegation_mark_reclaim(struct nfs_client *clp)  { -	struct nfs_delegation *delegation; +	struct nfs_server *server; +  	rcu_read_lock(); -	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) -		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) +		nfs_delegation_mark_reclaim_server(server);  	rcu_read_unlock();  } -/* - * Reap all unclaimed delegations after reboot recovery is done +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + *   */  void nfs_delegation_reap_unclaimed(struct nfs_client *clp)  {  	struct nfs_delegation *delegation; +	struct nfs_server *server;  	struct inode *inode; +  restart:  	rcu_read_lock(); -	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { -		if 
(test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) -			continue; -		inode = nfs_delegation_grab_inode(delegation); -		if (inode == NULL) -			continue; -		spin_lock(&clp->cl_lock); -		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); -		spin_unlock(&clp->cl_lock); -		rcu_read_unlock(); -		if (delegation != NULL) -			nfs_free_delegation(delegation); -		iput(inode); -		goto restart; +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		list_for_each_entry_rcu(delegation, &server->delegations, +								super_list) { +			if (test_bit(NFS_DELEGATION_NEED_RECLAIM, +						&delegation->flags) == 0) +				continue; +			inode = nfs_delegation_grab_inode(delegation); +			if (inode == NULL) +				continue; +			delegation = nfs_detach_delegation(NFS_I(inode), +					delegation, server); +			rcu_read_unlock(); + +			if (delegation != NULL) +				nfs_free_delegation(delegation); +			iput(inode); +			goto restart; +		}  	}  	rcu_read_unlock();  } -int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ +	struct nfs_server *server; +	int ret = 0; + +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) +		if (!list_empty(&server->delegations)) { +			ret = 1; +			break; +		} +	rcu_read_unlock(); +	return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * @flags: delegation type requirement + * + * Returns "true" and fills in "dst->data" * if inode had a delegation, + * otherwise "false" is returned. 
+ */ +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, +		fmode_t flags)  {  	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_delegation *delegation; -	int ret = 0; +	bool ret; +	flags &= FMODE_READ|FMODE_WRITE;  	rcu_read_lock();  	delegation = rcu_dereference(nfsi->delegation); -	if (delegation != NULL) { -		memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); -		ret = 1; +	ret = (delegation != NULL && (delegation->type & flags) == flags); +	if (ret) { +		nfs4_stateid_copy(dst, &delegation->stateid); +		nfs_mark_delegation_referenced(delegation);  	}  	rcu_read_unlock();  	return ret; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 2026304bda1..9a79c7a99d6 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -8,7 +8,7 @@  #ifndef FS_NFS_DELEGATION_H  #define FS_NFS_DELEGATION_H -#if defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V4)  /*   * NFSv4 delegation   */ @@ -28,22 +28,25 @@ struct nfs_delegation {  enum {  	NFS_DELEGATION_NEED_RECLAIM = 0,  	NFS_DELEGATION_RETURN, +	NFS_DELEGATION_RETURN_IF_CLOSED,  	NFS_DELEGATION_REFERENCED, +	NFS_DELEGATION_RETURNING,  };  int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);  void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -int nfs_inode_return_delegation(struct inode *inode); +int nfs4_inode_return_delegation(struct inode *inode);  int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);  void nfs_inode_return_delegation_noreclaim(struct inode *inode);  struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); -void nfs_super_return_all_delegations(struct super_block *sb); +void nfs_server_return_all_delegations(struct nfs_server *);  void nfs_expire_all_delegations(struct nfs_client *clp); -void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); +void 
nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);  void nfs_expire_unreferenced_delegations(struct nfs_client *clp); -void nfs_handle_cb_pathdown(struct nfs_client *clp);  int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode);  void nfs_delegation_mark_reclaim(struct nfs_client *clp);  void nfs_delegation_reap_unclaimed(struct nfs_client *clp); @@ -51,27 +54,17 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);  /* NFSv4 delegation-related procedures */  int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);  int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); -int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);  void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); -int nfs_have_delegation(struct inode *inode, fmode_t flags); +int nfs4_have_delegation(struct inode *inode, fmode_t flags); -#else -static inline int nfs_have_delegation(struct inode *inode, fmode_t flags) -{ -	return 0; -} - -static inline int nfs_inode_return_delegation(struct inode *inode) -{ -	return 0; -}  #endif  static inline int nfs_have_delegated_attributes(struct inode *inode)  { -	return nfs_have_delegation(inode, FMODE_READ) && +	return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) &&  		!(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);  } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 07ac3847e56..4a3d4ef7612 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -17,6 +17,7 @@   *  6 Jun 
1999	Cache readdir lookups in the page cache. -DaveM   */ +#include <linux/module.h>  #include <linux/time.h>  #include <linux/errno.h>  #include <linux/stat.h> @@ -32,104 +33,67 @@  #include <linux/pagevec.h>  #include <linux/namei.h>  #include <linux/mount.h> +#include <linux/swap.h>  #include <linux/sched.h> -#include <linux/vmalloc.h> +#include <linux/kmemleak.h> +#include <linux/xattr.h>  #include "delegation.h"  #include "iostat.h"  #include "internal.h"  #include "fscache.h" +#include "nfstrace.h" +  /* #define NFS_DEBUG_VERBOSE 1 */  static int nfs_opendir(struct inode *, struct file *); -static int nfs_readdir(struct file *, void *, filldir_t); -static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); -static int nfs_mkdir(struct inode *, struct dentry *, int); -static int nfs_rmdir(struct inode *, struct dentry *); -static int nfs_unlink(struct inode *, struct dentry *); -static int nfs_symlink(struct inode *, struct dentry *, const char *); -static int nfs_link(struct dentry *, struct inode *, struct dentry *); -static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); -static int nfs_rename(struct inode *, struct dentry *, -		      struct inode *, struct dentry *); -static int nfs_fsync_dir(struct file *, int); +static int nfs_closedir(struct inode *, struct file *); +static int nfs_readdir(struct file *, struct dir_context *); +static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);  static loff_t nfs_llseek_dir(struct file *, loff_t, int); -static int nfs_readdir_clear_array(struct page*, gfp_t); +static void nfs_readdir_clear_array(struct page*);  const struct file_operations nfs_dir_operations = {  	.llseek		= nfs_llseek_dir,  	.read		= generic_read_dir, -	.readdir	= nfs_readdir, +	.iterate	= nfs_readdir,  	.open		= nfs_opendir, -	.release	= nfs_release, +	.release	= nfs_closedir,  	.fsync		= nfs_fsync_dir,  }; -const struct 
inode_operations nfs_dir_inode_operations = { -	.create		= nfs_create, -	.lookup		= nfs_lookup, -	.link		= nfs_link, -	.unlink		= nfs_unlink, -	.symlink	= nfs_symlink, -	.mkdir		= nfs_mkdir, -	.rmdir		= nfs_rmdir, -	.mknod		= nfs_mknod, -	.rename		= nfs_rename, -	.permission	= nfs_permission, -	.getattr	= nfs_getattr, -	.setattr	= nfs_setattr, -}; - -const struct address_space_operations nfs_dir_addr_space_ops = { -	.releasepage = nfs_readdir_clear_array, +const struct address_space_operations nfs_dir_aops = { +	.freepage = nfs_readdir_clear_array,  }; -#ifdef CONFIG_NFS_V3 -const struct inode_operations nfs3_dir_inode_operations = { -	.create		= nfs_create, -	.lookup		= nfs_lookup, -	.link		= nfs_link, -	.unlink		= nfs_unlink, -	.symlink	= nfs_symlink, -	.mkdir		= nfs_mkdir, -	.rmdir		= nfs_rmdir, -	.mknod		= nfs_mknod, -	.rename		= nfs_rename, -	.permission	= nfs_permission, -	.getattr	= nfs_getattr, -	.setattr	= nfs_setattr, -	.listxattr	= nfs3_listxattr, -	.getxattr	= nfs3_getxattr, -	.setxattr	= nfs3_setxattr, -	.removexattr	= nfs3_removexattr, -}; -#endif  /* CONFIG_NFS_V3 */ - -#ifdef CONFIG_NFS_V4 - -static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd); -const struct inode_operations nfs4_dir_inode_operations = { -	.create		= nfs_open_create, -	.lookup		= nfs_atomic_lookup, -	.link		= nfs_link, -	.unlink		= nfs_unlink, -	.symlink	= nfs_symlink, -	.mkdir		= nfs_mkdir, -	.rmdir		= nfs_rmdir, -	.mknod		= nfs_mknod, -	.rename		= nfs_rename, -	.permission	= nfs_permission, -	.getattr	= nfs_getattr, -	.setattr	= nfs_setattr, -	.getxattr       = nfs4_getxattr, -	.setxattr       = nfs4_setxattr, -	.listxattr      = nfs4_listxattr, -}; +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) +{ +	struct nfs_inode *nfsi = NFS_I(dir); +	struct nfs_open_dir_context *ctx; +	ctx = 
kmalloc(sizeof(*ctx), GFP_KERNEL); +	if (ctx != NULL) { +		ctx->duped = 0; +		ctx->attr_gencount = nfsi->attr_gencount; +		ctx->dir_cookie = 0; +		ctx->dup_cookie = 0; +		ctx->cred = get_rpccred(cred); +		spin_lock(&dir->i_lock); +		list_add(&ctx->list, &nfsi->open_files); +		spin_unlock(&dir->i_lock); +		return ctx; +	} +	return  ERR_PTR(-ENOMEM); +} -#endif /* CONFIG_NFS_V4 */ +static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) +{ +	spin_lock(&dir->i_lock); +	list_del(&ctx->list); +	spin_unlock(&dir->i_lock); +	put_rpccred(ctx->cred); +	kfree(ctx); +}  /*   * Open file @@ -137,16 +101,23 @@ const struct inode_operations nfs4_dir_inode_operations = {  static int  nfs_opendir(struct inode *inode, struct file *filp)  { -	int res; +	int res = 0; +	struct nfs_open_dir_context *ctx; +	struct rpc_cred *cred; -	dfprintk(FILE, "NFS: open dir(%s/%s)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name); +	dfprintk(FILE, "NFS: open dir(%pD2)\n", filp);  	nfs_inc_stats(inode, NFSIOS_VFSOPEN); -	/* Call generic open code in order to cache credentials */ -	res = nfs_open(inode, filp); +	cred = rpc_lookup_cred(); +	if (IS_ERR(cred)) +		return PTR_ERR(cred); +	ctx = alloc_nfs_open_dir_context(inode, cred); +	if (IS_ERR(ctx)) { +		res = PTR_ERR(ctx); +		goto out; +	} +	filp->private_data = ctx;  	if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {  		/* This is a mountpoint, so d_revalidate will never  		 * have been called, so we need to refresh the @@ -154,30 +125,40 @@ nfs_opendir(struct inode *inode, struct file *filp)  		 */  		__nfs_revalidate_inode(NFS_SERVER(inode), inode);  	} +out: +	put_rpccred(cred);  	return res;  } +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ +	put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data); +	return 0; +} +  struct nfs_cache_array_entry {  	u64 cookie;  	u64 ino;  	struct qstr string; +	unsigned char d_type;  };  
struct nfs_cache_array { -	unsigned int size; +	int size;  	int eof_index;  	u64 last_cookie;  	struct nfs_cache_array_entry array[0];  }; -#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) - -typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);  typedef struct {  	struct file	*file;  	struct page	*page; +	struct dir_context *ctx;  	unsigned long	page_index;  	u64		*dir_cookie; +	u64		last_cookie;  	loff_t		current_index;  	decode_dirent_t	decode; @@ -194,9 +175,13 @@ typedef struct {  static  struct nfs_cache_array *nfs_readdir_get_array(struct page *page)  { +	void *ptr;  	if (page == NULL)  		return ERR_PTR(-EIO); -	return (struct nfs_cache_array *)kmap(page); +	ptr = kmap(page); +	if (ptr == NULL) +		return ERR_PTR(-ENOMEM); +	return ptr;  }  static @@ -209,14 +194,15 @@ void nfs_readdir_release_array(struct page *page)   * we are freeing strings created by nfs_add_to_readdir_array()   */  static -int nfs_readdir_clear_array(struct page *page, gfp_t mask) +void nfs_readdir_clear_array(struct page *page)  { -	struct nfs_cache_array *array = nfs_readdir_get_array(page); +	struct nfs_cache_array *array;  	int i; + +	array = kmap_atomic(page);  	for (i = 0; i < array->size; i++)  		kfree(array->array[i].string.name); -	nfs_readdir_release_array(page); -	return 0; +	kunmap_atomic(array);  }  /* @@ -231,6 +217,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le  	string->name = kmemdup(name, len, GFP_KERNEL);  	if (string->name == NULL)  		return -ENOMEM; +	/* +	 * Avoid a kmemleak false positive. The pointer to the name is stored +	 * in a page cache page which kmemleak does not scan. 
+	 */ +	kmemleak_not_leak(string->name);  	string->hash = full_name_hash(name, len);  	return 0;  } @@ -244,20 +235,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)  	if (IS_ERR(array))  		return PTR_ERR(array); -	ret = -EIO; -	if (array->size >= MAX_READDIR_ARRAY) -		goto out;  	cache_entry = &array->array[array->size]; + +	/* Check that this entry lies within the page bounds */ +	ret = -ENOSPC; +	if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) +		goto out; +  	cache_entry->cookie = entry->prev_cookie;  	cache_entry->ino = entry->ino; +	cache_entry->d_type = entry->d_type;  	ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);  	if (ret)  		goto out;  	array->last_cookie = entry->cookie; -	if (entry->eof == 1) -		array->eof_index = array->size;  	array->size++; +	if (entry->eof != 0) +		array->eof_index = array->size;  out:  	nfs_readdir_release_array(page);  	return ret; @@ -266,47 +261,79 @@ out:  static  int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)  { -	loff_t diff = desc->file->f_pos - desc->current_index; +	loff_t diff = desc->ctx->pos - desc->current_index;  	unsigned int index;  	if (diff < 0)  		goto out_eof;  	if (diff >= array->size) { -		if (array->eof_index > 0) +		if (array->eof_index >= 0)  			goto out_eof; -		desc->current_index += array->size;  		return -EAGAIN;  	}  	index = (unsigned int)diff;  	*desc->dir_cookie = array->array[index].cookie;  	desc->cache_entry_index = index; -	if (index == array->eof_index) -		desc->eof = 1;  	return 0;  out_eof:  	desc->eof = 1;  	return -EBADCOOKIE;  } +static bool +nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) +{ +	if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) +		return false; +	smp_rmb(); +	return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); +} +  static  int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t 
*desc)  {  	int i; +	loff_t new_pos;  	int status = -EAGAIN;  	for (i = 0; i < array->size; i++) { -		if (i == array->eof_index) { -			desc->eof = 1; -			status = -EBADCOOKIE; -		}  		if (array->array[i].cookie == *desc->dir_cookie) { +			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); +			struct nfs_open_dir_context *ctx = desc->file->private_data; + +			new_pos = desc->current_index + i; +			if (ctx->attr_gencount != nfsi->attr_gencount || +			    !nfs_readdir_inode_mapping_valid(nfsi)) { +				ctx->duped = 0; +				ctx->attr_gencount = nfsi->attr_gencount; +			} else if (new_pos < desc->ctx->pos) { +				if (ctx->duped > 0 +				    && ctx->dup_cookie == *desc->dir_cookie) { +					if (printk_ratelimit()) { +						pr_notice("NFS: directory %pD2 contains a readdir loop." +								"Please contact your server vendor.  " +								"The file: %.*s has duplicate cookie %llu\n", +								desc->file, array->array[i].string.len, +								array->array[i].string.name, *desc->dir_cookie); +					} +					status = -ELOOP; +					goto out; +				} +				ctx->dup_cookie = *desc->dir_cookie; +				ctx->duped = -1; +			} +			desc->ctx->pos = new_pos;  			desc->cache_entry_index = i; -			status = 0; -			break; +			return 0;  		}  	} - +	if (array->eof_index >= 0) { +		status = -EBADCOOKIE; +		if (*desc->dir_cookie == array->last_cookie) +			desc->eof = 1; +	} +out:  	return status;  } @@ -314,10 +341,7 @@ static  int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)  {  	struct nfs_cache_array *array; -	int status = -EBADCOOKIE; - -	if (desc->dir_cookie == NULL) -		goto out; +	int status;  	array = nfs_readdir_get_array(desc->page);  	if (IS_ERR(array)) { @@ -330,6 +354,11 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)  	else  		status = nfs_readdir_search_for_cookie(array, desc); +	if (status == -EAGAIN) { +		desc->last_cookie = array->last_cookie; +		desc->current_index += array->size; +		desc->page_index++; +	}  	nfs_readdir_release_array(desc->page);  out: 
 	return status; @@ -340,7 +369,8 @@ static  int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,  			struct nfs_entry *entry, struct file *file, struct inode *inode)  { -	struct rpc_cred	*cred = nfs_file_cred(file); +	struct nfs_open_dir_context *ctx = file->private_data; +	struct rpc_cred	*cred = ctx->cred;  	unsigned long	timestamp, gencount;  	int		error; @@ -365,14 +395,14 @@ error:  	return error;  } -/* Fill in an entry based on the xdr code stored in desc->page */ -static -int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *stream) +static int xdr_decode(nfs_readdir_descriptor_t *desc, +		      struct nfs_entry *entry, struct xdr_stream *xdr)  { -	__be32 *p = desc->decode(stream, entry, NFS_SERVER(desc->file->f_path.dentry->d_inode), desc->plus); -	if (IS_ERR(p)) -		return PTR_ERR(p); +	int error; +	error = desc->decode(xdr, entry, desc->plus); +	if (error) +		return error;  	entry->fattr->time_start = desc->timestamp;  	entry->fattr->gencount = desc->gencount;  	return 0; @@ -381,13 +411,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x  static  int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)  { -	struct nfs_inode *node;  	if (dentry->d_inode == NULL)  		goto different; -	node = NFS_I(dentry->d_inode); -	if (node->fh.size != entry->fh->size) -		goto different; -	if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0) +	if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)  		goto different;  	return 1;  different: @@ -395,16 +421,53 @@ different:  }  static +bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ +	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) +		return false; +	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) +		return true; +	if (ctx->pos == 0) +		return true; +	return false; +} + +/* + * This function is called by the lookup code to request the use of + * 
readdirplus to accelerate any future lookups in the same + * directory. + */ +static +void nfs_advise_use_readdirplus(struct inode *dir) +{ +	set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); +} + +/* + * This function is mainly for use by nfs_getattr(). + * + * If this is an 'ls -l', we want to force use of readdirplus. + * Do this by checking if there is an active file descriptor + * and calling nfs_advise_use_readdirplus, then forcing a + * cache flush. + */ +void nfs_force_use_readdirplus(struct inode *dir) +{ +	if (!list_empty(&NFS_I(dir)->open_files)) { +		nfs_advise_use_readdirplus(dir); +		nfs_zap_mapping(dir, dir->i_mapping); +	} +} + +static  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)  { -	struct qstr filename = { -		.len = entry->len, -		.name = entry->name, -	}; +	struct qstr filename = QSTR_INIT(entry->name, entry->len);  	struct dentry *dentry;  	struct dentry *alias;  	struct inode *dir = parent->d_inode;  	struct inode *inode; +	int status;  	if (filename.name[0] == '.') {  		if (filename.len == 1) @@ -417,10 +480,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)  	dentry = d_lookup(parent, &filename);  	if (dentry != NULL) {  		if (nfs_same_file(dentry, entry)) { -			nfs_refresh_inode(dentry->d_inode, entry->fattr); +			nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +			status = nfs_refresh_inode(dentry->d_inode, entry->fattr); +			if (!status) +				nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);  			goto out;  		} else { -			d_drop(dentry); +			if (d_invalidate(dentry) != 0) +				goto out;  			dput(dentry);  		}  	} @@ -429,8 +496,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)  	if (dentry == NULL)  		return; -	dentry->d_op = NFS_PROTO(dir)->dentry_ops; -	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); +	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);  	if (IS_ERR(inode))  		goto out; @@ -449,43 +515,53 @@ 
out:  /* Perform conversion from xdr to cache array */  static -void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, -				void *xdr_page, struct page *page, unsigned int buflen) +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, +				struct page **xdr_pages, struct page *page, unsigned int buflen)  {  	struct xdr_stream stream;  	struct xdr_buf buf; -	__be32 *ptr = xdr_page; -	int status; +	struct page *scratch;  	struct nfs_cache_array *array; +	unsigned int count = 0; +	int status; -	buf.head->iov_base = xdr_page; -	buf.head->iov_len = buflen; -	buf.tail->iov_len = 0; -	buf.page_base = 0; -	buf.page_len = 0; -	buf.buflen = buf.head->iov_len; -	buf.len = buf.head->iov_len; - -	xdr_init_decode(&stream, &buf, ptr); +	scratch = alloc_page(GFP_KERNEL); +	if (scratch == NULL) +		return -ENOMEM; +	xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); +	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);  	do {  		status = xdr_decode(desc, entry, &stream); -		if (status != 0) +		if (status != 0) { +			if (status == -EAGAIN) +				status = 0;  			break; +		} -		if (nfs_readdir_add_to_array(entry, page) == -1) -			break; -		if (desc->plus == 1) +		count++; + +		if (desc->plus != 0)  			nfs_prime_dcache(desc->file->f_path.dentry, entry); + +		status = nfs_readdir_add_to_array(entry, page); +		if (status != 0) +			break;  	} while (!entry->eof); -	if (status == -EBADCOOKIE && entry->eof) { +	if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {  		array = nfs_readdir_get_array(page); -		array->eof_index = array->size - 1; -		status = 0; -		nfs_readdir_release_array(page); +		if (!IS_ERR(array)) { +			array->eof_index = array->size; +			status = 0; +			nfs_readdir_release_array(page); +		} else +			status = PTR_ERR(array);  	} + +	put_page(scratch); +	return status;  }  static @@ -500,7 +576,6 @@ static  void nfs_readdir_free_large_page(void *ptr, struct page **pages,  		unsigned 
int npages)  { -	vm_unmap_ram(ptr, npages);  	nfs_readdir_free_pagearray(pages, npages);  } @@ -509,9 +584,8 @@ void nfs_readdir_free_large_page(void *ptr, struct page **pages,   * to nfs_readdir_free_large_page   */  static -void *nfs_readdir_large_page(struct page **pages, unsigned int npages) +int nfs_readdir_large_page(struct page **pages, unsigned int npages)  { -	void *ptr;  	unsigned int i;  	for (i = 0; i < npages; i++) { @@ -520,13 +594,11 @@ void *nfs_readdir_large_page(struct page **pages, unsigned int npages)  			goto out_freepages;  		pages[i] = page;  	} +	return 0; -	ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL); -	if (!IS_ERR_OR_NULL(ptr)) -		return ptr;  out_freepages:  	nfs_readdir_free_pagearray(pages, i); -	return NULL; +	return -ENOMEM;  }  static @@ -537,35 +609,55 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,  	struct nfs_entry entry;  	struct file	*file = desc->file;  	struct nfs_cache_array *array; -	int status = 0; +	int status = -ENOMEM;  	unsigned int array_size = ARRAY_SIZE(pages);  	entry.prev_cookie = 0; -	entry.cookie = *desc->dir_cookie; +	entry.cookie = desc->last_cookie;  	entry.eof = 0;  	entry.fh = nfs_alloc_fhandle();  	entry.fattr = nfs_alloc_fattr(); +	entry.server = NFS_SERVER(inode);  	if (entry.fh == NULL || entry.fattr == NULL)  		goto out; +	entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); +	if (IS_ERR(entry.label)) { +		status = PTR_ERR(entry.label); +		goto out; +	} +  	array = nfs_readdir_get_array(page); +	if (IS_ERR(array)) { +		status = PTR_ERR(array); +		goto out_label_free; +	}  	memset(array, 0, sizeof(struct nfs_cache_array));  	array->eof_index = -1; -	pages_ptr = nfs_readdir_large_page(pages, array_size); -	if (!pages_ptr) +	status = nfs_readdir_large_page(pages, array_size); +	if (status < 0)  		goto out_release_array;  	do { +		unsigned int pglen;  		status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);  		if (status < 0)  			break; -		
nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); -	} while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); +		pglen = status; +		status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); +		if (status < 0) { +			if (status == -ENOSPC) +				status = 0; +			break; +		} +	} while (array->eof_index < 0);  	nfs_readdir_free_large_page(pages_ptr, pages, array_size);  out_release_array:  	nfs_readdir_release_array(page); +out_label_free: +	nfs4_label_free(entry.label);  out:  	nfs_free_fattr(entry.fattr);  	nfs_free_fhandle(entry.fh); @@ -581,9 +673,11 @@ out:  static  int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)  { -	struct inode	*inode = desc->file->f_path.dentry->d_inode; +	struct inode	*inode = file_inode(desc->file); +	int ret; -	if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) +	ret = nfs_readdir_xdr_to_array(desc, page, inode); +	if (ret < 0)  		goto error;  	SetPageUptodate(page); @@ -595,12 +689,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)  	return 0;   error:  	unlock_page(page); -	return -EIO; +	return ret;  }  static  void cache_page_release(nfs_readdir_descriptor_t *desc)  { +	if (!desc->page->mapping) +		nfs_readdir_clear_array(desc->page);  	page_cache_release(desc->page);  	desc->page = NULL;  } @@ -608,12 +704,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)  static  struct page *get_cache_page(nfs_readdir_descriptor_t *desc)  { -	struct page *page; -	page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, +	return read_cache_page(file_inode(desc->file)->i_mapping,  			desc->page_index, (filler_t *)nfs_readdir_filler, desc); -	if (IS_ERR(page)) -		desc->eof = 1; -	return page;  }  /* @@ -629,9 +721,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)  		return PTR_ERR(desc->page);  	res = nfs_readdir_search_array(desc); -	if (res == 0) -		return 0; -	cache_page_release(desc); +	if (res != 0) +		
cache_page_release(desc);  	return res;  } @@ -639,62 +730,59 @@ int find_cache_page(nfs_readdir_descriptor_t *desc)  static inline  int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)  { -	int res = -EAGAIN; +	int res; -	while (1) { -		res = find_cache_page(desc); -		if (res != -EAGAIN) -			break; -		desc->page_index++; +	if (desc->page_index == 0) { +		desc->current_index = 0; +		desc->last_cookie = 0;  	} +	do { +		res = find_cache_page(desc); +	} while (res == -EAGAIN);  	return res;  } -static inline unsigned int dt_type(struct inode *inode) -{ -	return (inode->i_mode >> 12) & 15; -} -  /*   * Once we've found the start of the dirent within a page: fill 'er up...   */  static  -int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, -		   filldir_t filldir) +int nfs_do_filldir(nfs_readdir_descriptor_t *desc)  {  	struct file	*file = desc->file;  	int i = 0;  	int res = 0;  	struct nfs_cache_array *array = NULL; -	unsigned int d_type = DT_UNKNOWN; -	struct dentry *dentry = NULL; +	struct nfs_open_dir_context *ctx = file->private_data;  	array = nfs_readdir_get_array(desc->page); +	if (IS_ERR(array)) { +		res = PTR_ERR(array); +		goto out; +	}  	for (i = desc->cache_entry_index; i < array->size; i++) { -		d_type = DT_UNKNOWN; +		struct nfs_cache_array_entry *ent; -		res = filldir(dirent, array->array[i].string.name, -			array->array[i].string.len, file->f_pos, -			nfs_compat_user_ino64(array->array[i].ino), d_type); -		if (res < 0) +		ent = &array->array[i]; +		if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, +		    nfs_compat_user_ino64(ent->ino), ent->d_type)) { +			desc->eof = 1;  			break; -		file->f_pos++; -		desc->cache_entry_index = i; +		} +		desc->ctx->pos++;  		if (i < (array->size-1))  			*desc->dir_cookie = array->array[i+1].cookie;  		else  			*desc->dir_cookie = array->last_cookie; -		if (i == array->eof_index) { -			desc->eof = 1; -			break; -		} +		if (ctx->duped != 0) +			ctx->duped = 1;  	} +	if (array->eof_index >= 
0) +		desc->eof = 1;  	nfs_readdir_release_array(desc->page); +out:  	cache_page_release(desc); -	if (dentry != NULL) -		dput(dentry);  	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",  			(unsigned long long)*desc->dir_cookie, res);  	return res; @@ -713,12 +801,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,   *	 directory in the page cache by the time we get here.   */  static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, -		     filldir_t filldir) +int uncached_readdir(nfs_readdir_descriptor_t *desc)  {  	struct page	*page = NULL;  	int		status; -	struct inode *inode = desc->file->f_path.dentry->d_inode; +	struct inode *inode = file_inode(desc->file); +	struct nfs_open_dir_context *ctx = desc->file->private_data;  	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",  			(unsigned long long)*desc->dir_cookie); @@ -729,14 +817,16 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,  		goto out;  	} -	if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) { -		status = -EIO; -		goto out_release; -	} -  	desc->page_index = 0; +	desc->last_cookie = *desc->dir_cookie;  	desc->page = page; -	status = nfs_do_filldir(desc, dirent, filldir); +	ctx->duped = 0; + +	status = nfs_readdir_xdr_to_array(desc, page, inode); +	if (status < 0) +		goto out_release; + +	status = nfs_do_filldir(desc);   out:  	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", @@ -747,53 +837,66 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,  	goto out;  } +static bool nfs_dir_mapping_need_revalidate(struct inode *dir) +{ +	struct nfs_inode *nfsi = NFS_I(dir); + +	if (nfs_attribute_cache_expired(dir)) +		return true; +	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) +		return true; +	return false; +} +  /* The file offset position represents the dirent entry number.  A     last cookie cache takes care of the common case of reading the     whole directory. 
  */ -static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int nfs_readdir(struct file *file, struct dir_context *ctx)  { -	struct dentry	*dentry = filp->f_path.dentry; +	struct dentry	*dentry = file->f_path.dentry;  	struct inode	*inode = dentry->d_inode;  	nfs_readdir_descriptor_t my_desc,  			*desc = &my_desc; -	int res = -ENOMEM; +	struct nfs_open_dir_context *dir_ctx = file->private_data; +	int res = 0; -	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", -			dentry->d_parent->d_name.name, dentry->d_name.name, -			(long long)filp->f_pos); +	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", +			file, (long long)ctx->pos);  	nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);  	/* -	 * filp->f_pos points to the dirent entry number. +	 * ctx->pos points to the dirent entry number.  	 * *desc->dir_cookie has the cookie for the next entry. We have  	 * to either find the entry with the appropriate number or  	 * revalidate the cookie.  	 */  	memset(desc, 0, sizeof(*desc)); -	desc->file = filp; -	desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; +	desc->file = file; +	desc->ctx = ctx; +	desc->dir_cookie = &dir_ctx->dir_cookie;  	desc->decode = NFS_PROTO(inode)->decode_dirent; -	desc->plus = NFS_USE_READDIRPLUS(inode); +	desc->plus = nfs_use_readdirplus(inode, ctx) ? 
1 : 0;  	nfs_block_sillyrename(dentry); -	res = nfs_revalidate_mapping(inode, filp->f_mapping); +	if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) +		res = nfs_revalidate_mapping(inode, file->f_mapping);  	if (res < 0)  		goto out; -	while (desc->eof != 1) { +	do {  		res = readdir_search_pagecache(desc);  		if (res == -EBADCOOKIE) { +			res = 0;  			/* This means either end of directory */  			if (*desc->dir_cookie && desc->eof == 0) {  				/* Or that the server has 'lost' a cookie */ -				res = uncached_readdir(desc, dirent, filldir); -				if (res >= 0) +				res = uncached_readdir(desc); +				if (res == 0)  					continue;  			} -			res = 0;  			break;  		}  		if (res == -ETOOSMALL && desc->plus) { @@ -807,34 +910,28 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)  		if (res < 0)  			break; -		res = nfs_do_filldir(desc, dirent, filldir); -		if (res < 0) { -			res = 0; +		res = nfs_do_filldir(desc); +		if (res < 0)  			break; -		} -	} +	} while (!desc->eof);  out:  	nfs_unblock_sillyrename(dentry);  	if (res > 0)  		res = 0; -	dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", -			dentry->d_parent->d_name.name, dentry->d_name.name, -			res); +	dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);  	return res;  } -static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) +static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)  { -	struct dentry *dentry = filp->f_path.dentry; -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file_inode(filp); +	struct nfs_open_dir_context *dir_ctx = filp->private_data; -	dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", -			dentry->d_parent->d_name.name, -			dentry->d_name.name, -			offset, origin); +	dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", +			filp, offset, whence);  	mutex_lock(&inode->i_mutex); -	switch (origin) { +	switch (whence) {  		case 1:  			offset += filp->f_pos;  		case 0: @@ -846,7 +943,8 @@ static loff_t 
nfs_llseek_dir(struct file *filp, loff_t offset, int origin)  	}  	if (offset != filp->f_pos) {  		filp->f_pos = offset; -		nfs_file_open_context(filp)->dir_cookie = 0; +		dir_ctx->dir_cookie = 0; +		dir_ctx->duped = 0;  	}  out:  	mutex_unlock(&inode->i_mutex); @@ -857,15 +955,16 @@ out:   * All directory operations under NFS are synchronous, so fsync()   * is a dummy operation.   */ -static int nfs_fsync_dir(struct file *filp, int datasync) +static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, +			 int datasync)  { -	struct dentry *dentry = filp->f_path.dentry; +	struct inode *inode = file_inode(filp); -	dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", -			dentry->d_parent->d_name.name, dentry->d_name.name, -			datasync); +	dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); -	nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); +	mutex_lock(&inode->i_mutex); +	nfs_inc_stats(inode, NFSIOS_VFSFSYNC); +	mutex_unlock(&inode->i_mutex);  	return 0;  } @@ -883,6 +982,7 @@ void nfs_force_lookup_revalidate(struct inode *dir)  {  	NFS_I(dir)->cache_change_attribute++;  } +EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);  /*   * A check for whether or not the parent directory has changed. @@ -906,28 +1006,14 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)  }  /* - * Return the intent data that applies to this particular path component - * - * Note that the current set of intents only apply to the very last - * component of the path. - * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. - */ -static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) -{ -	if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) -		return 0; -	return nd->flags & mask; -} - -/*   * Use intent information to check whether or not we're going to do   * an O_EXCL create using this path component.   
*/ -static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) +static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)  {  	if (NFS_PROTO(dir)->version == 2)  		return 0; -	return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL); +	return flags & LOOKUP_EXCL;  }  /* @@ -938,28 +1024,28 @@ static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)   * particular file and the "nocto" mount flag is not set.   *   */ -static inline -int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) +static +int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)  {  	struct nfs_server *server = NFS_SERVER(inode); +	int ret; -	if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) -		return 0; -	if (nd != NULL) { -		/* VFS wants an on-the-wire revalidation */ -		if (nd->flags & LOOKUP_REVAL) -			goto out_force; -		/* This is an open(2) */ -		if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 && -				!(server->flags & NFS_MOUNT_NOCTO) && -				(S_ISREG(inode->i_mode) || -				 S_ISDIR(inode->i_mode))) -			goto out_force; +	if (IS_AUTOMOUNT(inode))  		return 0; -	} -	return nfs_revalidate_inode(server, inode); +	/* VFS wants an on-the-wire revalidation */ +	if (flags & LOOKUP_REVAL) +		goto out_force; +	/* This is an open(2) */ +	if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && +	    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) +		goto out_force; +out: +	return (inode->i_nlink == 0) ? 
-ENOENT : 0;  out_force: -	return __nfs_revalidate_inode(server, inode); +	ret = __nfs_revalidate_inode(server, inode); +	if (ret != 0) +		return ret; +	goto out;  }  /* @@ -971,10 +1057,10 @@ out_force:   */  static inline  int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, -		       struct nameidata *nd) +		       unsigned int flags)  {  	/* Don't revalidate a negative dentry if we're creating a new file */ -	if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) +	if (flags & LOOKUP_CREATE)  		return 0;  	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)  		return 1; @@ -992,39 +1078,42 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,   * If the parent directory is seen to have changed, we throw out the   * cached dentry and do a new lookup.   */ -static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) +static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)  {  	struct inode *dir;  	struct inode *inode;  	struct dentry *parent;  	struct nfs_fh *fhandle = NULL;  	struct nfs_fattr *fattr = NULL; +	struct nfs4_label *label = NULL;  	int error; +	if (flags & LOOKUP_RCU) +		return -ECHILD; +  	parent = dget_parent(dentry);  	dir = parent->d_inode;  	nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);  	inode = dentry->d_inode;  	if (!inode) { -		if (nfs_neg_need_reval(dir, dentry, nd)) +		if (nfs_neg_need_reval(dir, dentry, flags))  			goto out_bad; -		goto out_valid; +		goto out_valid_noent;  	}  	if (is_bad_inode(inode)) { -		dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", -				__func__, dentry->d_parent->d_name.name, -				dentry->d_name.name); +		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", +				__func__, dentry);  		goto out_bad;  	} -	if (nfs_have_delegation(inode, FMODE_READ)) +	if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))  		goto out_set_verifier;  	/* Force a full look up iff the parent directory has changed */ -	if (!nfs_is_exclusive_create(dir, nd) 
&& nfs_check_verifier(dir, dentry)) { -		if (nfs_lookup_verify_inode(inode, nd)) +	if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) { +		if (nfs_lookup_verify_inode(inode, flags))  			goto out_zap_parent;  		goto out_valid;  	} @@ -1038,7 +1127,13 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)  	if (fhandle == NULL || fattr == NULL)  		goto out_error; -	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); +	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); +	if (IS_ERR(label)) +		goto out_error; + +	trace_nfs_lookup_revalidate_enter(dir, dentry, flags); +	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); +	trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);  	if (error)  		goto out_bad;  	if (nfs_compare_fh(NFS_FH(inode), fhandle)) @@ -1046,56 +1141,103 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)  	if ((error = nfs_refresh_inode(inode, fattr)) != 0)  		goto out_bad; +	nfs_setsecurity(inode, fattr, label); +  	nfs_free_fattr(fattr);  	nfs_free_fhandle(fhandle); +	nfs4_label_free(label); +  out_set_verifier:  	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));   out_valid: +	/* Success: notify readdir to use READDIRPLUS */ +	nfs_advise_use_readdirplus(dir); + out_valid_noent:  	dput(parent); -	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", -			__func__, dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", +			__func__, dentry);  	return 1;  out_zap_parent:  	nfs_zap_caches(dir);   out_bad: +	nfs_free_fattr(fattr); +	nfs_free_fhandle(fhandle); +	nfs4_label_free(label);  	nfs_mark_for_revalidate(dir);  	if (inode && S_ISDIR(inode->i_mode)) {  		/* Purge readdir caches. */  		nfs_zap_caches(inode); -		/* If we have submounts, don't unhash ! 
*/ -		if (have_submounts(dentry)) -			goto out_valid; -		if (dentry->d_flags & DCACHE_DISCONNECTED) +		/* +		 * We can't d_drop the root of a disconnected tree: +		 * its d_hash is on the s_anon list and d_drop() would hide +		 * it from shrink_dcache_for_unmount(), leading to busy +		 * inodes on unmount and further oopses. +		 */ +		if (IS_ROOT(dentry))  			goto out_valid; -		shrink_dcache_parent(dentry);  	} -	d_drop(dentry); -	nfs_free_fattr(fattr); -	nfs_free_fhandle(fhandle); +	/* If we have submounts, don't unhash ! */ +	if (check_submounts_and_drop(dentry) != 0) +		goto out_valid; +  	dput(parent); -	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", -			__func__, dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", +			__func__, dentry);  	return 0;  out_error:  	nfs_free_fattr(fattr);  	nfs_free_fhandle(fhandle); +	nfs4_label_free(label);  	dput(parent); -	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", -			__func__, dentry->d_parent->d_name.name, -			dentry->d_name.name, error); +	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", +			__func__, dentry, error);  	return error;  }  /* + * A weaker form of d_revalidate for revalidating just the dentry->d_inode + * when we don't really care about the dentry name. This is called when a + * pathwalk ends on a dentry that was not found via a normal lookup in the + * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). + * + * In this situation, we just want to verify that the inode itself is OK + * since the dentry might have changed on the server. + */ +static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ +	int error; +	struct inode *inode = dentry->d_inode; + +	/* +	 * I believe we can only get a negative dentry here in the case of a +	 * procfs-style symlink. Just assume it's correct for now, but we may +	 * eventually need to do something more here. 
+	 */ +	if (!inode) { +		dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n", +				__func__, dentry); +		return 1; +	} + +	if (is_bad_inode(inode)) { +		dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", +				__func__, dentry); +		return 0; +	} + +	error = nfs_revalidate_inode(NFS_SERVER(inode), inode); +	dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", +			__func__, inode->i_ino, error ? "invalid" : "valid"); +	return !error; +} + +/*   * This is called from dput() when d_count is going to 0.   */ -static int nfs_dentry_delete(struct dentry *dentry) +static int nfs_dentry_delete(const struct dentry *dentry)  { -	dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		dentry->d_flags); +	dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n", +		dentry, dentry->d_flags);  	/* Unhash any dentry with a stale inode */  	if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) @@ -1114,11 +1256,14 @@ static int nfs_dentry_delete(struct dentry *dentry)  } +/* Ensure that we revalidate inode->i_nlink */  static void nfs_drop_nlink(struct inode *inode)  {  	spin_lock(&inode->i_lock); -	if (inode->i_nlink > 0) -		drop_nlink(inode); +	/* drop the inode if we're reasonably sure this is the last link */ +	if (inode->i_nlink == 1) +		clear_nlink(inode); +	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;  	spin_unlock(&inode->i_lock);  } @@ -1133,42 +1278,55 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)  		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;  	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { -		drop_nlink(inode);  		nfs_complete_unlink(dentry, inode); +		nfs_drop_nlink(inode);  	}  	iput(inode);  } +static void nfs_d_release(struct dentry *dentry) +{ +	/* free cached devname value, if it survived that far */ +	if (unlikely(dentry->d_fsdata)) { +		if (dentry->d_flags & DCACHE_NFSFS_RENAMED) +			WARN_ON(1); +		else +			kfree(dentry->d_fsdata); +	} +} +  const struct dentry_operations 
nfs_dentry_operations = {  	.d_revalidate	= nfs_lookup_revalidate, +	.d_weak_revalidate	= nfs_weak_revalidate,  	.d_delete	= nfs_dentry_delete,  	.d_iput		= nfs_dentry_iput, +	.d_automount	= nfs_d_automount, +	.d_release	= nfs_d_release,  }; +EXPORT_SYMBOL_GPL(nfs_dentry_operations); -static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)  {  	struct dentry *res;  	struct dentry *parent;  	struct inode *inode = NULL;  	struct nfs_fh *fhandle = NULL;  	struct nfs_fattr *fattr = NULL; +	struct nfs4_label *label = NULL;  	int error; -	dfprintk(VFS, "NFS: lookup(%s/%s)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);  	nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);  	res = ERR_PTR(-ENAMETOOLONG);  	if (dentry->d_name.len > NFS_SERVER(dir)->namelen)  		goto out; -	dentry->d_op = NFS_PROTO(dir)->dentry_ops; -  	/*  	 * If we're doing an exclusive create, optimize away the lookup  	 * but don't hash the dentry.  	 
*/ -	if (nfs_is_exclusive_create(dir, nd)) { +	if (nfs_is_exclusive_create(dir, flags)) {  		d_instantiate(dentry, NULL);  		res = NULL;  		goto out; @@ -1180,21 +1338,29 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru  	if (fhandle == NULL || fattr == NULL)  		goto out; +	label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT); +	if (IS_ERR(label)) +		goto out; +  	parent = dentry->d_parent;  	/* Protect against concurrent sillydeletes */ +	trace_nfs_lookup_enter(dir, dentry, flags);  	nfs_block_sillyrename(parent); -	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); +	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);  	if (error == -ENOENT)  		goto no_entry;  	if (error < 0) {  		res = ERR_PTR(error);  		goto out_unblock_sillyrename;  	} -	inode = nfs_fhget(dentry->d_sb, fhandle, fattr); -	res = (struct dentry *)inode; +	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); +	res = ERR_CAST(inode);  	if (IS_ERR(res))  		goto out_unblock_sillyrename; +	/* Success: notify readdir to use READDIRPLUS */ +	nfs_advise_use_readdirplus(dir); +  no_entry:  	res = d_materialise_unique(dentry, inode);  	if (res != NULL) { @@ -1205,189 +1371,180 @@ no_entry:  	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));  out_unblock_sillyrename:  	nfs_unblock_sillyrename(parent); +	trace_nfs_lookup_exit(dir, dentry, flags, error); +	nfs4_label_free(label);  out:  	nfs_free_fattr(fattr);  	nfs_free_fhandle(fhandle);  	return res;  } +EXPORT_SYMBOL_GPL(nfs_lookup); -#ifdef CONFIG_NFS_V4 -static int nfs_open_revalidate(struct dentry *, struct nameidata *); +#if IS_ENABLED(CONFIG_NFS_V4) +static int nfs4_lookup_revalidate(struct dentry *, unsigned int);  const struct dentry_operations nfs4_dentry_operations = { -	.d_revalidate	= nfs_open_revalidate, +	.d_revalidate	= nfs4_lookup_revalidate,  	.d_delete	= nfs_dentry_delete,  	.d_iput		= nfs_dentry_iput, +	.d_automount	= nfs_d_automount, +	.d_release	= 
nfs_d_release,  }; +EXPORT_SYMBOL_GPL(nfs4_dentry_operations); -/* - * Use intent information to determine whether we need to substitute - * the NFSv4-style stateful OPEN for the LOOKUP call - */ -static int is_atomic_open(struct nameidata *nd) +static fmode_t flags_to_mode(int flags)  { -	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) -		return 0; -	/* NFS does not (yet) have a stateful open for directories */ -	if (nd->flags & LOOKUP_DIRECTORY) -		return 0; -	/* Are we trying to write to a read only partition? */ -	if (__mnt_is_readonly(nd->path.mnt) && -	    (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) -		return 0; -	return 1; +	fmode_t res = (__force fmode_t)flags & FMODE_EXEC; +	if ((flags & O_ACCMODE) != O_WRONLY) +		res |= FMODE_READ; +	if ((flags & O_ACCMODE) != O_RDONLY) +		res |= FMODE_WRITE; +	return res;  } -static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd) +static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)  { -	struct path path = { -		.mnt = nd->path.mnt, -		.dentry = dentry, -	}; -	struct nfs_open_context *ctx; -	struct rpc_cred *cred; -	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); - -	cred = rpc_lookup_cred(); -	if (IS_ERR(cred)) -		return ERR_CAST(cred); -	ctx = alloc_nfs_open_context(&path, cred, fmode); -	put_rpccred(cred); -	if (ctx == NULL) -		return ERR_PTR(-ENOMEM); -	return ctx; +	return alloc_nfs_open_context(dentry, flags_to_mode(open_flags));  }  static int do_open(struct inode *inode, struct file *filp)  { -	nfs_fscache_set_inode_cookie(inode, filp); +	nfs_fscache_open_file(inode, filp);  	return 0;  } -static int nfs_intent_set_file(struct nameidata *nd, struct nfs_open_context *ctx) +static int nfs_finish_open(struct nfs_open_context *ctx, +			   struct dentry *dentry, +			   struct file *file, unsigned open_flags, +			   int *opened)  { -	struct file *filp; -	int ret = 
0; +	int err; + +	if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) +		*opened |= FILE_CREATED; + +	err = finish_open(file, dentry, do_open, opened); +	if (err) +		goto out; +	nfs_file_set_open_context(file, ctx); -	/* If the open_intent is for execute, we have an extra check to make */ -	if (ctx->mode & FMODE_EXEC) { -		ret = nfs_may_open(ctx->path.dentry->d_inode, -				ctx->cred, -				nd->intent.open.flags); -		if (ret < 0) -			goto out; -	} -	filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open); -	if (IS_ERR(filp)) -		ret = PTR_ERR(filp); -	else -		nfs_file_set_open_context(filp, ctx);  out: -	put_nfs_open_context(ctx); -	return ret; +	return err;  } -static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +int nfs_atomic_open(struct inode *dir, struct dentry *dentry, +		    struct file *file, unsigned open_flags, +		    umode_t mode, int *opened)  {  	struct nfs_open_context *ctx; -	struct iattr attr; -	struct dentry *res = NULL; +	struct dentry *res; +	struct iattr attr = { .ia_valid = ATTR_OPEN };  	struct inode *inode; -	int open_flags; +	unsigned int lookup_flags = 0;  	int err; -	dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	/* Expect a negative dentry */ +	BUG_ON(dentry->d_inode); -	/* Check that we are indeed trying to open this file */ -	if (!is_atomic_open(nd)) -		goto no_open; +	dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry); -	if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { -		res = ERR_PTR(-ENAMETOOLONG); -		goto out; +	err = nfs_check_flags(open_flags); +	if (err) +		return err; + +	/* NFS only supports OPEN on regular files */ +	if ((open_flags & O_DIRECTORY)) { +		if (!d_unhashed(dentry)) { +			/* +			 * Hashed negative dentry with O_DIRECTORY: dentry was +			 * revalidated and is fine, no need to perform lookup +			 * again +			 */ +			return -ENOENT; +		} +		lookup_flags 
= LOOKUP_OPEN|LOOKUP_DIRECTORY; +		goto no_open;  	} -	dentry->d_op = NFS_PROTO(dir)->dentry_ops; -	/* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash -	 * the dentry. */ -	if (nd->flags & LOOKUP_EXCL) { -		d_instantiate(dentry, NULL); -		goto out; +	if (dentry->d_name.len > NFS_SERVER(dir)->namelen) +		return -ENAMETOOLONG; + +	if (open_flags & O_CREAT) { +		attr.ia_valid |= ATTR_MODE; +		attr.ia_mode = mode & ~current_umask(); +	} +	if (open_flags & O_TRUNC) { +		attr.ia_valid |= ATTR_SIZE; +		attr.ia_size = 0;  	} -	ctx = nameidata_to_nfs_open_context(dentry, nd); -	res = ERR_CAST(ctx); +	ctx = create_nfs_open_context(dentry, open_flags); +	err = PTR_ERR(ctx);  	if (IS_ERR(ctx))  		goto out; -	open_flags = nd->intent.open.flags; -	if (nd->flags & LOOKUP_CREATE) { -		attr.ia_mode = nd->intent.open.create_mode; -		attr.ia_valid = ATTR_MODE; -		if (!IS_POSIXACL(dir)) -			attr.ia_mode &= ~current_umask(); -	} else { -		open_flags &= ~(O_EXCL | O_CREAT); -		attr.ia_valid = 0; -	} - -	/* Open the file on the server */ +	trace_nfs_atomic_open_enter(dir, ctx, open_flags);  	nfs_block_sillyrename(dentry->d_parent); -	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); +	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); +	nfs_unblock_sillyrename(dentry->d_parent);  	if (IS_ERR(inode)) { -		nfs_unblock_sillyrename(dentry->d_parent); +		err = PTR_ERR(inode); +		trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);  		put_nfs_open_context(ctx); -		switch (PTR_ERR(inode)) { -			/* Make a negative dentry */ -			case -ENOENT: -				d_add(dentry, NULL); -				res = NULL; -				goto out; -			/* This turned out not to be a regular file */ -			case -EISDIR: -			case -ENOTDIR: +		switch (err) { +		case -ENOENT: +			d_drop(dentry); +			d_add(dentry, NULL); +			break; +		case -EISDIR: +		case -ENOTDIR: +			goto no_open; +		case -ELOOP: +			if (!(open_flags & O_NOFOLLOW))  				goto no_open; -			case -ELOOP: -				if 
(!(nd->intent.open.flags & O_NOFOLLOW)) -					goto no_open; +			break;  			/* case -EINVAL: */ -			default: -				res = ERR_CAST(inode); -				goto out; +		default: +			break;  		} +		goto out;  	} -	res = d_add_unique(dentry, inode); -	nfs_unblock_sillyrename(dentry->d_parent); -	if (res != NULL) { -		dput(ctx->path.dentry); -		ctx->path.dentry = dget(res); -		dentry = res; -	} -	err = nfs_intent_set_file(nd, ctx); -	if (err < 0) { -		if (res != NULL) -			dput(res); -		return ERR_PTR(err); -	} + +	err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened); +	trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); +	put_nfs_open_context(ctx);  out: -	nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); -	return res; +	return err; +  no_open: -	return nfs_lookup(dir, dentry, nd); +	res = nfs_lookup(dir, dentry, lookup_flags); +	err = PTR_ERR(res); +	if (IS_ERR(res)) +		goto out; + +	return finish_no_open(file, res);  } +EXPORT_SYMBOL_GPL(nfs_atomic_open); -static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) +static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)  {  	struct dentry *parent = NULL; -	struct inode *inode = dentry->d_inode; +	struct inode *inode;  	struct inode *dir; -	struct nfs_open_context *ctx; -	int openflags, ret = 0; +	int ret = 0; + +	if (flags & LOOKUP_RCU) +		return -ECHILD; -	if (!is_atomic_open(nd) || d_mountpoint(dentry)) +	if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) +		goto no_open; +	if (d_mountpoint(dentry)) +		goto no_open; +	if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)  		goto no_open; +	inode = dentry->d_inode;  	parent = dget_parent(dentry);  	dir = parent->d_inode; @@ -1395,7 +1552,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)  	 * optimize away revalidation of negative dentries.  	 
*/  	if (inode == NULL) { -		if (!nfs_neg_need_reval(dir, dentry, nd)) +		if (!nfs_neg_need_reval(dir, dentry, flags))  			ret = 1;  		goto out;  	} @@ -1403,99 +1560,21 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)  	/* NFS only supports OPEN on regular files */  	if (!S_ISREG(inode->i_mode))  		goto no_open_dput; -	openflags = nd->intent.open.flags;  	/* We cannot do exclusive creation on a positive dentry */ -	if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) +	if (flags & LOOKUP_EXCL)  		goto no_open_dput; -	/* We can't create new files, or truncate existing ones here */ -	openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); -	ctx = nameidata_to_nfs_open_context(dentry, nd); -	ret = PTR_ERR(ctx); -	if (IS_ERR(ctx)) -		goto out; -	/* -	 * Note: we're not holding inode->i_mutex and so may be racing with -	 * operations that change the directory. We therefore save the -	 * change attribute *before* we do the RPC call. -	 */ -	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); -	if (IS_ERR(inode)) { -		ret = PTR_ERR(inode); -		switch (ret) { -		case -EPERM: -		case -EACCES: -		case -EDQUOT: -		case -ENOSPC: -		case -EROFS: -			goto out_put_ctx; -		default: -			goto out_drop; -		} -	} -	iput(inode); -	if (inode != dentry->d_inode) -		goto out_drop; +	/* Let f_op->open() actually open (and revalidate) the file */ +	ret = 1; -	nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); -	ret = nfs_intent_set_file(nd, ctx); -	if (ret >= 0) -		ret = 1;  out:  	dput(parent);  	return ret; -out_drop: -	d_drop(dentry); -	ret = 0; -out_put_ctx: -	put_nfs_open_context(ctx); -	goto out;  no_open_dput:  	dput(parent);  no_open: -	return nfs_lookup_revalidate(dentry, nd); -} - -static int nfs_open_create(struct inode *dir, struct dentry *dentry, int mode, -		struct nameidata *nd) -{ -	struct nfs_open_context *ctx = NULL; -	struct iattr attr; -	int error; -	int open_flags = 0; - -	dfprintk(VFS, "NFS: create(%s/%ld), %s\n", -			
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - -	attr.ia_mode = mode; -	attr.ia_valid = ATTR_MODE; - -	if ((nd->flags & LOOKUP_CREATE) != 0) { -		open_flags = nd->intent.open.flags; - -		ctx = nameidata_to_nfs_open_context(dentry, nd); -		error = PTR_ERR(ctx); -		if (IS_ERR(ctx)) -			goto out_err_drop; -	} - -	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); -	if (error != 0) -		goto out_put_ctx; -	if (ctx != NULL) { -		error = nfs_intent_set_file(nd, ctx); -		if (error < 0) -			goto out_err; -	} -	return 0; -out_put_ctx: -	if (ctx != NULL) -		put_nfs_open_context(ctx); -out_err_drop: -	d_drop(dentry); -out_err: -	return error; +	return nfs_lookup_revalidate(dentry, flags);  }  #endif /* CONFIG_NFSV4 */ @@ -1504,7 +1583,8 @@ out_err:   * Code common to create, mkdir, and mknod.   */  int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, -				struct nfs_fattr *fattr) +				struct nfs_fattr *fattr, +				struct nfs4_label *label)  {  	struct dentry *parent = dget_parent(dentry);  	struct inode *dir = parent->d_inode; @@ -1517,18 +1597,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,  	if (dentry->d_inode)  		goto out;  	if (fhandle->size == 0) { -		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); +		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);  		if (error)  			goto out_error;  	}  	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));  	if (!(fattr->valid & NFS_ATTR_FATTR)) {  		struct nfs_server *server = NFS_SB(dentry->d_sb); -		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); +		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);  		if (error < 0)  			goto out_error;  	} -	inode = nfs_fhget(dentry->d_sb, fhandle, fattr); +	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);  	error = PTR_ERR(inode);  	if (IS_ERR(inode))  		goto out_error; @@ -1541,6 +1621,7 @@ out_error:  	dput(parent);  	return 
error;  } +EXPORT_SYMBOL_GPL(nfs_instantiate);  /*   * Following a failed create operation, we drop the dentry rather @@ -1548,19 +1629,22 @@ out_error:   * that the operation succeeded on the server, but an error in the   * reply path made it appear to have failed.   */ -static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, -		struct nameidata *nd) +int nfs_create(struct inode *dir, struct dentry *dentry, +		umode_t mode, bool excl)  {  	struct iattr attr; +	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;  	int error; -	dfprintk(VFS, "NFS: create(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry);  	attr.ia_mode = mode;  	attr.ia_valid = ATTR_MODE; -	error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL); +	trace_nfs_create_enter(dir, dentry, open_flags); +	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); +	trace_nfs_create_exit(dir, dentry, open_flags, error);  	if (error != 0)  		goto out_err;  	return 0; @@ -1568,18 +1652,19 @@ out_err:  	d_drop(dentry);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_create);  /*   * See comments for nfs_proc_create regarding failed operations.   
*/ -static int -nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +int +nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)  {  	struct iattr attr;  	int status; -	dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry);  	if (!new_valid_dev(rdev))  		return -EINVAL; @@ -1587,7 +1672,9 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)  	attr.ia_mode = mode;  	attr.ia_valid = ATTR_MODE; +	trace_nfs_mknod_enter(dir, dentry);  	status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); +	trace_nfs_mknod_exit(dir, dentry, status);  	if (status != 0)  		goto out_err;  	return 0; @@ -1595,22 +1682,25 @@ out_err:  	d_drop(dentry);  	return status;  } +EXPORT_SYMBOL_GPL(nfs_mknod);  /*   * See comments for nfs_proc_create regarding failed operations.   */ -static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  {  	struct iattr attr;  	int error; -	dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry);  	attr.ia_valid = ATTR_MODE;  	attr.ia_mode = mode | S_IFDIR; +	trace_nfs_mkdir_enter(dir, dentry);  	error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); +	trace_nfs_mkdir_exit(dir, dentry, error);  	if (error != 0)  		goto out_err;  	return 0; @@ -1618,6 +1708,7 @@ out_err:  	d_drop(dentry);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_mkdir);  static void nfs_dentry_handle_enoent(struct dentry *dentry)  { @@ -1625,22 +1716,32 @@ static void nfs_dentry_handle_enoent(struct dentry *dentry)  		d_delete(dentry);  } -static int nfs_rmdir(struct inode *dir, struct dentry *dentry) +int nfs_rmdir(struct inode *dir, struct dentry *dentry)  {  	int error; -	dfprintk(VFS, 
"NFS: rmdir(%s/%ld), %s\n", -			dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", +			dir->i_sb->s_id, dir->i_ino, dentry); -	error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); -	/* Ensure the VFS deletes this inode */ -	if (error == 0 && dentry->d_inode != NULL) -		clear_nlink(dentry->d_inode); -	else if (error == -ENOENT) -		nfs_dentry_handle_enoent(dentry); +	trace_nfs_rmdir_enter(dir, dentry); +	if (dentry->d_inode) { +		nfs_wait_on_sillyrename(dentry); +		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); +		/* Ensure the VFS deletes this inode */ +		switch (error) { +		case 0: +			clear_nlink(dentry->d_inode); +			break; +		case -ENOENT: +			nfs_dentry_handle_enoent(dentry); +		} +	} else +		error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); +	trace_nfs_rmdir_exit(dir, dentry, error);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_rmdir);  /*   * Remove a file after making sure there are no pending writes, @@ -1655,8 +1756,7 @@ static int nfs_safe_remove(struct dentry *dentry)  	struct inode *inode = dentry->d_inode;  	int error = -EBUSY; -	dfprintk(VFS, "NFS: safe_remove(%s/%s)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry);  	/* If the dentry was sillyrenamed, we simply call d_delete() */  	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { @@ -1664,17 +1764,17 @@ static int nfs_safe_remove(struct dentry *dentry)  		goto out;  	} +	trace_nfs_remove_enter(dir, dentry);  	if (inode != NULL) { -		nfs_inode_return_delegation(inode); +		NFS_PROTO(inode)->return_delegation(inode);  		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); -		/* The VFS may want to delete this inode */  		if (error == 0)  			nfs_drop_nlink(inode); -		nfs_mark_for_revalidate(inode);  	} else  		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);  	if (error == -ENOENT)  		nfs_dentry_handle_enoent(dentry); +	trace_nfs_remove_exit(dir, dentry, error);  out:  	return error;  } @@ 
-1684,37 +1784,38 @@ out:   *   *  If sillyrename() returns 0, we do nothing, otherwise we unlink.   */ -static int nfs_unlink(struct inode *dir, struct dentry *dentry) +int nfs_unlink(struct inode *dir, struct dentry *dentry)  {  	int error;  	int need_rehash = 0; -	dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, -		dir->i_ino, dentry->d_name.name); +	dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, +		dir->i_ino, dentry); -	spin_lock(&dcache_lock); +	trace_nfs_unlink_enter(dir, dentry);  	spin_lock(&dentry->d_lock); -	if (atomic_read(&dentry->d_count) > 1) { +	if (d_count(dentry) > 1) {  		spin_unlock(&dentry->d_lock); -		spin_unlock(&dcache_lock);  		/* Start asynchronous writeout of the inode */  		write_inode_now(dentry->d_inode, 0);  		error = nfs_sillyrename(dir, dentry); -		return error; +		goto out;  	}  	if (!d_unhashed(dentry)) {  		__d_drop(dentry);  		need_rehash = 1;  	}  	spin_unlock(&dentry->d_lock); -	spin_unlock(&dcache_lock);  	error = nfs_safe_remove(dentry);  	if (!error || error == -ENOENT) {  		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));  	} else if (need_rehash)  		d_rehash(dentry); +out: +	trace_nfs_unlink_exit(dir, dentry, error);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_unlink);  /*   * To create a symbolic link, most file systems instantiate a new inode, @@ -1731,17 +1832,16 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)   * now have a new file handle and can instantiate an in-core NFS inode   * and move the raw page into its mapping.   
*/ -static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)  { -	struct pagevec lru_pvec;  	struct page *page;  	char *kaddr;  	struct iattr attr;  	unsigned int pathlen = strlen(symname);  	int error; -	dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, -		dir->i_ino, dentry->d_name.name, symname); +	dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, +		dir->i_ino, dentry, symname);  	if (pathlen > PAGE_SIZE)  		return -ENAMETOOLONG; @@ -1753,17 +1853,19 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym  	if (!page)  		return -ENOMEM; -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memcpy(kaddr, symname, pathlen);  	if (pathlen < PAGE_SIZE)  		memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr); +	trace_nfs_symlink_enter(dir, dentry);  	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); +	trace_nfs_symlink_exit(dir, dentry, error);  	if (error != 0) { -		dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", +		dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",  			dir->i_sb->s_id, dir->i_ino, -			dentry->d_name.name, symname, error); +			dentry, symname, error);  		d_drop(dentry);  		__free_page(page);  		return error; @@ -1773,30 +1875,33 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym  	 * No big deal if we can't add this page to the page cache here.  	 * READLINK will get the missing page from the server if needed.  	 
*/ -	pagevec_init(&lru_pvec, 0); -	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, +	if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,  							GFP_KERNEL)) { -		pagevec_add(&lru_pvec, page); -		pagevec_lru_add_file(&lru_pvec);  		SetPageUptodate(page);  		unlock_page(page); +		/* +		 * add_to_page_cache_lru() grabs an extra page refcount. +		 * Drop it here to avoid leaking this page later. +		 */ +		page_cache_release(page);  	} else  		__free_page(page);  	return 0;  } +EXPORT_SYMBOL_GPL(nfs_symlink); -static int  +int  nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)  {  	struct inode *inode = old_dentry->d_inode;  	int error; -	dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n", -		old_dentry->d_parent->d_name.name, old_dentry->d_name.name, -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", +		old_dentry, dentry); -	nfs_inode_return_delegation(inode); +	trace_nfs_link_enter(inode, dir, dentry); +	NFS_PROTO(inode)->return_delegation(inode);  	d_drop(dentry);  	error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); @@ -1804,8 +1909,10 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)  		ihold(inode);  		d_add(dentry, inode);  	} +	trace_nfs_link_exit(inode, dir, dentry, error);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_link);  /*   * RENAME @@ -1831,19 +1938,20 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)   * If these conditions are met, we can drop the dentries before doing   * the rename.   
*/ -static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, +int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,  		      struct inode *new_dir, struct dentry *new_dentry)  {  	struct inode *old_inode = old_dentry->d_inode;  	struct inode *new_inode = new_dentry->d_inode;  	struct dentry *dentry = NULL, *rehash = NULL; +	struct rpc_task *task;  	int error = -EBUSY; -	dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", -		 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, -		 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, -		 atomic_read(&new_dentry->d_count)); +	dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", +		 old_dentry, new_dentry, +		 d_count(new_dentry)); +	trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry);  	/*  	 * For non-directories, check whether the target is busy and if so,  	 * make a copy of the dentry and then do a silly-rename. If the @@ -1860,7 +1968,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,  			rehash = new_dentry;  		} -		if (atomic_read(&new_dentry->d_count) > 2) { +		if (d_count(new_dentry) > 2) {  			int err;  			/* copy the target dentry's name */ @@ -1880,16 +1988,26 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,  		}  	} -	nfs_inode_return_delegation(old_inode); +	NFS_PROTO(old_inode)->return_delegation(old_inode);  	if (new_inode != NULL) -		nfs_inode_return_delegation(new_inode); +		NFS_PROTO(new_inode)->return_delegation(new_inode); + +	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); +	if (IS_ERR(task)) { +		error = PTR_ERR(task); +		goto out; +	} -	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, -					   new_dir, &new_dentry->d_name); +	error = rpc_wait_for_completion_task(task); +	if (error == 0) +		error = task->tk_status; +	rpc_put_task(task);  	nfs_mark_for_revalidate(old_inode);  out:  	if (rehash)  		d_rehash(rehash); +	trace_nfs_rename_exit(old_dir, 
old_dentry, +			new_dir, new_dentry, error);  	if (!error) {  		if (new_inode != NULL)  			nfs_drop_nlink(new_inode); @@ -1904,6 +2022,7 @@ out:  		dput(dentry);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_rename);  static DEFINE_SPINLOCK(nfs_access_lru_lock);  static LIST_HEAD(nfs_access_lru_list); @@ -1913,9 +2032,9 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)  {  	put_rpccred(entry->cred);  	kfree(entry); -	smp_mb__before_atomic_dec(); +	smp_mb__before_atomic();  	atomic_long_dec(&nfs_access_nr_entries); -	smp_mb__after_atomic_dec(); +	smp_mb__after_atomic();  }  static void nfs_access_free_list(struct list_head *head) @@ -1929,14 +2048,18 @@ static void nfs_access_free_list(struct list_head *head)  	}  } -int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +unsigned long +nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)  {  	LIST_HEAD(head);  	struct nfs_inode *nfsi, *next;  	struct nfs_access_entry *cache; +	int nr_to_scan = sc->nr_to_scan; +	gfp_t gfp_mask = sc->gfp_mask; +	long freed = 0;  	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) -		return (nr_to_scan == 0) ? 
0 : -1; +		return SHRINK_STOP;  	spin_lock(&nfs_access_lru_lock);  	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { @@ -1952,21 +2075,28 @@ int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp  				struct nfs_access_entry, lru);  		list_move(&cache->lru, &head);  		rb_erase(&cache->rb_node, &nfsi->access_cache); +		freed++;  		if (!list_empty(&nfsi->access_cache_entry_lru))  			list_move_tail(&nfsi->access_cache_inode_lru,  					&nfs_access_lru_list);  		else {  remove_lru_entry:  			list_del_init(&nfsi->access_cache_inode_lru); -			smp_mb__before_clear_bit(); +			smp_mb__before_atomic();  			clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); -			smp_mb__after_clear_bit(); +			smp_mb__after_atomic();  		}  		spin_unlock(&inode->i_lock);  	}  	spin_unlock(&nfs_access_lru_lock);  	nfs_access_free_list(&head); -	return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; +	return freed; +} + +unsigned long +nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ +	return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));  }  static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) @@ -2001,6 +2131,7 @@ void nfs_access_zap_cache(struct inode *inode)  	spin_unlock(&nfs_access_lru_lock);  	nfs_access_free_list(&head);  } +EXPORT_SYMBOL_GPL(nfs_access_zap_cache);  static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)  { @@ -2088,7 +2219,7 @@ found:  	nfs_access_free_entry(entry);  } -static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)  {  	struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);  	if (cache == NULL) @@ -2101,9 +2232,9 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s  	nfs_access_add_rbtree(inode, cache);  	/* 
Update accounting */ -	smp_mb__before_atomic_inc(); +	smp_mb__before_atomic();  	atomic_long_inc(&nfs_access_nr_entries); -	smp_mb__after_atomic_inc(); +	smp_mb__after_atomic();  	/* Add inode to global LRU list */  	if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { @@ -2114,15 +2245,31 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s  		spin_unlock(&nfs_access_lru_lock);  	}  } +EXPORT_SYMBOL_GPL(nfs_access_add_cache); + +void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) +{ +	entry->mask = 0; +	if (access_result & NFS4_ACCESS_READ) +		entry->mask |= MAY_READ; +	if (access_result & +	    (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) +		entry->mask |= MAY_WRITE; +	if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) +		entry->mask |= MAY_EXEC; +} +EXPORT_SYMBOL_GPL(nfs_access_set_mask);  static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)  {  	struct nfs_access_entry cache;  	int status; +	trace_nfs_access_enter(inode); +  	status = nfs_access_get_cached(inode, cred, &cache);  	if (status == 0) -		goto out; +		goto out_cached;  	/* Be clever: ask server to check for all possible rights */  	cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; @@ -2135,25 +2282,31 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)  			if (!S_ISDIR(inode->i_mode))  				set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);  		} -		return status; +		goto out;  	}  	nfs_access_add_cache(inode, &cache); +out_cached: +	if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) +		status = -EACCES;  out: -	if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) -		return 0; -	return -EACCES; +	trace_nfs_access_exit(inode, status); +	return status;  }  static int nfs_open_permission_mask(int openflags)  {  	int mask = 0; -	if (openflags & FMODE_READ) -		mask |= MAY_READ; -	if (openflags & FMODE_WRITE) -		mask |= MAY_WRITE; -	if 
(openflags & FMODE_EXEC) -		mask |= MAY_EXEC; +	if (openflags & __FMODE_EXEC) { +		/* ONLY check exec rights */ +		mask = MAY_EXEC; +	} else { +		if ((openflags & O_ACCMODE) != O_WRONLY) +			mask |= MAY_READ; +		if ((openflags & O_ACCMODE) != O_RDONLY) +			mask |= MAY_WRITE; +	} +  	return mask;  } @@ -2161,12 +2314,16 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)  {  	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));  } +EXPORT_SYMBOL_GPL(nfs_may_open);  int nfs_permission(struct inode *inode, int mask)  {  	struct rpc_cred *cred;  	int res = 0; +	if (mask & MAY_NOT_BLOCK) +		return -ECHILD; +  	nfs_inc_stats(inode, NFSIOS_VFSACCESS);  	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) @@ -2179,11 +2336,6 @@ int nfs_permission(struct inode *inode, int mask)  		case S_IFLNK:  			goto out;  		case S_IFREG: -			/* NFSv4 has atomic_open... */ -			if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) -					&& (mask & MAY_OPEN) -					&& !(mask & MAY_EXEC)) -				goto out;  			break;  		case S_IFDIR:  			/* @@ -2208,15 +2360,16 @@ out:  	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))  		res = -EACCES; -	dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", +	dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",  		inode->i_sb->s_id, inode->i_ino, mask, res);  	return res;  out_notsup:  	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);  	if (res == 0) -		res = generic_permission(inode, mask, NULL); +		res = generic_permission(inode, mask);  	goto out;  } +EXPORT_SYMBOL_GPL(nfs_permission);  /*   * Local variables: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 84d3c8b9020..f11b9eed0de 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -45,17 +45,19 @@  #include <linux/pagemap.h>  #include <linux/kref.h>  #include <linux/slab.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/module.h>  #include <linux/nfs_fs.h>  #include <linux/nfs_page.h>  #include <linux/sunrpc/clnt.h> 
-#include <asm/system.h>  #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include "internal.h"  #include "iostat.h" +#include "pnfs.h"  #define NFSDBG_FACILITY		NFSDBG_VFS @@ -77,20 +79,24 @@ struct nfs_direct_req {  	atomic_t		io_count;	/* i/os we're waiting for */  	spinlock_t		lock;		/* protect completion state */  	ssize_t			count,		/* bytes actually processed */ +				bytes_left,	/* bytes left to be sent */  				error;		/* any reported error */  	struct completion	completion;	/* wait for i/o completion */  	/* commit state */ -	struct list_head	rewrite_list;	/* saved nfs_write_data structs */ -	struct nfs_write_data *	commit_data;	/* special write_data for commits */ +	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */ +	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */ +	struct work_struct	work;  	int			flags;  #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */  #define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */  	struct nfs_writeverf	verf;		/* unstable write verifier */  }; +static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; +static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;  static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); -static const struct rpc_call_ops nfs_write_direct_ops; +static void nfs_direct_write_schedule_work(struct work_struct *work);  static inline void get_dreq(struct nfs_direct_req *dreq)  { @@ -102,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq)  	return atomic_dec_and_test(&dreq->io_count);  } +/* + * nfs_direct_select_verf - select the right verifier + * @dreq - direct request possibly spanning multiple servers + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs + * @ds_idx - index of data server in data server list, only valid if ds_clp set + * + * returns the correct verifier to use given the role of the server + */ 
+static struct nfs_writeverf * +nfs_direct_select_verf(struct nfs_direct_req *dreq, +		       struct nfs_client *ds_clp, +		       int ds_idx) +{ +	struct nfs_writeverf *verfp = &dreq->verf; + +#ifdef CONFIG_NFS_V4_1 +	if (ds_clp) { +		/* pNFS is in use, use the DS verf */ +		if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) +			verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; +		else +			WARN_ON_ONCE(1); +	} +#endif +	return verfp; +} + + +/* + * nfs_direct_set_hdr_verf - set the write/commit verifier + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verfs + * + * Set the server's (MDS or DS) "seen" verifier + */ +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, +				    struct nfs_pgio_header *hdr) +{ +	struct nfs_writeverf *verfp; + +	verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, +				      hdr->data->ds_idx); +	WARN_ON_ONCE(verfp->committed >= 0); +	memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +	WARN_ON_ONCE(verfp->committed < 0); +} + +/* + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verf + * + * set the server's "seen" verf if not initialized. 
+ * returns result of comparison between @hdr->verf and the "seen" + * verf of the server used by @hdr (DS or MDS) + */ +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, +					  struct nfs_pgio_header *hdr) +{ +	struct nfs_writeverf *verfp; + +	verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, +					 hdr->data->ds_idx); +	if (verfp->committed < 0) { +		nfs_direct_set_hdr_verf(dreq, hdr); +		return 0; +	} +	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +} + +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data + * @dreq - direct request possibly spanning multiple servers + * @data - commit data to validate against previously seen verf + * + * returns result of comparison between @data->verf and the verf of + * the server used by @data (DS or MDS) + */ +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, +					   struct nfs_commit_data *data) +{ +	struct nfs_writeverf *verfp; + +	verfp = nfs_direct_select_verf(dreq, data->ds_clp, +					 data->ds_commit_index); +	WARN_ON_ONCE(verfp->committed < 0); +	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); +} +#endif +  /**   * nfs_direct_IO - NFS address space operation for direct I/O   * @rw: direction (read or write) @@ -111,33 +208,26 @@ static inline int put_dreq(struct nfs_direct_req *dreq)   * @nr_segs: size of iovec array   *   * The presence of this routine in the address space ops vector means - * the NFS client supports direct I/O.  However, we shunt off direct - * read and write requests before the VFS gets them, so this method - * should never be called. + * the NFS client supports direct I/O. However, for most direct IO, we + * shunt off direct read and write requests before the VFS gets them, + * so this method is only ever called for swap.   
*/ -ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)  { -	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", -			iocb->ki_filp->f_path.dentry->d_name.name, -			(long long) pos, nr_segs); +#ifndef CONFIG_NFS_SWAP +	dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", +			iocb->ki_filp, (long long) pos, iter->nr_segs);  	return -EINVAL; -} - -static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count) -{ -	unsigned int npages; -	unsigned int i; +#else +	VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); -	if (count == 0) -		return; -	pages += (pgbase >> PAGE_SHIFT); -	npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; -	for (i = 0; i < npages; i++) { -		struct page *page = pages[i]; -		if (!PageCompound(page)) -			set_page_dirty(page); -	} +	if (rw == READ || rw == KERNEL_READ) +		return nfs_file_direct_read(iocb, iter, pos, +				rw == READ ? true : false); +	return nfs_file_direct_write(iocb, iter, pos, +				rw == WRITE ? 
true : false); +#endif /* CONFIG_NFS_SWAP */  }  static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -147,26 +237,31 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)  		page_cache_release(pages[i]);  } +void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, +			      struct nfs_direct_req *dreq) +{ +	cinfo->lock = &dreq->lock; +	cinfo->mds = &dreq->mds_cinfo; +	cinfo->ds = &dreq->ds_cinfo; +	cinfo->dreq = dreq; +	cinfo->completion_ops = &nfs_direct_commit_completion_ops; +} +  static inline struct nfs_direct_req *nfs_direct_req_alloc(void)  {  	struct nfs_direct_req *dreq; -	dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL); +	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);  	if (!dreq)  		return NULL;  	kref_init(&dreq->kref);  	kref_get(&dreq->kref);  	init_completion(&dreq->completion); -	INIT_LIST_HEAD(&dreq->rewrite_list); -	dreq->iocb = NULL; -	dreq->ctx = NULL; -	dreq->l_ctx = NULL; +	INIT_LIST_HEAD(&dreq->mds_cinfo.list); +	dreq->verf.committed = NFS_INVALID_STABLE_HOW;	/* not set yet */ +	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);  	spin_lock_init(&dreq->lock); -	atomic_set(&dreq->io_count, 0); -	dreq->count = 0; -	dreq->error = 0; -	dreq->flags = 0;  	return dreq;  } @@ -187,6 +282,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)  	kref_put(&dreq->kref, nfs_direct_req_free);  } +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +{ +	return dreq->bytes_left; +} +EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); +  /*   * Collects and returns the final error value/byte-count.   */ @@ -213,62 +314,97 @@ out:   * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust   * the iocb is still valid here if this is a synchronous request.   
*/ -static void nfs_direct_complete(struct nfs_direct_req *dreq) +static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)  { +	struct inode *inode = dreq->inode; + +	if (dreq->iocb && write) { +		loff_t pos = dreq->iocb->ki_pos + dreq->count; + +		spin_lock(&inode->i_lock); +		if (i_size_read(inode) < pos) +			i_size_write(inode, pos); +		spin_unlock(&inode->i_lock); +	} + +	if (write) +		nfs_zap_mapping(inode, inode->i_mapping); + +	inode_dio_done(inode); +  	if (dreq->iocb) {  		long res = (long) dreq->error;  		if (!res)  			res = (long) dreq->count;  		aio_complete(dreq->iocb, res, 0);  	} +  	complete_all(&dreq->completion);  	nfs_direct_req_release(dreq);  } -/* - * We must hold a reference to all the pages in this direct read request - * until the RPCs complete.  This could be long *after* we are woken up in - * nfs_direct_wait (for instance, if someone hits ^C on a slow server). - */ -static void nfs_direct_read_result(struct rpc_task *task, void *calldata) +static void nfs_direct_readpage_release(struct nfs_page *req)  { -	struct nfs_read_data *data = calldata; - -	nfs_readpage_result(task, data); +	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", +		req->wb_context->dentry->d_inode->i_sb->s_id, +		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), +		req->wb_bytes, +		(long long)req_offset(req)); +	nfs_release_request(req);  } -static void nfs_direct_read_release(void *calldata) +static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)  { +	unsigned long bytes = 0; +	struct nfs_direct_req *dreq = hdr->dreq; -	struct nfs_read_data *data = calldata; -	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; -	int status = data->task.tk_status; +	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) +		goto out_put;  	spin_lock(&dreq->lock); -	if (unlikely(status < 0)) { -		dreq->error = status; -		spin_unlock(&dreq->lock); -	} else { -		dreq->count += data->res.count; -		spin_unlock(&dreq->lock); -		
nfs_direct_dirty_pages(data->pagevec, -				data->args.pgbase, -				data->res.count); -	} -	nfs_direct_release_pages(data->pagevec, data->npages); +	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) +		dreq->error = hdr->error; +	else +		dreq->count += hdr->good_bytes; +	spin_unlock(&dreq->lock); + +	while (!list_empty(&hdr->pages)) { +		struct nfs_page *req = nfs_list_entry(hdr->pages.next); +		struct page *page = req->wb_page; +		if (!PageCompound(page) && bytes < hdr->good_bytes) +			set_page_dirty(page); +		bytes += req->wb_bytes; +		nfs_list_remove_request(req); +		nfs_direct_readpage_release(req); +	} +out_put:  	if (put_dreq(dreq)) -		nfs_direct_complete(dreq); -	nfs_readdata_free(data); +		nfs_direct_complete(dreq, false); +	hdr->release(hdr); +} + +static void nfs_read_sync_pgio_error(struct list_head *head) +{ +	struct nfs_page *req; + +	while (!list_empty(head)) { +		req = nfs_list_entry(head->next); +		nfs_list_remove_request(req); +		nfs_release_request(req); +	}  } -static const struct rpc_call_ops nfs_read_direct_ops = { -#if defined(CONFIG_NFS_V4_1) -	.rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */ -	.rpc_call_done = nfs_direct_read_result, -	.rpc_release = nfs_direct_read_release, +static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) +{ +	get_dreq(hdr->dreq); +} + +static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { +	.error_cleanup = nfs_read_sync_pgio_error, +	.init_hdr = nfs_direct_pgio_init, +	.completion = nfs_direct_read_completion,  };  /* @@ -278,331 +414,272 @@ static const struct rpc_call_ops nfs_read_direct_ops = {   * handled automatically by nfs_direct_read_result().  Otherwise, if   * no requests have been sent, just return an error.   
*/ -static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, -						const struct iovec *iov, -						loff_t pos) -{ -	struct nfs_open_context *ctx = dreq->ctx; -	struct inode *inode = ctx->path.dentry->d_inode; -	unsigned long user_addr = (unsigned long)iov->iov_base; -	size_t count = iov->iov_len; -	size_t rsize = NFS_SERVER(inode)->rsize; -	struct rpc_task *task; -	struct rpc_message msg = { -		.rpc_cred = ctx->cred, -	}; -	struct rpc_task_setup task_setup_data = { -		.rpc_client = NFS_CLIENT(inode), -		.rpc_message = &msg, -		.callback_ops = &nfs_read_direct_ops, -		.workqueue = nfsiod_workqueue, -		.flags = RPC_TASK_ASYNC, -	}; -	unsigned int pgbase; -	int result; -	ssize_t started = 0; - -	do { -		struct nfs_read_data *data; -		size_t bytes; - -		pgbase = user_addr & ~PAGE_MASK; -		bytes = min(rsize,count); - -		result = -ENOMEM; -		data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes)); -		if (unlikely(!data)) -			break; - -		down_read(¤t->mm->mmap_sem); -		result = get_user_pages(current, current->mm, user_addr, -					data->npages, 1, 0, data->pagevec, NULL); -		up_read(¤t->mm->mmap_sem); -		if (result < 0) { -			nfs_readdata_free(data); -			break; -		} -		if ((unsigned)result < data->npages) { -			bytes = result * PAGE_SIZE; -			if (bytes <= pgbase) { -				nfs_direct_release_pages(data->pagevec, result); -				nfs_readdata_free(data); -				break; -			} -			bytes -= pgbase; -			data->npages = result; -		} - -		get_dreq(dreq); - -		data->req = (struct nfs_page *) dreq; -		data->inode = inode; -		data->cred = msg.rpc_cred; -		data->args.fh = NFS_FH(inode); -		data->args.context = ctx; -		data->args.lock_context = dreq->l_ctx; -		data->args.offset = pos; -		data->args.pgbase = pgbase; -		data->args.pages = data->pagevec; -		data->args.count = bytes; -		data->res.fattr = &data->fattr; -		data->res.eof = 0; -		data->res.count = bytes; -		nfs_fattr_init(&data->fattr); -		msg.rpc_argp = &data->args; -		msg.rpc_resp = &data->res; - -		
task_setup_data.task = &data->task; -		task_setup_data.callback_data = data; -		NFS_PROTO(inode)->read_setup(data, &msg); - -		task = rpc_run_task(&task_setup_data); -		if (IS_ERR(task)) -			break; -		rpc_put_task(task); - -		dprintk("NFS: %5u initiated direct read call " -			"(req %s/%Ld, %zu bytes @ offset %Lu)\n", -				data->task.tk_pid, -				inode->i_sb->s_id, -				(long long)NFS_FILEID(inode), -				bytes, -				(unsigned long long)data->args.offset); - -		started += bytes; -		user_addr += bytes; -		pos += bytes; -		/* FIXME: Remove this unnecessary math from final patch */ -		pgbase += bytes; -		pgbase &= ~PAGE_MASK; -		BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); - -		count -= bytes; -	} while (count != 0); - -	if (started) -		return started; -	return result < 0 ? (ssize_t) result : -EFAULT; -}  static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, -					      const struct iovec *iov, -					      unsigned long nr_segs, +					      struct iov_iter *iter,  					      loff_t pos)  { +	struct nfs_pageio_descriptor desc; +	struct inode *inode = dreq->inode;  	ssize_t result = -EINVAL;  	size_t requested_bytes = 0; -	unsigned long seg; +	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); +	nfs_pageio_init_read(&desc, dreq->inode, false, +			     &nfs_direct_read_completion_ops);  	get_dreq(dreq); +	desc.pg_dreq = dreq; +	atomic_inc(&inode->i_dio_count); + +	while (iov_iter_count(iter)) { +		struct page **pagevec; +		size_t bytes; +		size_t pgbase; +		unsigned npages, i; -	for (seg = 0; seg < nr_segs; seg++) { -		const struct iovec *vec = &iov[seg]; -		result = nfs_direct_read_schedule_segment(dreq, vec, pos); +		result = iov_iter_get_pages_alloc(iter, &pagevec,  +						  rsize, &pgbase);  		if (result < 0)  			break; -		requested_bytes += result; -		if ((size_t)result < vec->iov_len) +	 +		bytes = result; +		iov_iter_advance(iter, bytes); +		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; +		for (i = 0; i < npages; i++) { 
+			struct nfs_page *req; +			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); +			/* XXX do we need to do the eof zeroing found in async_filler? */ +			req = nfs_create_request(dreq->ctx, pagevec[i], NULL, +						 pgbase, req_len); +			if (IS_ERR(req)) { +				result = PTR_ERR(req); +				break; +			} +			req->wb_index = pos >> PAGE_SHIFT; +			req->wb_offset = pos & ~PAGE_MASK; +			if (!nfs_pageio_add_request(&desc, req)) { +				result = desc.pg_error; +				nfs_release_request(req); +				break; +			} +			pgbase = 0; +			bytes -= req_len; +			requested_bytes += req_len; +			pos += req_len; +			dreq->bytes_left -= req_len; +		} +		nfs_direct_release_pages(pagevec, npages); +		kvfree(pagevec); +		if (result < 0)  			break; -		pos += vec->iov_len;  	} -	if (put_dreq(dreq)) -		nfs_direct_complete(dreq); +	nfs_pageio_complete(&desc); -	if (requested_bytes != 0) -		return 0; +	/* +	 * If no bytes were started, return the error, and let the +	 * generic layer handle the completion. +	 */ +	if (requested_bytes == 0) { +		inode_dio_done(inode); +		nfs_direct_req_release(dreq); +		return result < 0 ? result : -EIO; +	} -	if (result < 0) -		return result; -	return -EIO; +	if (put_dreq(dreq)) +		nfs_direct_complete(dreq, false); +	return 0;  } -static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, -			       unsigned long nr_segs, loff_t pos) +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iter: vector of user buffers into which to read data + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file.  For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change.  
Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, +				loff_t pos, bool uio)  { -	ssize_t result = -ENOMEM; -	struct inode *inode = iocb->ki_filp->f_mapping->host; +	struct file *file = iocb->ki_filp; +	struct address_space *mapping = file->f_mapping; +	struct inode *inode = mapping->host;  	struct nfs_direct_req *dreq; +	struct nfs_lock_context *l_ctx; +	ssize_t result = -EINVAL; +	size_t count = iov_iter_count(iter); +	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + +	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", +		file, count, (long long) pos); + +	result = 0; +	if (!count) +		goto out; +	mutex_lock(&inode->i_mutex); +	result = nfs_sync_mapping(mapping); +	if (result) +		goto out_unlock; + +	task_io_account_read(count); + +	result = -ENOMEM;  	dreq = nfs_direct_req_alloc();  	if (dreq == NULL) -		goto out; +		goto out_unlock;  	dreq->inode = inode; +	dreq->bytes_left = count;  	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); -	dreq->l_ctx = nfs_get_lock_context(dreq->ctx); -	if (dreq->l_ctx == NULL) +	l_ctx = nfs_get_lock_context(dreq->ctx); +	if (IS_ERR(l_ctx)) { +		result = PTR_ERR(l_ctx);  		goto out_release; +	} +	dreq->l_ctx = l_ctx;  	if (!is_sync_kiocb(iocb))  		dreq->iocb = iocb; -	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); -	if (!result) +	NFS_I(inode)->read_io += count; +	result = nfs_direct_read_schedule_iovec(dreq, iter, pos); + +	mutex_unlock(&inode->i_mutex); + +	if (!result) {  		result = nfs_direct_wait(dreq); +		if (result > 0) +			iocb->ki_pos = pos + result; +	} + +	
nfs_direct_req_release(dreq); +	return result; +  out_release:  	nfs_direct_req_release(dreq); +out_unlock: +	mutex_unlock(&inode->i_mutex);  out:  	return result;  } -static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) -{ -	while (!list_empty(&dreq->rewrite_list)) { -		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); -		list_del(&data->pages); -		nfs_direct_release_pages(data->pagevec, data->npages); -		nfs_writedata_free(data); -	} -} - -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)  static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)  { -	struct inode *inode = dreq->inode; -	struct list_head *p; -	struct nfs_write_data *data; -	struct rpc_task *task; -	struct rpc_message msg = { -		.rpc_cred = dreq->ctx->cred, -	}; -	struct rpc_task_setup task_setup_data = { -		.rpc_client = NFS_CLIENT(inode), -		.rpc_message = &msg, -		.callback_ops = &nfs_write_direct_ops, -		.workqueue = nfsiod_workqueue, -		.flags = RPC_TASK_ASYNC, -	}; +	struct nfs_pageio_descriptor desc; +	struct nfs_page *req, *tmp; +	LIST_HEAD(reqs); +	struct nfs_commit_info cinfo; +	LIST_HEAD(failed); + +	nfs_init_cinfo_from_dreq(&cinfo, dreq); +	pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); +	spin_lock(cinfo.lock); +	nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0); +	spin_unlock(cinfo.lock);  	dreq->count = 0;  	get_dreq(dreq); -	list_for_each(p, &dreq->rewrite_list) { -		data = list_entry(p, struct nfs_write_data, pages); - -		get_dreq(dreq); - -		/* Use stable writes */ -		data->args.stable = NFS_FILE_SYNC; - -		/* -		 * Reset data->res. -		 */ -		nfs_fattr_init(&data->fattr); -		data->res.count = data->args.count; -		memset(&data->verf, 0, sizeof(data->verf)); - -		/* -		 * Reuse data->task; data->args should not have changed -		 * since the original request was sent. 
-		 */ -		task_setup_data.task = &data->task; -		task_setup_data.callback_data = data; -		msg.rpc_argp = &data->args; -		msg.rpc_resp = &data->res; -		NFS_PROTO(inode)->write_setup(data, &msg); - -		/* -		 * We're called via an RPC callback, so BKL is already held. -		 */ -		task = rpc_run_task(&task_setup_data); -		if (!IS_ERR(task)) -			rpc_put_task(task); - -		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", -				data->task.tk_pid, -				inode->i_sb->s_id, -				(long long)NFS_FILEID(inode), -				data->args.count, -				(unsigned long long)data->args.offset); +	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, +			      &nfs_direct_write_completion_ops); +	desc.pg_dreq = dreq; + +	list_for_each_entry_safe(req, tmp, &reqs, wb_list) { +		if (!nfs_pageio_add_request(&desc, req)) { +			nfs_list_remove_request(req); +			nfs_list_add_request(req, &failed); +			spin_lock(cinfo.lock); +			dreq->flags = 0; +			dreq->error = -EIO; +			spin_unlock(cinfo.lock); +		} +		nfs_release_request(req);  	} +	nfs_pageio_complete(&desc); -	if (put_dreq(dreq)) -		nfs_direct_write_complete(dreq, inode); -} - -static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) -{ -	struct nfs_write_data *data = calldata; +	while (!list_empty(&failed)) { +		req = nfs_list_entry(failed.next); +		nfs_list_remove_request(req); +		nfs_unlock_and_release_request(req); +	} -	/* Call the NFS version-specific code */ -	NFS_PROTO(data->inode)->commit_done(task, data); +	if (put_dreq(dreq)) +		nfs_direct_write_complete(dreq, dreq->inode);  } -static void nfs_direct_commit_release(void *calldata) +static void nfs_direct_commit_complete(struct nfs_commit_data *data)  { -	struct nfs_write_data *data = calldata; -	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; +	struct nfs_direct_req *dreq = data->dreq; +	struct nfs_commit_info cinfo; +	struct nfs_page *req;  	int status = data->task.tk_status; +	
nfs_init_cinfo_from_dreq(&cinfo, dreq);  	if (status < 0) {  		dprintk("NFS: %5u commit failed with error %d.\n", -				data->task.tk_pid, status); +			data->task.tk_pid, status);  		dreq->flags = NFS_ODIRECT_RESCHED_WRITES; -	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { +	} else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {  		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);  		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;  	}  	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); -	nfs_direct_write_complete(dreq, data->inode); -	nfs_commit_free(data); +	while (!list_empty(&data->pages)) { +		req = nfs_list_entry(data->pages.next); +		nfs_list_remove_request(req); +		if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { +			/* Note the rewrite will go through mds */ +			nfs_mark_request_commit(req, NULL, &cinfo); +		} else +			nfs_release_request(req); +		nfs_unlock_and_release_request(req); +	} + +	if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) +		nfs_direct_write_complete(dreq, data->inode); +} + +static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) +{ +	/* There is no lock to clear */  } -static const struct rpc_call_ops nfs_commit_direct_ops = { -#if defined(CONFIG_NFS_V4_1) -	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ -	.rpc_call_done = nfs_direct_commit_result, -	.rpc_release = nfs_direct_commit_release, +static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { +	.completion = nfs_direct_commit_complete, +	.error_cleanup = nfs_direct_error_cleanup,  };  static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)  { -	struct nfs_write_data *data = dreq->commit_data; -	struct rpc_task *task; -	struct rpc_message msg = { -		.rpc_argp = &data->args, -		.rpc_resp = &data->res, -		.rpc_cred = dreq->ctx->cred, -	}; -	struct rpc_task_setup task_setup_data = { -		.task = &data->task, -		.rpc_client = NFS_CLIENT(dreq->inode), -		.rpc_message = &msg, -		
.callback_ops = &nfs_commit_direct_ops, -		.callback_data = data, -		.workqueue = nfsiod_workqueue, -		.flags = RPC_TASK_ASYNC, -	}; - -	data->inode = dreq->inode; -	data->cred = msg.rpc_cred; - -	data->args.fh = NFS_FH(data->inode); -	data->args.offset = 0; -	data->args.count = 0; -	data->args.context = dreq->ctx; -	data->args.lock_context = dreq->l_ctx; -	data->res.count = 0; -	data->res.fattr = &data->fattr; -	data->res.verf = &data->verf; -	nfs_fattr_init(&data->fattr); - -	NFS_PROTO(data->inode)->commit_setup(data, &msg); - -	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */ -	dreq->commit_data = NULL; - -	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); - -	task = rpc_run_task(&task_setup_data); -	if (!IS_ERR(task)) -		rpc_put_task(task); +	int res; +	struct nfs_commit_info cinfo; +	LIST_HEAD(mds_list); + +	nfs_init_cinfo_from_dreq(&cinfo, dreq); +	nfs_scan_commit(dreq->inode, &mds_list, &cinfo); +	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo); +	if (res < 0) /* res == -ENOMEM */ +		nfs_direct_write_reschedule(dreq);  } -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +static void nfs_direct_write_schedule_work(struct work_struct *work)  { +	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);  	int flags = dreq->flags;  	dreq->flags = 0; @@ -614,92 +691,111 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode  			nfs_direct_write_reschedule(dreq);  			break;  		default: -			if (dreq->commit_data != NULL) -				nfs_commit_free(dreq->commit_data); -			nfs_direct_free_writedata(dreq); -			nfs_zap_mapping(inode, inode->i_mapping); -			nfs_direct_complete(dreq); +			nfs_direct_complete(dreq, true);  	}  } -static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)  { -	dreq->commit_data = nfs_commitdata_alloc(); -	if 
(dreq->commit_data != NULL) -		dreq->commit_data->req = (struct nfs_page *) dreq; +	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */  } +  #else -static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +static void nfs_direct_write_schedule_work(struct work_struct *work)  { -	dreq->commit_data = NULL;  }  static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)  { -	nfs_direct_free_writedata(dreq); -	nfs_zap_mapping(inode, inode->i_mapping); -	nfs_direct_complete(dreq); +	nfs_direct_complete(dreq, true);  }  #endif -static void nfs_direct_write_result(struct rpc_task *task, void *calldata) +static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)  { -	struct nfs_write_data *data = calldata; +	struct nfs_direct_req *dreq = hdr->dreq; +	struct nfs_commit_info cinfo; +	int bit = -1; +	struct nfs_page *req = nfs_list_entry(hdr->pages.next); -	if (nfs_writeback_done(task, data) != 0) -		return; -} +	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) +		goto out_put; -/* - * NB: Return the value of the first error return code.  Subsequent - *     errors after the first one are ignored. 
- */ -static void nfs_direct_write_release(void *calldata) -{ -	struct nfs_write_data *data = calldata; -	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; -	int status = data->task.tk_status; +	nfs_init_cinfo_from_dreq(&cinfo, dreq);  	spin_lock(&dreq->lock); -	if (unlikely(status < 0)) { -		/* An error has occurred, so we should not commit */ +	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {  		dreq->flags = 0; -		dreq->error = status; +		dreq->error = hdr->error;  	} -	if (unlikely(dreq->error != 0)) -		goto out_unlock; - -	dreq->count += data->res.count; - -	if (data->res.verf->committed != NFS_FILE_SYNC) { -		switch (dreq->flags) { -			case 0: -				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf)); +	if (dreq->error != 0) +		bit = NFS_IOHDR_ERROR; +	else { +		dreq->count += hdr->good_bytes; +		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { +			dreq->flags = NFS_ODIRECT_RESCHED_WRITES; +			bit = NFS_IOHDR_NEED_RESCHED; +		} else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { +			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) +				bit = NFS_IOHDR_NEED_RESCHED; +			else if (dreq->flags == 0) { +				nfs_direct_set_hdr_verf(dreq, hdr); +				bit = NFS_IOHDR_NEED_COMMIT;  				dreq->flags = NFS_ODIRECT_DO_COMMIT; -				break; -			case NFS_ODIRECT_DO_COMMIT: -				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { -					dprintk("NFS: %5u write verify failed\n", data->task.tk_pid); -					dreq->flags = NFS_ODIRECT_RESCHED_WRITES; -				} +			} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { +				if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { +					dreq->flags = +						NFS_ODIRECT_RESCHED_WRITES; +					bit = NFS_IOHDR_NEED_RESCHED; +				} else +					bit = NFS_IOHDR_NEED_COMMIT; +			}  		}  	} -out_unlock:  	spin_unlock(&dreq->lock); +	while (!list_empty(&hdr->pages)) { + +		req = nfs_list_entry(hdr->pages.next); +		nfs_list_remove_request(req); +		switch (bit) { +		case NFS_IOHDR_NEED_RESCHED: +		case NFS_IOHDR_NEED_COMMIT: +			
kref_get(&req->wb_kref); +			nfs_mark_request_commit(req, hdr->lseg, &cinfo); +		} +		nfs_unlock_and_release_request(req); +	} + +out_put:  	if (put_dreq(dreq)) -		nfs_direct_write_complete(dreq, data->inode); +		nfs_direct_write_complete(dreq, hdr->inode); +	hdr->release(hdr);  } -static const struct rpc_call_ops nfs_write_direct_ops = { -#if defined(CONFIG_NFS_V4_1) -	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ -	.rpc_call_done = nfs_direct_write_result, -	.rpc_release = nfs_direct_write_release, +static void nfs_write_sync_pgio_error(struct list_head *head) +{ +	struct nfs_page *req; + +	while (!list_empty(head)) { +		req = nfs_list_entry(head->next); +		nfs_list_remove_request(req); +		nfs_unlock_and_release_request(req); +	} +} + +static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { +	.error_cleanup = nfs_write_sync_pgio_error, +	.init_hdr = nfs_direct_pgio_init, +	.completion = nfs_direct_write_completion,  }; + +/* + * NB: Return the value of the first error return code.  Subsequent + *     errors after the first one are ignored. + */  /*   * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE   * operation.  If nfs_writedata_alloc() or get_user_pages() fails, @@ -707,244 +803,87 @@ static const struct rpc_call_ops nfs_write_direct_ops = {   * handled automatically by nfs_direct_write_result().  Otherwise, if   * no requests have been sent, just return an error.   
*/ -static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, -						 const struct iovec *iov, -						 loff_t pos, int sync) -{ -	struct nfs_open_context *ctx = dreq->ctx; -	struct inode *inode = ctx->path.dentry->d_inode; -	unsigned long user_addr = (unsigned long)iov->iov_base; -	size_t count = iov->iov_len; -	struct rpc_task *task; -	struct rpc_message msg = { -		.rpc_cred = ctx->cred, -	}; -	struct rpc_task_setup task_setup_data = { -		.rpc_client = NFS_CLIENT(inode), -		.rpc_message = &msg, -		.callback_ops = &nfs_write_direct_ops, -		.workqueue = nfsiod_workqueue, -		.flags = RPC_TASK_ASYNC, -	}; -	size_t wsize = NFS_SERVER(inode)->wsize; -	unsigned int pgbase; -	int result; -	ssize_t started = 0; - -	do { -		struct nfs_write_data *data; -		size_t bytes; - -		pgbase = user_addr & ~PAGE_MASK; -		bytes = min(wsize,count); - -		result = -ENOMEM; -		data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes)); -		if (unlikely(!data)) -			break; - -		down_read(¤t->mm->mmap_sem); -		result = get_user_pages(current, current->mm, user_addr, -					data->npages, 0, 0, data->pagevec, NULL); -		up_read(¤t->mm->mmap_sem); -		if (result < 0) { -			nfs_writedata_free(data); -			break; -		} -		if ((unsigned)result < data->npages) { -			bytes = result * PAGE_SIZE; -			if (bytes <= pgbase) { -				nfs_direct_release_pages(data->pagevec, result); -				nfs_writedata_free(data); -				break; -			} -			bytes -= pgbase; -			data->npages = result; -		} - -		get_dreq(dreq); - -		list_move_tail(&data->pages, &dreq->rewrite_list); - -		data->req = (struct nfs_page *) dreq; -		data->inode = inode; -		data->cred = msg.rpc_cred; -		data->args.fh = NFS_FH(inode); -		data->args.context = ctx; -		data->args.lock_context = dreq->l_ctx; -		data->args.offset = pos; -		data->args.pgbase = pgbase; -		data->args.pages = data->pagevec; -		data->args.count = bytes; -		data->args.stable = sync; -		data->res.fattr = &data->fattr; -		data->res.count = bytes; -		data->res.verf = 
&data->verf; -		nfs_fattr_init(&data->fattr); - -		task_setup_data.task = &data->task; -		task_setup_data.callback_data = data; -		msg.rpc_argp = &data->args; -		msg.rpc_resp = &data->res; -		NFS_PROTO(inode)->write_setup(data, &msg); - -		task = rpc_run_task(&task_setup_data); -		if (IS_ERR(task)) -			break; -		rpc_put_task(task); - -		dprintk("NFS: %5u initiated direct write call " -			"(req %s/%Ld, %zu bytes @ offset %Lu)\n", -				data->task.tk_pid, -				inode->i_sb->s_id, -				(long long)NFS_FILEID(inode), -				bytes, -				(unsigned long long)data->args.offset); - -		started += bytes; -		user_addr += bytes; -		pos += bytes; - -		/* FIXME: Remove this useless math from the final patch */ -		pgbase += bytes; -		pgbase &= ~PAGE_MASK; -		BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); - -		count -= bytes; -	} while (count != 0); - -	if (started) -		return started; -	return result < 0 ? (ssize_t) result : -EFAULT; -} -  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, -					       const struct iovec *iov, -					       unsigned long nr_segs, -					       loff_t pos, int sync) +					       struct iov_iter *iter, +					       loff_t pos)  { +	struct nfs_pageio_descriptor desc; +	struct inode *inode = dreq->inode;  	ssize_t result = 0;  	size_t requested_bytes = 0; -	unsigned long seg; +	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); +	nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, +			      &nfs_direct_write_completion_ops); +	desc.pg_dreq = dreq;  	get_dreq(dreq); +	atomic_inc(&inode->i_dio_count); -	for (seg = 0; seg < nr_segs; seg++) { -		const struct iovec *vec = &iov[seg]; -		result = nfs_direct_write_schedule_segment(dreq, vec, -							   pos, sync); +	NFS_I(inode)->write_io += iov_iter_count(iter); +	while (iov_iter_count(iter)) { +		struct page **pagevec; +		size_t bytes; +		size_t pgbase; +		unsigned npages, i; + +		result = iov_iter_get_pages_alloc(iter, &pagevec,  +						  wsize, &pgbase);  		if 
(result < 0)  			break; -		requested_bytes += result; -		if ((size_t)result < vec->iov_len) + +		bytes = result; +		iov_iter_advance(iter, bytes); +		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; +		for (i = 0; i < npages; i++) { +			struct nfs_page *req; +			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + +			req = nfs_create_request(dreq->ctx, pagevec[i], NULL, +						 pgbase, req_len); +			if (IS_ERR(req)) { +				result = PTR_ERR(req); +				break; +			} +			nfs_lock_request(req); +			req->wb_index = pos >> PAGE_SHIFT; +			req->wb_offset = pos & ~PAGE_MASK; +			if (!nfs_pageio_add_request(&desc, req)) { +				result = desc.pg_error; +				nfs_unlock_and_release_request(req); +				break; +			} +			pgbase = 0; +			bytes -= req_len; +			requested_bytes += req_len; +			pos += req_len; +			dreq->bytes_left -= req_len; +		} +		nfs_direct_release_pages(pagevec, npages); +		kvfree(pagevec); +		if (result < 0)  			break; -		pos += vec->iov_len; +	} +	nfs_pageio_complete(&desc); + +	/* +	 * If no bytes were started, return the error, and let the +	 * generic layer handle the completion. +	 */ +	if (requested_bytes == 0) { +		inode_dio_done(inode); +		nfs_direct_req_release(dreq); +		return result < 0 ? 
result : -EIO;  	}  	if (put_dreq(dreq))  		nfs_direct_write_complete(dreq, dreq->inode); - -	if (requested_bytes != 0) -		return 0; - -	if (result < 0) -		return result; -	return -EIO; -} - -static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, -				unsigned long nr_segs, loff_t pos, -				size_t count) -{ -	ssize_t result = -ENOMEM; -	struct inode *inode = iocb->ki_filp->f_mapping->host; -	struct nfs_direct_req *dreq; -	size_t wsize = NFS_SERVER(inode)->wsize; -	int sync = NFS_UNSTABLE; - -	dreq = nfs_direct_req_alloc(); -	if (!dreq) -		goto out; -	nfs_alloc_commit_data(dreq); - -	if (dreq->commit_data == NULL || count < wsize) -		sync = NFS_FILE_SYNC; - -	dreq->inode = inode; -	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); -	dreq->l_ctx = nfs_get_lock_context(dreq->ctx); -	if (dreq->l_ctx == NULL) -		goto out_release; -	if (!is_sync_kiocb(iocb)) -		dreq->iocb = iocb; - -	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); -	if (!result) -		result = nfs_direct_wait(dreq); -out_release: -	nfs_direct_req_release(dreq); -out: -	return result; -} - -/** - * nfs_file_direct_read - file direct read operation for NFS files - * @iocb: target I/O control block - * @iov: vector of user buffers into which to read data - * @nr_segs: size of iov vector - * @pos: byte offset in file where reading starts - * - * We use this function for direct reads instead of calling - * generic_file_aio_read() in order to avoid gfar's check to see if - * the request starts before the end of the file.  For that check - * to work, we must generate a GETATTR before each direct read, and - * even then there is a window between the GETATTR and the subsequent - * READ where the file size could change.  Our preference is simply - * to do all reads the application wants, and the server will take - * care of managing the end of file boundary. 
- * - * This function also eliminates unnecessarily updating the file's - * atime locally, as the NFS server sets the file's atime, and this - * client must read the updated atime from the server back into its - * cache. - */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, -				unsigned long nr_segs, loff_t pos) -{ -	ssize_t retval = -EINVAL; -	struct file *file = iocb->ki_filp; -	struct address_space *mapping = file->f_mapping; -	size_t count; - -	count = iov_length(iov, nr_segs); -	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - -	dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, -		count, (long long) pos); - -	retval = 0; -	if (!count) -		goto out; - -	retval = nfs_sync_mapping(mapping); -	if (retval) -		goto out; - -	retval = nfs_direct_read(iocb, iov, nr_segs, pos); -	if (retval > 0) -		iocb->ki_pos = pos + retval; - -out: -	return retval; +	return 0;  }  /**   * nfs_file_direct_write - file direct write operation for NFS files   * @iocb: target I/O control block - * @iov: vector of user buffers from which to write data - * @nr_segs: size of iov vector + * @iter: vector of user buffers from which to write data   * @pos: byte offset in file where writing starts   *   * We use this function for direct writes instead of calling @@ -962,44 +901,97 @@ out:   * Note that O_APPEND is not supported for NFS direct writes, as there   * is no atomic O_APPEND write facility in the NFS protocol.   
*/ -ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, -				unsigned long nr_segs, loff_t pos) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, +				loff_t pos, bool uio)  { -	ssize_t retval = -EINVAL; +	ssize_t result = -EINVAL;  	struct file *file = iocb->ki_filp;  	struct address_space *mapping = file->f_mapping; -	size_t count; +	struct inode *inode = mapping->host; +	struct nfs_direct_req *dreq; +	struct nfs_lock_context *l_ctx; +	loff_t end; +	size_t count = iov_iter_count(iter); +	end = (pos + count - 1) >> PAGE_CACHE_SHIFT; -	count = iov_length(iov, nr_segs);  	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); -	dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, -		count, (long long) pos); +	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", +		file, count, (long long) pos); -	retval = generic_write_checks(file, &pos, &count, 0); -	if (retval) +	result = generic_write_checks(file, &pos, &count, 0); +	if (result)  		goto out; -	retval = -EINVAL; +	result = -EINVAL;  	if ((ssize_t) count < 0)  		goto out; -	retval = 0; +	result = 0;  	if (!count)  		goto out; -	retval = nfs_sync_mapping(mapping); -	if (retval) -		goto out; +	mutex_lock(&inode->i_mutex); + +	result = nfs_sync_mapping(mapping); +	if (result) +		goto out_unlock; + +	if (mapping->nrpages) { +		result = invalidate_inode_pages2_range(mapping, +					pos >> PAGE_CACHE_SHIFT, end); +		if (result) +			goto out_unlock; +	} -	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); +	task_io_account_write(count); -	if (retval > 0) -		iocb->ki_pos = pos + retval; +	result = -ENOMEM; +	dreq = nfs_direct_req_alloc(); +	if (!dreq) +		goto out_unlock; +	dreq->inode = inode; +	dreq->bytes_left = count; +	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); +	l_ctx = nfs_get_lock_context(dreq->ctx); +	if (IS_ERR(l_ctx)) { +		result = 
PTR_ERR(l_ctx); +		goto out_release; +	} +	dreq->l_ctx = l_ctx; +	if (!is_sync_kiocb(iocb)) +		dreq->iocb = iocb; + +	result = nfs_direct_write_schedule_iovec(dreq, iter, pos); + +	if (mapping->nrpages) { +		invalidate_inode_pages2_range(mapping, +					      pos >> PAGE_CACHE_SHIFT, end); +	} + +	mutex_unlock(&inode->i_mutex); + +	if (!result) { +		result = nfs_direct_wait(dreq); +		if (result > 0) { +			struct inode *inode = mapping->host; + +			iocb->ki_pos = pos + result; +			spin_lock(&inode->i_lock); +			if (i_size_read(inode) < iocb->ki_pos) +				i_size_write(inode, iocb->ki_pos); +			spin_unlock(&inode->i_lock); +		} +	} +	nfs_direct_req_release(dreq); +	return result; + +out_release: +	nfs_direct_req_release(dreq); +out_unlock: +	mutex_unlock(&inode->i_mutex);  out: -	return retval; +	return result;  }  /** diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index a6e711ad130..d25f10fb492 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -8,10 +8,13 @@  #ifdef CONFIG_NFS_USE_KERNEL_DNS +#include <linux/module.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/dns_resolver.h> +#include "dns_resolve.h" -ssize_t nfs_dns_resolve_name(char *name, size_t namelen, +ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,  		struct sockaddr *sa, size_t salen)  {  	ssize_t ret; @@ -20,7 +23,7 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,  	ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);  	if (ip_len > 0) -		ret = rpc_pton(ip_addr, ip_len, sa, salen); +		ret = rpc_pton(net, ip_addr, ip_len, sa, salen);  	else  		ret = -ESRCH;  	kfree(ip_addr); @@ -29,6 +32,7 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,  #else +#include <linux/module.h>  #include <linux/hash.h>  #include <linux/string.h>  #include <linux/kmod.h> @@ -38,17 +42,20 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,  #include <linux/seq_file.h>  #include <linux/inet.h>  #include 
<linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/cache.h>  #include <linux/sunrpc/svcauth.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/nfs_fs.h> +#include "nfs4_fs.h"  #include "dns_resolve.h"  #include "cache_lib.h" +#include "netns.h"  #define NFS_DNS_HASHBITS 4  #define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) -static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; -  struct nfs_dns_ent {  	struct cache_head h; @@ -138,7 +145,7 @@ static int nfs_dns_upcall(struct cache_detail *cd,  	ret = nfs_cache_upcall(cd, key->hostname);  	if (ret) -		ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); +		ret = sunrpc_cache_pipe_upcall(cd, ch);  	return ret;  } @@ -213,7 +220,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)  {  	char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];  	struct nfs_dns_ent key, *item; -	unsigned long ttl; +	unsigned int ttl;  	ssize_t len;  	int ret = -EINVAL; @@ -224,7 +231,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)  	len = qword_get(&buf, buf1, sizeof(buf1));  	if (len <= 0)  		goto out; -	key.addrlen = rpc_pton(buf1, len, +	key.addrlen = rpc_pton(cd->net, buf1, len,  			(struct sockaddr *)&key.addr,  			sizeof(key.addr)); @@ -236,7 +243,8 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)  	key.namelen = len;  	memset(&key.h, 0, sizeof(key.h)); -	ttl = get_expiry(&buf); +	if (get_uint(&buf, &ttl) < 0) +		goto out;  	if (ttl == 0)  		goto out;  	key.h.expiry_time = ttl + seconds_since_boot(); @@ -259,21 +267,6 @@ out:  	return ret;  } -static struct cache_detail nfs_dns_resolve = { -	.owner = THIS_MODULE, -	.hash_size = NFS_DNS_HASHTBL_SIZE, -	.hash_table = nfs_dns_table, -	.name = "dns_resolve", -	.cache_put = nfs_dns_ent_put, -	.cache_upcall = nfs_dns_upcall, -	.cache_parse = nfs_dns_parse, -	.cache_show = nfs_dns_show, -	.match = nfs_dns_match, -	.init = nfs_dns_ent_init, -	.update = nfs_dns_ent_update, -	
.alloc = nfs_dns_ent_alloc, -}; -  static int do_cache_lookup(struct cache_detail *cd,  		struct nfs_dns_ent *key,  		struct nfs_dns_ent **item, @@ -336,8 +329,8 @@ out:  	return ret;  } -ssize_t nfs_dns_resolve_name(char *name, size_t namelen, -		struct sockaddr *sa, size_t salen) +ssize_t nfs_dns_resolve_name(struct net *net, char *name, +		size_t namelen, struct sockaddr *sa, size_t salen)  {  	struct nfs_dns_ent key = {  		.hostname = name, @@ -345,28 +338,133 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,  	};  	struct nfs_dns_ent *item = NULL;  	ssize_t ret; +	struct nfs_net *nn = net_generic(net, nfs_net_id); -	ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); +	ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);  	if (ret == 0) {  		if (salen >= item->addrlen) {  			memcpy(sa, &item->addr, item->addrlen);  			ret = item->addrlen;  		} else  			ret = -EOVERFLOW; -		cache_put(&item->h, &nfs_dns_resolve); +		cache_put(&item->h, nn->nfs_dns_resolve);  	} else if (ret == -ENOENT)  		ret = -ESRCH;  	return ret;  } +static struct cache_detail nfs_dns_resolve_template = { +	.owner		= THIS_MODULE, +	.hash_size	= NFS_DNS_HASHTBL_SIZE, +	.name		= "dns_resolve", +	.cache_put	= nfs_dns_ent_put, +	.cache_upcall	= nfs_dns_upcall, +	.cache_request	= nfs_dns_request, +	.cache_parse	= nfs_dns_parse, +	.cache_show	= nfs_dns_show, +	.match		= nfs_dns_match, +	.init		= nfs_dns_ent_init, +	.update		= nfs_dns_ent_update, +	.alloc		= nfs_dns_ent_alloc, +}; + + +int nfs_dns_resolver_cache_init(struct net *net) +{ +	int err; +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net); +	if (IS_ERR(nn->nfs_dns_resolve)) +		return PTR_ERR(nn->nfs_dns_resolve); + +	err = nfs_cache_register_net(net, nn->nfs_dns_resolve); +	if (err) +		goto err_reg; +	return 0; + +err_reg: +	cache_destroy_net(nn->nfs_dns_resolve, net); +	return err; +} + +void nfs_dns_resolver_cache_destroy(struct net 
*net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	nfs_cache_unregister_net(net, nn->nfs_dns_resolve); +	cache_destroy_net(nn->nfs_dns_resolve, net); +} + +static int nfs4_dns_net_init(struct net *net) +{ +	return nfs_dns_resolver_cache_init(net); +} + +static void nfs4_dns_net_exit(struct net *net) +{ +	nfs_dns_resolver_cache_destroy(net); +} + +static struct pernet_operations nfs4_dns_resolver_ops = { +	.init = nfs4_dns_net_init, +	.exit = nfs4_dns_net_exit, +}; + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, +			   void *ptr) +{ +	struct super_block *sb = ptr; +	struct net *net = sb->s_fs_info; +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct cache_detail *cd = nn->nfs_dns_resolve; +	int ret = 0; + +	if (cd == NULL) +		return 0; + +	if (!try_module_get(THIS_MODULE)) +		return 0; + +	switch (event) { +	case RPC_PIPEFS_MOUNT: +		ret = nfs_cache_register_sb(sb, cd); +		break; +	case RPC_PIPEFS_UMOUNT: +		nfs_cache_unregister_sb(sb, cd); +		break; +	default: +		ret = -ENOTSUPP; +		break; +	} +	module_put(THIS_MODULE); +	return ret; +} + +static struct notifier_block nfs_dns_resolver_block = { +	.notifier_call	= rpc_pipefs_event, +}; +  int nfs_dns_resolver_init(void)  { -	return nfs_cache_register(&nfs_dns_resolve); +	int err; + +	err = register_pernet_subsys(&nfs4_dns_resolver_ops); +	if (err < 0) +		goto out; +	err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block); +	if (err < 0) +		goto out1; +	return 0; +out1: +	unregister_pernet_subsys(&nfs4_dns_resolver_ops); +out: +	return err;  }  void nfs_dns_resolver_destroy(void)  { -	nfs_cache_unregister(&nfs_dns_resolve); +	rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); +	unregister_pernet_subsys(&nfs4_dns_resolver_ops);  } -  #endif diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h index 199bb5543a9..2e4f596d292 100644 --- a/fs/nfs/dns_resolve.h +++ b/fs/nfs/dns_resolve.h @@ -15,12 +15,22 @@ static inline int nfs_dns_resolver_init(void)  
static inline void nfs_dns_resolver_destroy(void)  {} + +static inline int nfs_dns_resolver_cache_init(struct net *net) +{ +	return 0; +} + +static inline void nfs_dns_resolver_cache_destroy(struct net *net) +{}  #else  extern int nfs_dns_resolver_init(void);  extern void nfs_dns_resolver_destroy(void); +extern int nfs_dns_resolver_cache_init(struct net *net); +extern void nfs_dns_resolver_cache_destroy(struct net *net);  #endif -extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, -		struct sockaddr *sa, size_t salen); +extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, +		size_t namelen,	struct sockaddr *sa, size_t salen);  #endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 60677f9f131..4042ff58fe3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -16,6 +16,7 @@   *  nfs regular file handling functions   */ +#include <linux/module.h>  #include <linux/time.h>  #include <linux/kernel.h>  #include <linux/errno.h> @@ -30,88 +31,31 @@  #include <linux/swap.h>  #include <asm/uaccess.h> -#include <asm/system.h>  #include "delegation.h"  #include "internal.h"  #include "iostat.h"  #include "fscache.h" -#include "pnfs.h" -#define NFSDBG_FACILITY		NFSDBG_FILE +#include "nfstrace.h" -static int nfs_file_open(struct inode *, struct file *); -static int nfs_file_release(struct inode *, struct file *); -static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); -static int  nfs_file_mmap(struct file *, struct vm_area_struct *); -static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, -					struct pipe_inode_info *pipe, -					size_t count, unsigned int flags); -static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, -				unsigned long nr_segs, loff_t pos); -static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, -					struct file *filp, loff_t *ppos, -					size_t count, unsigned int flags); -static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, -				unsigned long nr_segs, 
loff_t pos); -static int  nfs_file_flush(struct file *, fl_owner_t id); -static int  nfs_file_fsync(struct file *, int datasync); -static int nfs_check_flags(int flags); -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); +#define NFSDBG_FACILITY		NFSDBG_FILE  static const struct vm_operations_struct nfs_file_vm_ops; -const struct file_operations nfs_file_operations = { -	.llseek		= nfs_file_llseek, -	.read		= do_sync_read, -	.write		= do_sync_write, -	.aio_read	= nfs_file_read, -	.aio_write	= nfs_file_write, -	.mmap		= nfs_file_mmap, -	.open		= nfs_file_open, -	.flush		= nfs_file_flush, -	.release	= nfs_file_release, -	.fsync		= nfs_file_fsync, -	.lock		= nfs_lock, -	.flock		= nfs_flock, -	.splice_read	= nfs_file_splice_read, -	.splice_write	= nfs_file_splice_write, -	.check_flags	= nfs_check_flags, -	.setlease	= nfs_setlease, -}; - -const struct inode_operations nfs_file_inode_operations = { -	.permission	= nfs_permission, -	.getattr	= nfs_getattr, -	.setattr	= nfs_setattr, -}; - -#ifdef CONFIG_NFS_V3 -const struct inode_operations nfs3_file_inode_operations = { -	.permission	= nfs_permission, -	.getattr	= nfs_getattr, -	.setattr	= nfs_setattr, -	.listxattr	= nfs3_listxattr, -	.getxattr	= nfs3_getxattr, -	.setxattr	= nfs3_setxattr, -	.removexattr	= nfs3_removexattr, -}; -#endif  /* CONFIG_NFS_v3 */ -  /* Hack for future NFS swap support */  #ifndef IS_SWAPFILE  # define IS_SWAPFILE(inode)	(0)  #endif -static int nfs_check_flags(int flags) +int nfs_check_flags(int flags)  {  	if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))  		return -EINVAL;  	return 0;  } +EXPORT_SYMBOL_GPL(nfs_check_flags);  /*   * Open file @@ -121,9 +65,7 @@ nfs_file_open(struct inode *inode, struct file *filp)  {  	int res; -	dprintk("NFS: open file(%s/%s)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			
filp->f_path.dentry->d_name.name); +	dprintk("NFS: open file(%pD2)\n", filp);  	nfs_inc_stats(inode, NFSIOS_VFSOPEN);  	res = nfs_check_flags(filp->f_flags); @@ -134,18 +76,15 @@ nfs_file_open(struct inode *inode, struct file *filp)  	return res;  } -static int +int  nfs_file_release(struct inode *inode, struct file *filp)  { -	struct dentry *dentry = filp->f_path.dentry; - -	dprintk("NFS: release(%s/%s)\n", -			dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dprintk("NFS: release(%pD2)\n", filp);  	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);  	return nfs_release(inode, filp);  } +EXPORT_SYMBOL_GPL(nfs_file_release);  /**   * nfs_revalidate_size - Revalidate the file size @@ -178,89 +117,86 @@ force_reval:  	return __nfs_revalidate_inode(server, inode);  } -static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) +loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)  { -	loff_t loff; +	dprintk("NFS: llseek file(%pD2, %lld, %d)\n", +			filp, offset, whence); -	dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name, -			offset, origin); - -	/* origin == SEEK_END => we must revalidate the cached file length */ -	if (origin == SEEK_END) { +	/* +	 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate +	 * the cached file length +	 */ +	if (whence != SEEK_SET && whence != SEEK_CUR) {  		struct inode *inode = filp->f_mapping->host;  		int retval = nfs_revalidate_file_size(inode, filp);  		if (retval < 0)  			return (loff_t)retval; +	} -		spin_lock(&inode->i_lock); -		loff = generic_file_llseek_unlocked(filp, offset, origin); -		spin_unlock(&inode->i_lock); -	} else -		loff = generic_file_llseek_unlocked(filp, offset, origin); -	return loff; +	return generic_file_llseek(filp, offset, whence);  } +EXPORT_SYMBOL_GPL(nfs_file_llseek);  /*   * Flush all dirty pages, and check for write errors.   
*/ -static int +int  nfs_file_flush(struct file *file, fl_owner_t id)  { -	struct dentry	*dentry = file->f_path.dentry; -	struct inode	*inode = dentry->d_inode; +	struct inode	*inode = file_inode(file); -	dprintk("NFS: flush(%s/%s)\n", -			dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dprintk("NFS: flush(%pD2)\n", file);  	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);  	if ((file->f_mode & FMODE_WRITE) == 0)  		return 0; +	/* +	 * If we're holding a write delegation, then just start the i/o +	 * but don't wait for completion (or send a commit). +	 */ +	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) +		return filemap_fdatawrite(file->f_mapping); +  	/* Flush writes to the server and return any errors */  	return vfs_fsync(file, 0);  } +EXPORT_SYMBOL_GPL(nfs_file_flush); -static ssize_t -nfs_file_read(struct kiocb *iocb, const struct iovec *iov, -		unsigned long nr_segs, loff_t pos) +ssize_t +nfs_file_read(struct kiocb *iocb, struct iov_iter *to)  { -	struct dentry * dentry = iocb->ki_filp->f_path.dentry; -	struct inode * inode = dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);  	ssize_t result; -	size_t count = iov_length(iov, nr_segs);  	if (iocb->ki_filp->f_flags & O_DIRECT) -		return nfs_file_direct_read(iocb, iov, nr_segs, pos); +		return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); -	dprintk("NFS: read(%s/%s, %lu@%lu)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		(unsigned long) count, (unsigned long) pos); +	dprintk("NFS: read(%pD2, %zu@%lu)\n", +		iocb->ki_filp, +		iov_iter_count(to), (unsigned long) iocb->ki_pos);  	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);  	if (!result) { -		result = generic_file_aio_read(iocb, iov, nr_segs, pos); +		result = generic_file_read_iter(iocb, to);  		if (result > 0)  			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);  	}  	return result;  } +EXPORT_SYMBOL_GPL(nfs_file_read); -static ssize_t +ssize_t  nfs_file_splice_read(struct file *filp, 
loff_t *ppos,  		     struct pipe_inode_info *pipe, size_t count,  		     unsigned int flags)  { -	struct dentry *dentry = filp->f_path.dentry; -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file_inode(filp);  	ssize_t res; -	dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		(unsigned long) count, (unsigned long long) *ppos); +	dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", +		filp, (unsigned long) count, (unsigned long long) *ppos);  	res = nfs_revalidate_mapping(inode, filp->f_mapping);  	if (!res) { @@ -270,16 +206,15 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,  	}  	return res;  } +EXPORT_SYMBOL_GPL(nfs_file_splice_read); -static int +int  nfs_file_mmap(struct file * file, struct vm_area_struct * vma)  { -	struct dentry *dentry = file->f_path.dentry; -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file_inode(file);  	int	status; -	dprintk("NFS: mmap(%s/%s)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dprintk("NFS: mmap(%pD2)\n", file);  	/* Note: generic_file_mmap() returns ENOSYS on nommu systems  	 *       so we call that before revalidating the mapping @@ -291,6 +226,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)  	}  	return status;  } +EXPORT_SYMBOL_GPL(nfs_file_mmap);  /*   * Flush any dirty pages for this process, and check for write errors. @@ -301,31 +237,66 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)   * disk, but it retrieves and clears ctx->error after synching, despite   * the two being set at the same time in nfs_context_set_write_error().   * This is because the former is used to notify the _next_ call to - * nfs_file_write() that a write error occured, and hence cause it to + * nfs_file_write() that a write error occurred, and hence cause it to   * fall back to doing a synchronous write.   
*/ -static int -nfs_file_fsync(struct file *file, int datasync) +int +nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)  { -	struct dentry *dentry = file->f_path.dentry;  	struct nfs_open_context *ctx = nfs_file_open_context(file); -	struct inode *inode = dentry->d_inode; -	int have_error, status; +	struct inode *inode = file_inode(file); +	int have_error, do_resend, status;  	int ret = 0; - -	dprintk("NFS: fsync file(%s/%s) datasync %d\n", -			dentry->d_parent->d_name.name, dentry->d_name.name, -			datasync); +	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC); +	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);  	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);  	status = nfs_commit_inode(inode, FLUSH_SYNC);  	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); -	if (have_error) +	if (have_error) {  		ret = xchg(&ctx->error, 0); -	if (!ret && status < 0) +		if (ret) +			goto out; +	} +	if (status < 0) {  		ret = status; +		goto out; +	} +	do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); +	if (do_resend) +		ret = -EAGAIN; +out: +	return ret; +} +EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); + +static int +nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ +	int ret; +	struct inode *inode = file_inode(file); + +	trace_nfs_fsync_enter(inode); + +	do { +		ret = filemap_write_and_wait_range(inode->i_mapping, start, end); +		if (ret != 0) +			break; +		mutex_lock(&inode->i_mutex); +		ret = nfs_file_fsync_commit(file, start, end, datasync); +		mutex_unlock(&inode->i_mutex); +		/* +		 * If nfs_file_fsync_commit detected a server reboot, then +		 * resend all dirty pages that might have been covered by +		 * the NFS_CONTEXT_RESEND_WRITES flag +		 */ +		start = 0; +		end = LLONG_MAX; +	} while (ret == -EAGAIN); + +	trace_nfs_fsync_exit(inode, ret);  	return ret;  } @@ -382,14 +353,8 @@ static int 
nfs_write_begin(struct file *file, struct address_space *mapping,  	struct page *page;  	int once_thru = 0; -	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, -		mapping->host->i_ino, len, (long long) pos); - -	pnfs_update_layout(mapping->host, -			   nfs_file_open_context(file), -			   IOMODE_RW); +	dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", +		file, mapping->host->i_ino, len, (long long) pos);  start:  	/* @@ -426,12 +391,11 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,  			struct page *page, void *fsdata)  {  	unsigned offset = pos & (PAGE_CACHE_SIZE - 1); +	struct nfs_open_context *ctx = nfs_file_open_context(file);  	int status; -	dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, -		mapping->host->i_ino, len, (long long) pos); +	dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", +		file, mapping->host->i_ino, len, (long long) pos);  	/*  	 * Zero any uninitialised parts of the page, and then mark the page @@ -460,6 +424,14 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,  	if (status < 0)  		return status; +	NFS_I(mapping->host)->write_io += copied; + +	if (nfs_ctx_key_to_expire(ctx)) { +		status = nfs_wb_all(mapping->host); +		if (status < 0) +			return status; +	} +  	return copied;  } @@ -470,14 +442,16 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,   * - Called if either PG_private or PG_fscache is set on the page   * - Caller holds page lock   */ -static void nfs_invalidate_page(struct page *page, unsigned long offset) +static void nfs_invalidate_page(struct page *page, unsigned int offset, +				unsigned int length)  { -	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); +	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", 
+		 page, offset, length); -	if (offset != 0) +	if (offset != 0 || length < PAGE_CACHE_SIZE)  		return;  	/* Cancel any unstarted writes on this page */ -	nfs_wb_page_cancel(page->mapping->host, page); +	nfs_wb_page_cancel(page_file_mapping(page)->host, page);  	nfs_fscache_invalidate_page(page, page->mapping->host);  } @@ -494,8 +468,11 @@ static int nfs_release_page(struct page *page, gfp_t gfp)  	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); -	/* Only do I/O if gfp is a superset of GFP_KERNEL */ -	if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { +	/* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not +	 * doing this memory reclaim for a fs-related allocation. +	 */ +	if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && +	    !(current->flags & PF_FSTRANS)) {  		int how = FLUSH_SYNC;  		/* Don't let kswapd deadlock waiting for OOM RPC calls */ @@ -509,6 +486,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)  	return nfs_fscache_release_page(page, gfp);  } +static void nfs_check_dirty_writeback(struct page *page, +				bool *dirty, bool *writeback) +{ +	struct nfs_inode *nfsi; +	struct address_space *mapping = page_file_mapping(page); + +	if (!mapping || PageSwapCache(page)) +		return; + +	/* +	 * Check if an unstable page is currently being committed and +	 * if so, have the VM treat it as if the page is under writeback +	 * so it will not block due to pages that will shortly be freeable. 
+	 */ +	nfsi = NFS_I(mapping->host); +	if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { +		*writeback = true; +		return; +	} + +	/* +	 * If PagePrivate() is set, then the page is not freeable and as the +	 * inode is not being committed, it's not going to be cleaned in the +	 * near future so treat it as dirty +	 */ +	if (PagePrivate(page)) +		*dirty = true; +} +  /*   * Attempt to clear the private state associated with a page when an error   * occurs that requires the cached contents of an inode to be written back or @@ -519,7 +525,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)   */  static int nfs_launder_page(struct page *page)  { -	struct inode *inode = page->mapping->host; +	struct inode *inode = page_file_mapping(page)->host;  	struct nfs_inode *nfsi = NFS_I(inode);  	dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", @@ -529,6 +535,20 @@ static int nfs_launder_page(struct page *page)  	return nfs_wb_page(inode, page);  } +#ifdef CONFIG_NFS_SWAP +static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, +						sector_t *span) +{ +	*span = sis->pages; +	return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); +} + +static void nfs_swap_deactivate(struct file *file) +{ +	xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); +} +#endif +  const struct address_space_operations nfs_file_aops = {  	.readpage = nfs_readpage,  	.readpages = nfs_readpages, @@ -542,7 +562,12 @@ const struct address_space_operations nfs_file_aops = {  	.direct_IO = nfs_direct_IO,  	.migratepage = nfs_migrate_page,  	.launder_page = nfs_launder_page, +	.is_dirty_writeback = nfs_check_dirty_writeback,  	.error_remove_page = generic_error_remove_page, +#ifdef CONFIG_NFS_SWAP +	.swap_activate = nfs_swap_activate, +	.swap_deactivate = nfs_swap_deactivate, +#endif  };  /* @@ -554,24 +579,25 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  {  	struct page *page = vmf->page;  	struct file *filp = 
vma->vm_file; -	struct dentry *dentry = filp->f_path.dentry; +	struct inode *inode = file_inode(filp);  	unsigned pagelen;  	int ret = VM_FAULT_NOPAGE;  	struct address_space *mapping; -	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		filp->f_mapping->host->i_ino, +	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", +		filp, filp->f_mapping->host->i_ino,  		(long long)page_offset(page));  	/* make sure the cache has finished storing the page */ -	nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); +	nfs_fscache_wait_on_page_write(NFS_I(inode), page);  	lock_page(page); -	mapping = page->mapping; -	if (mapping != dentry->d_inode->i_mapping) +	mapping = page_file_mapping(page); +	if (mapping != inode->i_mapping)  		goto out_unlock; +	wait_on_page_writeback(page); +  	pagelen = nfs_page_length(page);  	if (pagelen == 0)  		goto out_unlock; @@ -590,7 +616,9 @@ out:  static const struct vm_operations_struct nfs_file_vm_ops = {  	.fault = filemap_fault, +	.map_pages = filemap_map_pages,  	.page_mkwrite = nfs_vm_page_mkwrite, +	.remap_pages = generic_file_remap_pages,  };  static int nfs_need_sync_write(struct file *filp, struct inode *inode) @@ -600,26 +628,30 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)  	if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))  		return 1;  	ctx = nfs_file_open_context(filp); -	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) +	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || +	    nfs_ctx_key_to_expire(ctx))  		return 1;  	return 0;  } -static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, -				unsigned long nr_segs, loff_t pos) +ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)  { -	struct dentry * dentry = iocb->ki_filp->f_path.dentry; -	struct inode * inode = dentry->d_inode; +	struct file *file = iocb->ki_filp; +	struct inode *inode = file_inode(file);  	
unsigned long written = 0;  	ssize_t result; -	size_t count = iov_length(iov, nr_segs); +	size_t count = iov_iter_count(from); +	loff_t pos = iocb->ki_pos; -	if (iocb->ki_filp->f_flags & O_DIRECT) -		return nfs_file_direct_write(iocb, iov, nr_segs, pos); +	result = nfs_key_timeout_notify(file, inode); +	if (result) +		return result; + +	if (file->f_flags & O_DIRECT) +		return nfs_file_direct_write(iocb, from, pos, true); -	dprintk("NFS: write(%s/%s, %lu@%Ld)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		(unsigned long) count, (long long) pos); +	dprintk("NFS: write(%pD2, %zu@%Ld)\n", +		file, count, (long long) pos);  	result = -EBUSY;  	if (IS_SWAPFILE(inode)) @@ -627,8 +659,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,  	/*  	 * O_APPEND implies that we must revalidate the file length.  	 */ -	if (iocb->ki_filp->f_flags & O_APPEND) { -		result = nfs_revalidate_file_size(inode, iocb->ki_filp); +	if (file->f_flags & O_APPEND) { +		result = nfs_revalidate_file_size(inode, file);  		if (result)  			goto out;  	} @@ -637,13 +669,13 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,  	if (!count)  		goto out; -	result = generic_file_aio_write(iocb, iov, nr_segs, pos); +	result = generic_file_write_iter(iocb, from);  	if (result > 0)  		written = result;  	/* Return error values for O_DSYNC and IS_SYNC() */ -	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { -		int err = vfs_fsync(iocb->ki_filp, 0); +	if (result >= 0 && nfs_need_sync_write(file, inode)) { +		int err = vfs_fsync(file, 0);  		if (err < 0)  			result = err;  	} @@ -656,43 +688,14 @@ out_swapfile:  	printk(KERN_INFO "NFS: attempt to write to active swap file!\n");  	goto out;  } - -static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, -				     struct file *filp, loff_t *ppos, -				     size_t count, unsigned int flags) -{ -	struct dentry *dentry = filp->f_path.dentry; -	struct inode *inode = 
dentry->d_inode; -	unsigned long written = 0; -	ssize_t ret; - -	dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		(unsigned long) count, (unsigned long long) *ppos); - -	/* -	 * The combination of splice and an O_APPEND destination is disallowed. -	 */ - -	ret = generic_file_splice_write(pipe, filp, ppos, count, flags); -	if (ret > 0) -		written = ret; - -	if (ret >= 0 && nfs_need_sync_write(filp, inode)) { -		int err = vfs_fsync(filp, 0); -		if (err < 0) -			ret = err; -	} -	if (ret > 0) -		nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); -	return ret; -} +EXPORT_SYMBOL_GPL(nfs_file_write);  static int  do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)  {  	struct inode *inode = filp->f_mapping->host;  	int status = 0; +	unsigned int saved_type = fl->fl_type;  	/* Try local locking first */  	posix_test_lock(filp, fl); @@ -700,8 +703,9 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)  		/* found a conflict */  		goto out;  	} +	fl->fl_type = saved_type; -	if (nfs_have_delegation(inode, FMODE_READ)) +	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))  		goto out_noconflict;  	if (is_local) @@ -735,6 +739,7 @@ static int  do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)  {  	struct inode *inode = filp->f_mapping->host; +	struct nfs_lock_context *l_ctx;  	int status;  	/* @@ -743,6 +748,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)  	 */  	nfs_sync_mapping(filp->f_mapping); +	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); +	if (!IS_ERR(l_ctx)) { +		status = nfs_iocounter_wait(&l_ctx->io_count); +		nfs_put_lock_context(l_ctx); +		if (status < 0) +			return status; +	} +  	/* NOTE: special case  	 * 	If we're signalled while cleaning up locks on process exit, we  	 * 	still need to complete the unlock. 
@@ -796,7 +809,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)  	 * This makes locking act as a cache coherency point.  	 */  	nfs_sync_mapping(filp->f_mapping); -	if (!nfs_have_delegation(inode, FMODE_READ)) { +	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {  		if (is_time_granular(&NFS_SERVER(inode)->time_delta))  			__nfs_revalidate_inode(NFS_SERVER(inode), inode);  		else @@ -809,16 +822,14 @@ out:  /*   * Lock a (portion of) a file   */ -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) +int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)  {  	struct inode *inode = filp->f_mapping->host;  	int ret = -ENOLCK;  	int is_local = 0; -	dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name, -			fl->fl_type, fl->fl_flags, +	dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n", +			filp, fl->fl_type, fl->fl_flags,  			(long long)fl->fl_start, (long long)fl->fl_end);  	nfs_inc_stats(inode, NFSIOS_VFSLOCK); @@ -845,44 +856,68 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)  out_err:  	return ret;  } +EXPORT_SYMBOL_GPL(nfs_lock);  /*   * Lock a (portion of) a file   */ -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) +int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)  {  	struct inode *inode = filp->f_mapping->host;  	int is_local = 0; -	dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n", -			filp->f_path.dentry->d_parent->d_name.name, -			filp->f_path.dentry->d_name.name, -			fl->fl_type, fl->fl_flags); +	dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n", +			filp, fl->fl_type, fl->fl_flags);  	if (!(fl->fl_flags & FL_FLOCK))  		return -ENOLCK; +	/* +	 * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of +	 * any standard. 
In principle we might be able to support LOCK_MAND +	 * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the +	 * NFS code is not set up for it. +	 */ +	if (fl->fl_type & LOCK_MAND) +		return -EINVAL; +  	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)  		is_local = 1;  	/* We're simulating flock() locks using posix locks on the server */ -	fl->fl_owner = (fl_owner_t)filp; -	fl->fl_start = 0; -	fl->fl_end = OFFSET_MAX; -  	if (fl->fl_type == F_UNLCK)  		return do_unlk(filp, cmd, fl, is_local);  	return do_setlk(filp, cmd, fl, is_local);  } +EXPORT_SYMBOL_GPL(nfs_flock);  /*   * There is no protocol support for leases, so we have no way to implement   * them correctly in the face of opens by other clients.   */ -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) +int nfs_setlease(struct file *file, long arg, struct file_lock **fl)  { -	dprintk("NFS: setlease(%s/%s, arg=%ld)\n", -			file->f_path.dentry->d_parent->d_name.name, -			file->f_path.dentry->d_name.name, arg); +	dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg);  	return -EINVAL;  } +EXPORT_SYMBOL_GPL(nfs_setlease); + +const struct file_operations nfs_file_operations = { +	.llseek		= nfs_file_llseek, +	.read		= new_sync_read, +	.write		= new_sync_write, +	.read_iter	= nfs_file_read, +	.write_iter	= nfs_file_write, +	.mmap		= nfs_file_mmap, +	.open		= nfs_file_open, +	.flush		= nfs_file_flush, +	.release	= nfs_file_release, +	.fsync		= nfs_file_fsync, +	.lock		= nfs_lock, +	.flock		= nfs_flock, +	.splice_read	= nfs_file_splice_read, +	.splice_write	= iter_file_splice_write, +	.check_flags	= nfs_check_flags, +	.setlease	= nfs_setlease, +}; +EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile new file mode 100644 index 00000000000..8516cdffb9e --- /dev/null +++ b/fs/nfs/filelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Files Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FILE_LAYOUT) += 
nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c new file mode 100644 index 00000000000..d2eba1c13b7 --- /dev/null +++ b/fs/nfs/filelayout/filelayout.c @@ -0,0 +1,1406 @@ +/* + *  Module for the pnfs nfs4 file layout driver. + *  Defines all I/O and Policy interface operations, plus code + *  to register itself with the pNFS client. + * + *  Copyright (c) 2002 + *  The Regents of the University of Michigan + *  All Rights Reserved + * + *  Dean Hildebrand <dhildebz@umich.edu> + * + *  Permission is granted to use, copy, create derivative works, and + *  redistribute this software and such derivative works for any purpose, + *  so long as the name of the University of Michigan is not used in + *  any advertising or publicity pertaining to the use or distribution + *  of this software without specific, written prior authorization. If + *  the above copyright notice or any other identification of the + *  University of Michigan is included in any copy of any portion of + *  this software, then the disclaimer below must also be included. + * + *  This software is provided as is, without representation or warranty + *  of any kind either express or implied, including without limitation + *  the implied warranties of merchantability, fitness for a particular + *  purpose, or noninfringement.  The Regents of the University of + *  Michigan shall not be liable for any damages, including special, + *  indirect, incidental, or consequential damages, with respect to any + *  claim arising out of or in connection with the use of the software, + *  even if it has been or is hereafter advised of the possibility of + *  such damages. 
+ */ + +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/module.h> + +#include <linux/sunrpc/metrics.h> + +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "filelayout.h" +#include "../nfs4trace.h" + +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +#define FILELAYOUT_POLL_RETRY_MAX     (15*HZ) + +static loff_t +filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, +			    loff_t offset) +{ +	u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; +	u64 stripe_no; +	u32 rem; + +	offset -= flseg->pattern_offset; +	stripe_no = div_u64(offset, stripe_width); +	div_u64_rem(offset, flseg->stripe_unit, &rem); + +	return stripe_no * flseg->stripe_unit + rem; +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ +	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + +	switch (flseg->stripe_type) { +	case STRIPE_SPARSE: +		return offset; + +	case STRIPE_DENSE: +		return filelayout_get_dense_offset(flseg, offset); +	} + +	BUG(); +} + +static void filelayout_reset_write(struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; +	struct rpc_task *task = &data->task; + +	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { +		dprintk("%s Reset task %5u for i/o through MDS " +			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__, +			data->task.tk_pid, +			hdr->inode->i_sb->s_id, +			(unsigned long long)NFS_FILEID(hdr->inode), +			data->args.count, +			(unsigned long long)data->args.offset); + +		task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, +							&hdr->pages, +							
hdr->completion_ops, +							hdr->dreq); +	} +} + +static void filelayout_reset_read(struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; +	struct rpc_task *task = &data->task; + +	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { +		dprintk("%s Reset task %5u for i/o through MDS " +			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__, +			data->task.tk_pid, +			hdr->inode->i_sb->s_id, +			(unsigned long long)NFS_FILEID(hdr->inode), +			data->args.count, +			(unsigned long long)data->args.offset); + +		task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, +							&hdr->pages, +							hdr->completion_ops, +							hdr->dreq); +	} +} + +static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) +{ +	if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) +		return; +	pnfs_return_layout(inode); +} + +static int filelayout_async_handle_error(struct rpc_task *task, +					 struct nfs4_state *state, +					 struct nfs_client *clp, +					 struct pnfs_layout_segment *lseg) +{ +	struct pnfs_layout_hdr *lo = lseg->pls_layout; +	struct inode *inode = lo->plh_inode; +	struct nfs_server *mds_server = NFS_SERVER(inode); +	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); +	struct nfs_client *mds_client = mds_server->nfs_client; +	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; + +	if (task->tk_status >= 0) +		return 0; + +	switch (task->tk_status) { +	/* MDS state errors */ +	case -NFS4ERR_DELEG_REVOKED: +	case -NFS4ERR_ADMIN_REVOKED: +	case -NFS4ERR_BAD_STATEID: +		if (state == NULL) +			break; +		nfs_remove_bad_delegation(state->inode); +	case -NFS4ERR_OPENMODE: +		if (state == NULL) +			break; +		if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) +			goto out_bad_stateid; +		goto wait_on_recovery; +	case -NFS4ERR_EXPIRED: +		if (state != NULL) { +			if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) +				goto out_bad_stateid; +		} +		nfs4_schedule_lease_recovery(mds_client); 
+		goto wait_on_recovery; +	/* DS session errors */ +	case -NFS4ERR_BADSESSION: +	case -NFS4ERR_BADSLOT: +	case -NFS4ERR_BAD_HIGH_SLOT: +	case -NFS4ERR_DEADSESSION: +	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: +	case -NFS4ERR_SEQ_FALSE_RETRY: +	case -NFS4ERR_SEQ_MISORDERED: +		dprintk("%s ERROR %d, Reset session. Exchangeid " +			"flags 0x%x\n", __func__, task->tk_status, +			clp->cl_exchange_flags); +		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); +		break; +	case -NFS4ERR_DELAY: +	case -NFS4ERR_GRACE: +		rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); +		break; +	case -NFS4ERR_RETRY_UNCACHED_REP: +		break; +	/* Invalidate Layout errors */ +	case -NFS4ERR_PNFS_NO_LAYOUT: +	case -ESTALE:           /* mapped NFS4ERR_STALE */ +	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */ +	case -EISDIR:           /* mapped NFS4ERR_ISDIR */ +	case -NFS4ERR_FHEXPIRED: +	case -NFS4ERR_WRONG_TYPE: +		dprintk("%s Invalid layout error %d\n", __func__, +			task->tk_status); +		/* +		 * Destroy layout so new i/o will get a new layout. +		 * Layout will not be destroyed until all current lseg +		 * references are put. Mark layout as invalid to resend failed +		 * i/o and all i/o waiting on the slot table to the MDS until +		 * layout is destroyed and a new valid layout is obtained. +		 */ +		pnfs_destroy_layout(NFS_I(inode)); +		rpc_wake_up(&tbl->slot_tbl_waitq); +		goto reset; +	/* RPC connection errors */ +	case -ECONNREFUSED: +	case -EHOSTDOWN: +	case -EHOSTUNREACH: +	case -ENETUNREACH: +	case -EIO: +	case -ETIMEDOUT: +	case -EPIPE: +		dprintk("%s DS connection error %d\n", __func__, +			task->tk_status); +		nfs4_mark_deviceid_unavailable(devid); +		set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); +		rpc_wake_up(&tbl->slot_tbl_waitq); +		/* fall through */ +	default: +reset: +		dprintk("%s Retry through MDS. 
Error %d\n", __func__, +			task->tk_status); +		return -NFS4ERR_RESET_TO_MDS; +	} +out: +	task->tk_status = 0; +	return -EAGAIN; +out_bad_stateid: +	task->tk_status = -EIO; +	return 0; +wait_on_recovery: +	rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); +	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) +		rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); +	goto out; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, +				struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; +	int err; + +	trace_nfs4_pnfs_read(data, task->tk_status); +	err = filelayout_async_handle_error(task, data->args.context->state, +					    data->ds_clp, hdr->lseg); + +	switch (err) { +	case -NFS4ERR_RESET_TO_MDS: +		filelayout_reset_read(data); +		return task->tk_status; +	case -EAGAIN: +		rpc_restart_call_prepare(task); +		return -EAGAIN; +	} + +	return 0; +} + +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. 
+ */ +static void +filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) +{ +	struct nfs_pgio_header *hdr = wdata->header; + +	if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || +	    wdata->res.verf->committed == NFS_FILE_SYNC) +		return; + +	pnfs_set_layoutcommit(wdata); +	dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, +		(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); +} + +bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node) +{ +	return filelayout_test_devid_invalid(node) || +		nfs4_test_deviceid_unavailable(node); +} + +static bool +filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) +{ +	struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg); + +	return filelayout_test_devid_unavailable(node); +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void filelayout_read_prepare(struct rpc_task *task, void *data) +{ +	struct nfs_pgio_data *rdata = data; + +	if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { +		rpc_exit(task, -EIO); +		return; +	} +	if (filelayout_reset_to_mds(rdata->header->lseg)) { +		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); +		filelayout_reset_read(rdata); +		rpc_exit(task, 0); +		return; +	} +	rdata->pgio_done_cb = filelayout_read_done_cb; + +	if (nfs41_setup_sequence(rdata->ds_clp->cl_session, +			&rdata->args.seq_args, +			&rdata->res.seq_res, +			task)) +		return; +	if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, +			rdata->args.lock_context, FMODE_READ) == -EIO) +		rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ +	struct nfs_pgio_data *rdata = data; + +	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + +	if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && +	    task->tk_status == 0) { +		
nfs41_sequence_done(task, &rdata->res.seq_res); +		return; +	} + +	/* Note this may cause RPC to be resent */ +	rdata->header->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_read_count_stats(struct rpc_task *task, void *data) +{ +	struct nfs_pgio_data *rdata = data; + +	rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); +} + +static void filelayout_read_release(void *data) +{ +	struct nfs_pgio_data *rdata = data; +	struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; + +	filelayout_fenceme(lo->plh_inode, lo); +	nfs_put_client(rdata->ds_clp); +	rdata->header->mds_ops->rpc_release(data); +} + +static int filelayout_write_done_cb(struct rpc_task *task, +				struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; +	int err; + +	trace_nfs4_pnfs_write(data, task->tk_status); +	err = filelayout_async_handle_error(task, data->args.context->state, +					    data->ds_clp, hdr->lseg); + +	switch (err) { +	case -NFS4ERR_RESET_TO_MDS: +		filelayout_reset_write(data); +		return task->tk_status; +	case -EAGAIN: +		rpc_restart_call_prepare(task); +		return -EAGAIN; +	} + +	filelayout_set_layoutcommit(data); +	return 0; +} + +/* Fake up some data that will cause nfs_commit_release to retry the writes. 
*/ +static void prepare_to_resend_writes(struct nfs_commit_data *data) +{ +	struct nfs_page *first = nfs_list_entry(data->pages.next); + +	data->task.tk_status = 0; +	memcpy(&data->verf.verifier, &first->wb_verf, +	       sizeof(data->verf.verifier)); +	data->verf.verifier.data[0]++; /* ensure verifier mismatch */ +} + +static int filelayout_commit_done_cb(struct rpc_task *task, +				     struct nfs_commit_data *data) +{ +	int err; + +	trace_nfs4_pnfs_commit_ds(data, task->tk_status); +	err = filelayout_async_handle_error(task, NULL, data->ds_clp, +					    data->lseg); + +	switch (err) { +	case -NFS4ERR_RESET_TO_MDS: +		prepare_to_resend_writes(data); +		return -EAGAIN; +	case -EAGAIN: +		rpc_restart_call_prepare(task); +		return -EAGAIN; +	} + +	return 0; +} + +static void filelayout_write_prepare(struct rpc_task *task, void *data) +{ +	struct nfs_pgio_data *wdata = data; + +	if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { +		rpc_exit(task, -EIO); +		return; +	} +	if (filelayout_reset_to_mds(wdata->header->lseg)) { +		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); +		filelayout_reset_write(wdata); +		rpc_exit(task, 0); +		return; +	} +	if (nfs41_setup_sequence(wdata->ds_clp->cl_session, +			&wdata->args.seq_args, +			&wdata->res.seq_res, +			task)) +		return; +	if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, +			wdata->args.lock_context, FMODE_WRITE) == -EIO) +		rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ +	struct nfs_pgio_data *wdata = data; + +	if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && +	    task->tk_status == 0) { +		nfs41_sequence_done(task, &wdata->res.seq_res); +		return; +	} + +	/* Note this may cause RPC to be resent */ +	wdata->header->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_write_count_stats(struct rpc_task *task, void *data) +{ +	struct nfs_pgio_data *wdata = data; 
+ +	rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); +} + +static void filelayout_write_release(void *data) +{ +	struct nfs_pgio_data *wdata = data; +	struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; + +	filelayout_fenceme(lo->plh_inode, lo); +	nfs_put_client(wdata->ds_clp); +	wdata->header->mds_ops->rpc_release(data); +} + +static void filelayout_commit_prepare(struct rpc_task *task, void *data) +{ +	struct nfs_commit_data *wdata = data; + +	nfs41_setup_sequence(wdata->ds_clp->cl_session, +			&wdata->args.seq_args, +			&wdata->res.seq_res, +			task); +} + +static void filelayout_write_commit_done(struct rpc_task *task, void *data) +{ +	struct nfs_commit_data *wdata = data; + +	/* Note this may cause RPC to be resent */ +	wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_commit_count_stats(struct rpc_task *task, void *data) +{ +	struct nfs_commit_data *cdata = data; + +	rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); +} + +static void filelayout_commit_release(void *calldata) +{ +	struct nfs_commit_data *data = calldata; + +	data->completion_ops->completion(data); +	pnfs_put_lseg(data->lseg); +	nfs_put_client(data->ds_clp); +	nfs_commitdata_release(data); +} + +static const struct rpc_call_ops filelayout_read_call_ops = { +	.rpc_call_prepare = filelayout_read_prepare, +	.rpc_call_done = filelayout_read_call_done, +	.rpc_count_stats = filelayout_read_count_stats, +	.rpc_release = filelayout_read_release, +}; + +static const struct rpc_call_ops filelayout_write_call_ops = { +	.rpc_call_prepare = filelayout_write_prepare, +	.rpc_call_done = filelayout_write_call_done, +	.rpc_count_stats = filelayout_write_count_stats, +	.rpc_release = filelayout_write_release, +}; + +static const struct rpc_call_ops filelayout_commit_call_ops = { +	.rpc_call_prepare = filelayout_commit_prepare, +	.rpc_call_done = filelayout_write_commit_done, +	.rpc_count_stats = filelayout_commit_count_stats, +	
.rpc_release = filelayout_commit_release, +}; + +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; +	struct pnfs_layout_segment *lseg = hdr->lseg; +	struct nfs4_pnfs_ds *ds; +	struct rpc_clnt *ds_clnt; +	loff_t offset = data->args.offset; +	u32 j, idx; +	struct nfs_fh *fh; + +	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", +		__func__, hdr->inode->i_ino, +		data->args.pgbase, (size_t)data->args.count, offset); + +	/* Retrieve the correct rpc_client for the byte range */ +	j = nfs4_fl_calc_j_index(lseg, offset); +	idx = nfs4_fl_calc_ds_index(lseg, j); +	ds = nfs4_fl_prepare_ds(lseg, idx); +	if (!ds) +		return PNFS_NOT_ATTEMPTED; + +	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); +	if (IS_ERR(ds_clnt)) +		return PNFS_NOT_ATTEMPTED; + +	dprintk("%s USE DS: %s cl_count %d\n", __func__, +		ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + +	/* No multipath support. Use first DS */ +	atomic_inc(&ds->ds_clp->cl_count); +	data->ds_clp = ds->ds_clp; +	data->ds_idx = idx; +	fh = nfs4_fl_select_ds_fh(lseg, j); +	if (fh) +		data->args.fh = fh; + +	data->args.offset = filelayout_get_dserver_offset(lseg, offset); +	data->mds_offset = offset; + +	/* Perform an asynchronous read to ds */ +	nfs_initiate_pgio(ds_clnt, data, +			    &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); +	return PNFS_ATTEMPTED; +} + +/* Perform async writes. 
*/ +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) +{ +	struct nfs_pgio_header *hdr = data->header; +	struct pnfs_layout_segment *lseg = hdr->lseg; +	struct nfs4_pnfs_ds *ds; +	struct rpc_clnt *ds_clnt; +	loff_t offset = data->args.offset; +	u32 j, idx; +	struct nfs_fh *fh; + +	/* Retrieve the correct rpc_client for the byte range */ +	j = nfs4_fl_calc_j_index(lseg, offset); +	idx = nfs4_fl_calc_ds_index(lseg, j); +	ds = nfs4_fl_prepare_ds(lseg, idx); +	if (!ds) +		return PNFS_NOT_ATTEMPTED; + +	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); +	if (IS_ERR(ds_clnt)) +		return PNFS_NOT_ATTEMPTED; + +	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", +		__func__, hdr->inode->i_ino, sync, (size_t) data->args.count, +		offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + +	data->pgio_done_cb = filelayout_write_done_cb; +	atomic_inc(&ds->ds_clp->cl_count); +	data->ds_clp = ds->ds_clp; +	data->ds_idx = idx; +	fh = nfs4_fl_select_ds_fh(lseg, j); +	if (fh) +		data->args.fh = fh; + +	data->args.offset = filelayout_get_dserver_offset(lseg, offset); + +	/* Perform an asynchronous write */ +	nfs_initiate_pgio(ds_clnt, data, +				    &filelayout_write_call_ops, sync, +				    RPC_TASK_SOFTCONN); +	return PNFS_ATTEMPTED; +} + +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * At this point no generic layer initialization of the lseg has occurred, + * and nothing has been added to the layout_hdr cache. 
+ * + */ +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, +			struct nfs4_filelayout_segment *fl, +			struct nfs4_layoutget_res *lgr, +			struct nfs4_deviceid *id, +			gfp_t gfp_flags) +{ +	struct nfs4_deviceid_node *d; +	struct nfs4_file_layout_dsaddr *dsaddr; +	int status = -EINVAL; + +	dprintk("--> %s\n", __func__); + +	/* FIXME: remove this check when layout segment support is added */ +	if (lgr->range.offset != 0 || +	    lgr->range.length != NFS4_MAX_UINT64) { +		dprintk("%s Only whole file layouts supported. Use MDS i/o\n", +			__func__); +		goto out; +	} + +	if (fl->pattern_offset > lgr->range.offset) { +		dprintk("%s pattern_offset %lld too large\n", +				__func__, fl->pattern_offset); +		goto out; +	} + +	if (!fl->stripe_unit) { +		dprintk("%s Invalid stripe unit (%u)\n", +			__func__, fl->stripe_unit); +		goto out; +	} + +	/* find and reference the deviceid */ +	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, +				   NFS_SERVER(lo->plh_inode)->nfs_client, id); +	if (d == NULL) { +		dsaddr = filelayout_get_device_info(lo->plh_inode, id, +				lo->plh_lc_cred, gfp_flags); +		if (dsaddr == NULL) +			goto out; +	} else +		dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); +	/* Found deviceid is unavailable */ +	if (filelayout_test_devid_unavailable(&dsaddr->id_node)) +			goto out_put; + +	fl->dsaddr = dsaddr; + +	if (fl->first_stripe_index >= dsaddr->stripe_count) { +		dprintk("%s Bad first_stripe_index %u\n", +				__func__, fl->first_stripe_index); +		goto out_put; +	} + +	if ((fl->stripe_type == STRIPE_SPARSE && +	    fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || +	    (fl->stripe_type == STRIPE_DENSE && +	    fl->num_fh != dsaddr->stripe_count)) { +		dprintk("%s num_fh %u not valid for given packing\n", +			__func__, fl->num_fh); +		goto out_put; +	} + +	status = 0; +out: +	dprintk("--> %s returns %d\n", __func__, status); +	return status; +out_put: +	nfs4_fl_put_deviceid(dsaddr); +	goto out; +} + 
+static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +{ +	int i; + +	for (i = 0; i < fl->num_fh; i++) { +		if (!fl->fh_array[i]) +			break; +		kfree(fl->fh_array[i]); +	} +	kfree(fl->fh_array); +	fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) +{ +	filelayout_free_fh_array(fl); +	kfree(fl); +} + +static int +filelayout_decode_layout(struct pnfs_layout_hdr *flo, +			 struct nfs4_filelayout_segment *fl, +			 struct nfs4_layoutget_res *lgr, +			 struct nfs4_deviceid *id, +			 gfp_t gfp_flags) +{ +	struct xdr_stream stream; +	struct xdr_buf buf; +	struct page *scratch; +	__be32 *p; +	uint32_t nfl_util; +	int i; + +	dprintk("%s: set_layout_map Begin\n", __func__); + +	scratch = alloc_page(gfp_flags); +	if (!scratch) +		return -ENOMEM; + +	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); +	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + +	/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), +	 * num_fh (4) */ +	p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); +	if (unlikely(!p)) +		goto out_err; + +	memcpy(id, p, sizeof(*id)); +	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); +	nfs4_print_deviceid(id); + +	nfl_util = be32_to_cpup(p++); +	if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) +		fl->commit_through_mds = 1; +	if (nfl_util & NFL4_UFLG_DENSE) +		fl->stripe_type = STRIPE_DENSE; +	else +		fl->stripe_type = STRIPE_SPARSE; +	fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + +	fl->first_stripe_index = be32_to_cpup(p++); +	p = xdr_decode_hyper(p, &fl->pattern_offset); +	fl->num_fh = be32_to_cpup(p++); + +	dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", +		__func__, nfl_util, fl->num_fh, fl->first_stripe_index, +		fl->pattern_offset); + +	/* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 
+	 * Futher checking is done in filelayout_check_layout */ +	if (fl->num_fh > +	    max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) +		goto out_err; + +	if (fl->num_fh > 0) { +		fl->fh_array = kcalloc(fl->num_fh, sizeof(fl->fh_array[0]), +				       gfp_flags); +		if (!fl->fh_array) +			goto out_err; +	} + +	for (i = 0; i < fl->num_fh; i++) { +		/* Do we want to use a mempool here? */ +		fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); +		if (!fl->fh_array[i]) +			goto out_err_free; + +		p = xdr_inline_decode(&stream, 4); +		if (unlikely(!p)) +			goto out_err_free; +		fl->fh_array[i]->size = be32_to_cpup(p++); +		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { +			printk(KERN_ERR "NFS: Too big fh %d received %d\n", +			       i, fl->fh_array[i]->size); +			goto out_err_free; +		} + +		p = xdr_inline_decode(&stream, fl->fh_array[i]->size); +		if (unlikely(!p)) +			goto out_err_free; +		memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); +		dprintk("DEBUG: %s: fh len %d\n", __func__, +			fl->fh_array[i]->size); +	} + +	__free_page(scratch); +	return 0; + +out_err_free: +	filelayout_free_fh_array(fl); +out_err: +	__free_page(scratch); +	return -EIO; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ +	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + +	dprintk("--> %s\n", __func__); +	nfs4_fl_put_deviceid(fl->dsaddr); +	/* This assumes a single RW lseg */ +	if (lseg->pls_range.iomode == IOMODE_RW) { +		struct nfs4_filelayout *flo; + +		flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); +		flo->commit_info.nbuckets = 0; +		kfree(flo->commit_info.buckets); +		flo->commit_info.buckets = NULL; +	} +	_filelayout_free_lseg(fl); +} + +static int +filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, +			     struct nfs_commit_info *cinfo, +			     gfp_t gfp_flags) +{ +	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); +	struct pnfs_commit_bucket *buckets; +	int size, i; + +	if 
(fl->commit_through_mds) +		return 0; + +	size = (fl->stripe_type == STRIPE_SPARSE) ? +		fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + +	if (cinfo->ds->nbuckets >= size) { +		/* This assumes there is only one IOMODE_RW lseg.  What +		 * we really want to do is have a layout_hdr level +		 * dictionary of <multipath_list4, fh> keys, each +		 * associated with a struct list_head, populated by calls +		 * to filelayout_write_pagelist(). +		 * */ +		return 0; +	} + +	buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), +			  gfp_flags); +	if (!buckets) +		return -ENOMEM; +	for (i = 0; i < size; i++) { +		INIT_LIST_HEAD(&buckets[i].written); +		INIT_LIST_HEAD(&buckets[i].committing); +		/* mark direct verifier as unset */ +		buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +	} + +	spin_lock(cinfo->lock); +	if (cinfo->ds->nbuckets >= size) +		goto out; +	for (i = 0; i < cinfo->ds->nbuckets; i++) { +		list_splice(&cinfo->ds->buckets[i].written, +			    &buckets[i].written); +		list_splice(&cinfo->ds->buckets[i].committing, +			    &buckets[i].committing); +		buckets[i].direct_verf.committed = +			cinfo->ds->buckets[i].direct_verf.committed; +		buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; +		buckets[i].clseg = cinfo->ds->buckets[i].clseg; +	} +	swap(cinfo->ds->buckets, buckets); +	cinfo->ds->nbuckets = size; +out: +	spin_unlock(cinfo->lock); +	kfree(buckets); +	return 0; +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, +		      struct nfs4_layoutget_res *lgr, +		      gfp_t gfp_flags) +{ +	struct nfs4_filelayout_segment *fl; +	int rc; +	struct nfs4_deviceid id; + +	dprintk("--> %s\n", __func__); +	fl = kzalloc(sizeof(*fl), gfp_flags); +	if (!fl) +		return NULL; + +	rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags); +	if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) { +		_filelayout_free_lseg(fl); +		return NULL; +	} +	return &fl->generic_hdr; +} + +/* + * 
filelayout_pg_test(). Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, +		   struct nfs_page *req) +{ +	unsigned int size; +	u64 p_stripe, r_stripe; +	u32 stripe_offset; +	u64 segment_offset = pgio->pg_lseg->pls_range.offset; +	u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + +	/* calls nfs_generic_pg_test */ +	size = pnfs_generic_pg_test(pgio, prev, req); +	if (!size) +		return 0; + +	/* see if req and prev are in the same stripe */ +	if (prev) { +		p_stripe = (u64)req_offset(prev) - segment_offset; +		r_stripe = (u64)req_offset(req) - segment_offset; +		do_div(p_stripe, stripe_unit); +		do_div(r_stripe, stripe_unit); + +		if (p_stripe != r_stripe) +			return 0; +	} + +	/* calculate remaining bytes in the current stripe */ +	div_u64_rem((u64)req_offset(req) - segment_offset, +			stripe_unit, +			&stripe_offset); +	WARN_ON_ONCE(stripe_offset > stripe_unit); +	if (stripe_offset >= stripe_unit) +		return 0; +	return min(stripe_unit - (unsigned int)stripe_offset, size); +} + +static void +filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, +			struct nfs_page *req) +{ +	if (!pgio->pg_lseg) +		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +					   req->wb_context, +					   0, +					   NFS4_MAX_UINT64, +					   IOMODE_READ, +					   GFP_KERNEL); +	/* If no lseg, fall back to read through mds */ +	if (pgio->pg_lseg == NULL) +		nfs_pageio_reset_read_mds(pgio); +} + +static void +filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, +			 struct nfs_page *req) +{ +	struct nfs_commit_info cinfo; +	int status; + +	if (!pgio->pg_lseg) +		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +					   req->wb_context, +					   0, +					   NFS4_MAX_UINT64, +					   IOMODE_RW, +					   GFP_NOFS); +	/* If no lseg, fall 
back to write through mds */ +	if (pgio->pg_lseg == NULL) +		goto out_mds; +	nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); +	status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); +	if (status < 0) { +		pnfs_put_lseg(pgio->pg_lseg); +		pgio->pg_lseg = NULL; +		goto out_mds; +	} +	return; +out_mds: +	nfs_pageio_reset_write_mds(pgio); +} + +static const struct nfs_pageio_ops filelayout_pg_read_ops = { +	.pg_init = filelayout_pg_init_read, +	.pg_test = filelayout_pg_test, +	.pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops filelayout_pg_write_ops = { +	.pg_init = filelayout_pg_init_write, +	.pg_test = filelayout_pg_test, +	.pg_doio = pnfs_generic_pg_writepages, +}; + +static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) +{ +	if (fl->stripe_type == STRIPE_SPARSE) +		return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); +	else +		return j; +} + +/* The generic layer is about to remove the req from the commit list. + * If this will make the bucket empty, it will need to put the lseg reference. 
+ */ +static void +filelayout_clear_request_commit(struct nfs_page *req, +				struct nfs_commit_info *cinfo) +{ +	struct pnfs_layout_segment *freeme = NULL; + +	spin_lock(cinfo->lock); +	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) +		goto out; +	cinfo->ds->nwritten--; +	if (list_is_singular(&req->wb_list)) { +		struct pnfs_commit_bucket *bucket; + +		bucket = list_first_entry(&req->wb_list, +					  struct pnfs_commit_bucket, +					  written); +		freeme = bucket->wlseg; +		bucket->wlseg = NULL; +	} +out: +	nfs_request_remove_commit_list(req, cinfo); +	spin_unlock(cinfo->lock); +	pnfs_put_lseg(freeme); +} + +static struct list_head * +filelayout_choose_commit_list(struct nfs_page *req, +			      struct pnfs_layout_segment *lseg, +			      struct nfs_commit_info *cinfo) +{ +	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); +	u32 i, j; +	struct list_head *list; +	struct pnfs_commit_bucket *buckets; + +	if (fl->commit_through_mds) +		return &cinfo->mds->list; + +	/* Note that we are calling nfs4_fl_calc_j_index on each page +	 * that ends up being committed to a data server.  An attractive +	 * alternative is to add a field to nfs_write_data and nfs_page +	 * to store the value calculated in filelayout_write_pagelist +	 * and just use that here. +	 */ +	j = nfs4_fl_calc_j_index(lseg, req_offset(req)); +	i = select_bucket_index(fl, j); +	spin_lock(cinfo->lock); +	buckets = cinfo->ds->buckets; +	list = &buckets[i].written; +	if (list_empty(list)) { +		/* Non-empty buckets hold a reference on the lseg.  That ref +		 * is normally transferred to the COMMIT call and released +		 * there.  
It could also be released if the last req is pulled +		 * off due to a rewrite, in which case it will be done in +		 * filelayout_clear_request_commit +		 */ +		buckets[i].wlseg = pnfs_get_lseg(lseg); +	} +	set_bit(PG_COMMIT_TO_DS, &req->wb_flags); +	cinfo->ds->nwritten++; +	spin_unlock(cinfo->lock); +	return list; +} + +static void +filelayout_mark_request_commit(struct nfs_page *req, +			       struct pnfs_layout_segment *lseg, +			       struct nfs_commit_info *cinfo) +{ +	struct list_head *list; + +	list = filelayout_choose_commit_list(req, lseg, cinfo); +	nfs_request_add_commit_list(req, list, cinfo); +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ +	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + +	if (flseg->stripe_type == STRIPE_SPARSE) +		return i; +	else +		return nfs4_fl_calc_ds_index(lseg, i); +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ +	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + +	if (flseg->stripe_type == STRIPE_SPARSE) { +		if (flseg->num_fh == 1) +			i = 0; +		else if (flseg->num_fh == 0) +			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */ +			return NULL; +	} +	return flseg->fh_array[i]; +} + +static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) +{ +	struct pnfs_layout_segment *lseg = data->lseg; +	struct nfs4_pnfs_ds *ds; +	struct rpc_clnt *ds_clnt; +	u32 idx; +	struct nfs_fh *fh; + +	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); +	ds = nfs4_fl_prepare_ds(lseg, idx); +	if (!ds) +		goto out_err; + +	ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode); +	if (IS_ERR(ds_clnt)) +		goto out_err; + +	dprintk("%s ino %lu, how %d cl_count %d\n", __func__, +		data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); +	data->commit_done_cb = filelayout_commit_done_cb; +	atomic_inc(&ds->ds_clp->cl_count); +	data->ds_clp = ds->ds_clp; +	fh = 
select_ds_fh_from_commit(lseg, data->ds_commit_index); +	if (fh) +		data->args.fh = fh; +	return nfs_initiate_commit(ds_clnt, data, +				   &filelayout_commit_call_ops, how, +				   RPC_TASK_SOFTCONN); +out_err: +	prepare_to_resend_writes(data); +	filelayout_commit_release(data); +	return -EAGAIN; +} + +static int +transfer_commit_list(struct list_head *src, struct list_head *dst, +		     struct nfs_commit_info *cinfo, int max) +{ +	struct nfs_page *req, *tmp; +	int ret = 0; + +	list_for_each_entry_safe(req, tmp, src, wb_list) { +		if (!nfs_lock_request(req)) +			continue; +		kref_get(&req->wb_kref); +		if (cond_resched_lock(cinfo->lock)) +			list_safe_reset_next(req, tmp, wb_list); +		nfs_request_remove_commit_list(req, cinfo); +		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); +		nfs_list_add_request(req, dst); +		ret++; +		if ((ret == max) && !cinfo->dreq) +			break; +	} +	return ret; +} + +/* Note called with cinfo->lock held. */ +static int +filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, +			       struct nfs_commit_info *cinfo, +			       int max) +{ +	struct list_head *src = &bucket->written; +	struct list_head *dst = &bucket->committing; +	int ret; + +	ret = transfer_commit_list(src, dst, cinfo, max); +	if (ret) { +		cinfo->ds->nwritten -= ret; +		cinfo->ds->ncommitting += ret; +		bucket->clseg = bucket->wlseg; +		if (list_empty(src)) +			bucket->wlseg = NULL; +		else +			pnfs_get_lseg(bucket->clseg); +	} +	return ret; +} + +/* Move reqs from written to committing lists, returning count of number moved. + * Note called with cinfo->lock held. 
+ */ +static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo, +					int max) +{ +	int i, rv = 0, cnt; + +	for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { +		cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i], +						     cinfo, max); +		max -= cnt; +		rv += cnt; +	} +	return rv; +} + +/* Pull everything off the committing lists and dump into @dst */ +static void filelayout_recover_commit_reqs(struct list_head *dst, +					   struct nfs_commit_info *cinfo) +{ +	struct pnfs_commit_bucket *b; +	struct pnfs_layout_segment *freeme; +	int i; + +restart: +	spin_lock(cinfo->lock); +	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { +		if (transfer_commit_list(&b->written, dst, cinfo, 0)) { +			freeme = b->wlseg; +			b->wlseg = NULL; +			spin_unlock(cinfo->lock); +			pnfs_put_lseg(freeme); +			goto restart; +		} +	} +	cinfo->ds->nwritten = 0; +	spin_unlock(cinfo->lock); +} + +static unsigned int +alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) +{ +	struct pnfs_ds_commit_info *fl_cinfo; +	struct pnfs_commit_bucket *bucket; +	struct nfs_commit_data *data; +	int i, j; +	unsigned int nreq = 0; +	struct pnfs_layout_segment *freeme; + +	fl_cinfo = cinfo->ds; +	bucket = fl_cinfo->buckets; +	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { +		if (list_empty(&bucket->committing)) +			continue; +		data = nfs_commitdata_alloc(); +		if (!data) +			break; +		data->ds_commit_index = i; +		spin_lock(cinfo->lock); +		data->lseg = bucket->clseg; +		bucket->clseg = NULL; +		spin_unlock(cinfo->lock); +		list_add(&data->pages, list); +		nreq++; +	} + +	/* Clean up on error */ +	for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) { +		if (list_empty(&bucket->committing)) +			continue; +		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); +		spin_lock(cinfo->lock); +		freeme = bucket->clseg; +		bucket->clseg = NULL; +		spin_unlock(cinfo->lock); +		pnfs_put_lseg(freeme); +	} +	/* Caller will clean up 
entries put on list */ +	return nreq; +} + +/* This follows nfs_commit_list pretty closely */ +static int +filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, +			   int how, struct nfs_commit_info *cinfo) +{ +	struct nfs_commit_data *data, *tmp; +	LIST_HEAD(list); +	unsigned int nreq = 0; + +	if (!list_empty(mds_pages)) { +		data = nfs_commitdata_alloc(); +		if (data != NULL) { +			data->lseg = NULL; +			list_add(&data->pages, &list); +			nreq++; +		} else +			nfs_retry_commit(mds_pages, NULL, cinfo); +	} + +	nreq += alloc_ds_commits(cinfo, &list); + +	if (nreq == 0) { +		cinfo->completion_ops->error_cleanup(NFS_I(inode)); +		goto out; +	} + +	atomic_add(nreq, &cinfo->mds->rpcs_out); + +	list_for_each_entry_safe(data, tmp, &list, pages) { +		list_del_init(&data->pages); +		if (!data->lseg) { +			nfs_init_commit(data, mds_pages, NULL, cinfo); +			nfs_initiate_commit(NFS_CLIENT(inode), data, +					    data->mds_ops, how, 0); +		} else { +			struct pnfs_commit_bucket *buckets; + +			buckets = cinfo->ds->buckets; +			nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo); +			filelayout_initiate_commit(data, how); +		} +	} +out: +	cinfo->ds->ncommitting = 0; +	return PNFS_ATTEMPTED; +} + +static void +filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +{ +	nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); +} + +static struct pnfs_layout_hdr * +filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ +	struct nfs4_filelayout *flo; + +	flo = kzalloc(sizeof(*flo), gfp_flags); +	return flo != NULL ? 
&flo->generic_hdr : NULL; +} + +static void +filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ +	kfree(FILELAYOUT_FROM_HDR(lo)); +} + +static struct pnfs_ds_commit_info * +filelayout_get_ds_info(struct inode *inode) +{ +	struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; + +	if (layout == NULL) +		return NULL; +	else +		return &FILELAYOUT_FROM_HDR(layout)->commit_info; +} + +static struct pnfs_layoutdriver_type filelayout_type = { +	.id			= LAYOUT_NFSV4_1_FILES, +	.name			= "LAYOUT_NFSV4_1_FILES", +	.owner			= THIS_MODULE, +	.alloc_layout_hdr	= filelayout_alloc_layout_hdr, +	.free_layout_hdr	= filelayout_free_layout_hdr, +	.alloc_lseg		= filelayout_alloc_lseg, +	.free_lseg		= filelayout_free_lseg, +	.pg_read_ops		= &filelayout_pg_read_ops, +	.pg_write_ops		= &filelayout_pg_write_ops, +	.get_ds_info		= &filelayout_get_ds_info, +	.mark_request_commit	= filelayout_mark_request_commit, +	.clear_request_commit	= filelayout_clear_request_commit, +	.scan_commit_lists	= filelayout_scan_commit_lists, +	.recover_commit_reqs	= filelayout_recover_commit_reqs, +	.commit_pagelist	= filelayout_commit_pagelist, +	.read_pagelist		= filelayout_read_pagelist, +	.write_pagelist		= filelayout_write_pagelist, +	.free_deviceid_node	= filelayout_free_deveiceid_node, +}; + +static int __init nfs4filelayout_init(void) +{ +	printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", +	       __func__); +	return pnfs_register_layoutdriver(&filelayout_type); +} + +static void __exit nfs4filelayout_exit(void) +{ +	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", +	       __func__); +	pnfs_unregister_layoutdriver(&filelayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-1"); + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/filelayout/filelayout.h index bbf60dd2ab9..ffbddf2219e 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -30,10 +30,17 @@  #ifndef 
FS_NFS_NFS4FILELAYOUT_H  #define FS_NFS_NFS4FILELAYOUT_H -#include "pnfs.h" +#include "../pnfs.h"  /* - * Field testing shows we need to support upto 4096 stripe indices. + * Default data server connection timeout and retrans values. + * Set by module parameters dataserver_timeo and dataserver_retrans. + */ +#define NFS4_DEF_DS_TIMEO   600 /* in tenths of a second */ +#define NFS4_DEF_DS_RETRANS 5 + +/* + * Field testing shows we need to support up to 4096 stripe indices.   * We store each index as a u8 (u32 on the wire) to keep the memory footprint   * reasonable. This in turn means we support a maximum of 256   * RFC 5661 multipath_list4 structures. @@ -41,22 +48,34 @@  #define NFS4_PNFS_MAX_STRIPE_CNT 4096  #define NFS4_PNFS_MAX_MULTI_CNT  256 /* 256 fit into a u8 stripe_index */ +/* error codes for internal use */ +#define NFS4ERR_RESET_TO_MDS   12001 +  enum stripetype4 {  	STRIPE_SPARSE = 1,  	STRIPE_DENSE = 2  };  /* Individual ip address */ +struct nfs4_pnfs_ds_addr { +	struct sockaddr_storage	da_addr; +	size_t			da_addrlen; +	struct list_head	da_node;  /* nfs4_pnfs_dev_hlist dev_dslist */ +	char			*da_remotestr;	/* human readable addr+port */ +}; +  struct nfs4_pnfs_ds {  	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */ -	u32			ds_ip_addr; -	u32			ds_port; +	char			*ds_remotestr;	/* comma sep list of addrs */ +	struct list_head	ds_addrs;  	struct nfs_client	*ds_clp;  	atomic_t		ds_count; +	unsigned long		ds_state; +#define NFS4DS_CONNECTING	0	/* ds is establishing connection */  };  struct nfs4_file_layout_dsaddr { -	struct pnfs_deviceid_node	deviceid; +	struct nfs4_deviceid_node	id_node;  	u32				stripe_count;  	u8				*stripe_indices;  	u32				ds_num; @@ -75,6 +94,17 @@ struct nfs4_filelayout_segment {  	struct nfs_fh **fh_array;  }; +struct nfs4_filelayout { +	struct pnfs_layout_hdr generic_hdr; +	struct pnfs_ds_commit_info commit_info; +}; + +static inline struct nfs4_filelayout * +FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) +{ +	
return container_of(lo, struct nfs4_filelayout, generic_hdr); +} +  static inline struct nfs4_filelayout_segment *  FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)  { @@ -83,12 +113,44 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)  			    generic_hdr);  } -extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); +static inline struct nfs4_deviceid_node * +FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) +{ +	return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; +} + +static inline void +filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node) +{ +	u32 *p = (u32 *)&node->deviceid; + +	printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n", +		p[0], p[1], p[2], p[3]); + +	set_bit(NFS_DEVICEID_INVALID, &node->flags); +} + +static inline bool +filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) +{ +	return test_bit(NFS_DEVICEID_INVALID, &node->flags); +} + +extern bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node); + +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); +  extern void print_ds(struct nfs4_pnfs_ds *ds); -extern void print_deviceid(struct nfs4_deviceid *dev_id); -extern struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); +u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, +					u32 ds_idx); +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);  struct nfs4_file_layout_dsaddr * -get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); +filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, +		struct rpc_cred *cred, gfp_t gfp_flags);  #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git 
a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c new file mode 100644 index 00000000000..44bf0140a4c --- /dev/null +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -0,0 +1,843 @@ +/* + *  Device operations for the pnfs nfs4 file layout driver. + * + *  Copyright (c) 2002 + *  The Regents of the University of Michigan + *  All Rights Reserved + * + *  Dean Hildebrand <dhildebz@umich.edu> + *  Garth Goodson   <Garth.Goodson@netapp.com> + * + *  Permission is granted to use, copy, create derivative works, and + *  redistribute this software and such derivative works for any purpose, + *  so long as the name of the University of Michigan is not used in + *  any advertising or publicity pertaining to the use or distribution + *  of this software without specific, written prior authorization. If + *  the above copyright notice or any other identification of the + *  University of Michigan is included in any copy of any portion of + *  this software, then the disclaimer below must also be included. + * + *  This software is provided as is, without representation or warranty + *  of any kind either express or implied, including without limitation + *  the implied warranties of merchantability, fitness for a particular + *  purpose, or noninfringement.  The Regents of the University of + *  Michigan shall not be liable for any damages, including special, + *  indirect, incidental, or consequential damages, with respect to any + *  claim arising out of or in connection with the use of the software, + *  even if it has been or is hereafter advised of the possibility of + *  such damages. 
+ */ + +#include <linux/nfs_fs.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/sunrpc/addr.h> + +#include "../internal.h" +#include "../nfs4session.h" +#include "filelayout.h" + +#define NFSDBG_FACILITY		NFSDBG_PNFS_LD + +static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; +static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + *   - set to 1 on allocation + *   - incremented when a device id maps a data server already in the cache. + *   - decremented when deviceid is removed from the cache. + */ +static DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +void +print_ds(struct nfs4_pnfs_ds *ds) +{ +	if (ds == NULL) { +		printk("%s NULL device\n", __func__); +		return; +	} +	printk("        ds %s\n" +		"        ref count %d\n" +		"        client %p\n" +		"        cl_exchange_flags %x\n", +		ds->ds_remotestr, +		atomic_read(&ds->ds_count), ds->ds_clp, +		ds->ds_clp ? 
ds->ds_clp->cl_exchange_flags : 0); +} + +static bool +same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) +{ +	struct sockaddr_in *a, *b; +	struct sockaddr_in6 *a6, *b6; + +	if (addr1->sa_family != addr2->sa_family) +		return false; + +	switch (addr1->sa_family) { +	case AF_INET: +		a = (struct sockaddr_in *)addr1; +		b = (struct sockaddr_in *)addr2; + +		if (a->sin_addr.s_addr == b->sin_addr.s_addr && +		    a->sin_port == b->sin_port) +			return true; +		break; + +	case AF_INET6: +		a6 = (struct sockaddr_in6 *)addr1; +		b6 = (struct sockaddr_in6 *)addr2; + +		/* LINKLOCAL addresses must have matching scope_id */ +		if (ipv6_addr_src_scope(&a6->sin6_addr) == +		    IPV6_ADDR_SCOPE_LINKLOCAL && +		    a6->sin6_scope_id != b6->sin6_scope_id) +			return false; + +		if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && +		    a6->sin6_port == b6->sin6_port) +			return true; +		break; + +	default: +		dprintk("%s: unhandled address family: %u\n", +			__func__, addr1->sa_family); +		return false; +	} + +	return false; +} + +static bool +_same_data_server_addrs_locked(const struct list_head *dsaddrs1, +			       const struct list_head *dsaddrs2) +{ +	struct nfs4_pnfs_ds_addr *da1, *da2; + +	/* step through both lists, comparing as we go */ +	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), +	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); +	     da1 != NULL && da2 != NULL; +	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), +	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { +		if (!same_sockaddr((struct sockaddr *)&da1->da_addr, +				   (struct sockaddr *)&da2->da_addr)) +			return false; +	} +	if (da1 == NULL && da2 == NULL) +		return true; + +	return false; +} + +/* + * Lookup DS by addresses.  
nfs4_ds_cache_lock is held + */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(const struct list_head *dsaddrs) +{ +	struct nfs4_pnfs_ds *ds; + +	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) +		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) +			return ds; +	return NULL; +} + +/* + * Create an rpc connection to the nfs4_pnfs_ds data server + * Currently only supports IPv4 and IPv6 addresses + */ +static int +nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) +{ +	struct nfs_client *clp = ERR_PTR(-EIO); +	struct nfs4_pnfs_ds_addr *da; +	int status = 0; + +	dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, +		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); + +	list_for_each_entry(da, &ds->ds_addrs, da_node) { +		dprintk("%s: DS %s: trying address %s\n", +			__func__, ds->ds_remotestr, da->da_remotestr); + +		clp = nfs4_set_ds_client(mds_srv->nfs_client, +					(struct sockaddr *)&da->da_addr, +					da->da_addrlen, IPPROTO_TCP, +					dataserver_timeo, dataserver_retrans); +		if (!IS_ERR(clp)) +			break; +	} + +	if (IS_ERR(clp)) { +		status = PTR_ERR(clp); +		goto out; +	} + +	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); +	if (status) +		goto out_put; + +	smp_wmb(); +	ds->ds_clp = clp; +	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); +out: +	return status; +out_put: +	nfs_put_client(clp); +	goto out; +} + +static void +destroy_ds(struct nfs4_pnfs_ds *ds) +{ +	struct nfs4_pnfs_ds_addr *da; + +	dprintk("--> %s\n", __func__); +	ifdebug(FACILITY) +		print_ds(ds); + +	if (ds->ds_clp) +		nfs_put_client(ds->ds_clp); + +	while (!list_empty(&ds->ds_addrs)) { +		da = list_first_entry(&ds->ds_addrs, +				      struct nfs4_pnfs_ds_addr, +				      da_node); +		list_del_init(&da->da_node); +		kfree(da->da_remotestr); +		kfree(da); +	} + +	kfree(ds->ds_remotestr); +	kfree(ds); +} + +void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ +	struct 
nfs4_pnfs_ds *ds; +	int i; + +	nfs4_print_deviceid(&dsaddr->id_node.deviceid); + +	for (i = 0; i < dsaddr->ds_num; i++) { +		ds = dsaddr->ds_list[i]; +		if (ds != NULL) { +			if (atomic_dec_and_lock(&ds->ds_count, +						&nfs4_ds_cache_lock)) { +				list_del_init(&ds->ds_node); +				spin_unlock(&nfs4_ds_cache_lock); +				destroy_ds(ds); +			} +		} +	} +	kfree(dsaddr->stripe_indices); +	kfree(dsaddr); +} + +/* + * Create a string with a human readable address and port to avoid + * complicated setup around many dprintks. + */ +static char * +nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) +{ +	struct nfs4_pnfs_ds_addr *da; +	char *remotestr; +	size_t len; +	char *p; + +	len = 3;        /* '{', '}' and eol */ +	list_for_each_entry(da, dsaddrs, da_node) { +		len += strlen(da->da_remotestr) + 1;    /* string plus comma */ +	} + +	remotestr = kzalloc(len, gfp_flags); +	if (!remotestr) +		return NULL; + +	p = remotestr; +	*(p++) = '{'; +	len--; +	list_for_each_entry(da, dsaddrs, da_node) { +		size_t ll = strlen(da->da_remotestr); + +		if (ll > len) +			goto out_err; + +		memcpy(p, da->da_remotestr, ll); +		p += ll; +		len -= ll; + +		if (len < 1) +			goto out_err; +		(*p++) = ','; +		len--; +	} +	if (len < 2) +		goto out_err; +	*(p++) = '}'; +	*p = '\0'; +	return remotestr; +out_err: +	kfree(remotestr); +	return NULL; +} + +static struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +{ +	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; +	char *remotestr; + +	if (list_empty(dsaddrs)) { +		dprintk("%s: no addresses defined\n", __func__); +		goto out; +	} + +	ds = kzalloc(sizeof(*ds), gfp_flags); +	if (!ds) +		goto out; + +	/* this is only used for debugging, so it's ok if it's NULL */ +	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); + +	spin_lock(&nfs4_ds_cache_lock); +	tmp_ds = _data_server_lookup_locked(dsaddrs); +	if (tmp_ds == NULL) { +		INIT_LIST_HEAD(&ds->ds_addrs); +		list_splice_init(dsaddrs, &ds->ds_addrs); +		
ds->ds_remotestr = remotestr; +		atomic_set(&ds->ds_count, 1); +		INIT_LIST_HEAD(&ds->ds_node); +		ds->ds_clp = NULL; +		list_add(&ds->ds_node, &nfs4_data_server_cache); +		dprintk("%s add new data server %s\n", __func__, +			ds->ds_remotestr); +	} else { +		kfree(remotestr); +		kfree(ds); +		atomic_inc(&tmp_ds->ds_count); +		dprintk("%s data server %s found, inc'ed ds_count to %d\n", +			__func__, tmp_ds->ds_remotestr, +			atomic_read(&tmp_ds->ds_count)); +		ds = tmp_ds; +	} +	spin_unlock(&nfs4_ds_cache_lock); +out: +	return ds; +} + +/* + * Currently only supports ipv4, ipv6 and one multi-path address. + */ +static struct nfs4_pnfs_ds_addr * +decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags) +{ +	struct nfs4_pnfs_ds_addr *da = NULL; +	char *buf, *portstr; +	__be16 port; +	int nlen, rlen; +	int tmp[2]; +	__be32 *p; +	char *netid, *match_netid; +	size_t len, match_netid_len; +	char *startsep = ""; +	char *endsep = ""; + + +	/* r_netid */ +	p = xdr_inline_decode(streamp, 4); +	if (unlikely(!p)) +		goto out_err; +	nlen = be32_to_cpup(p++); + +	p = xdr_inline_decode(streamp, nlen); +	if (unlikely(!p)) +		goto out_err; + +	netid = kmalloc(nlen+1, gfp_flags); +	if (unlikely(!netid)) +		goto out_err; + +	netid[nlen] = '\0'; +	memcpy(netid, p, nlen); + +	/* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ +	p = xdr_inline_decode(streamp, 4); +	if (unlikely(!p)) +		goto out_free_netid; +	rlen = be32_to_cpup(p); + +	p = xdr_inline_decode(streamp, rlen); +	if (unlikely(!p)) +		goto out_free_netid; + +	/* port is ".ABC.DEF", 8 chars max */ +	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { +		dprintk("%s: Invalid address, length %d\n", __func__, +			rlen); +		goto out_free_netid; +	} +	buf = kmalloc(rlen + 1, gfp_flags); +	if (!buf) { +		dprintk("%s: Not enough memory\n", __func__); +		goto out_free_netid; +	} +	buf[rlen] = '\0'; +	memcpy(buf, p, rlen); + +	/* replace port '.' 
with '-' */ +	portstr = strrchr(buf, '.'); +	if (!portstr) { +		dprintk("%s: Failed finding expected dot in port\n", +			__func__); +		goto out_free_buf; +	} +	*portstr = '-'; + +	/* find '.' between address and port */ +	portstr = strrchr(buf, '.'); +	if (!portstr) { +		dprintk("%s: Failed finding expected dot between address and " +			"port\n", __func__); +		goto out_free_buf; +	} +	*portstr = '\0'; + +	da = kzalloc(sizeof(*da), gfp_flags); +	if (unlikely(!da)) +		goto out_free_buf; + +	INIT_LIST_HEAD(&da->da_node); + +	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, +		      sizeof(da->da_addr))) { +		dprintk("%s: error parsing address %s\n", __func__, buf); +		goto out_free_da; +	} + +	portstr++; +	sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); +	port = htons((tmp[0] << 8) | (tmp[1])); + +	switch (da->da_addr.ss_family) { +	case AF_INET: +		((struct sockaddr_in *)&da->da_addr)->sin_port = port; +		da->da_addrlen = sizeof(struct sockaddr_in); +		match_netid = "tcp"; +		match_netid_len = 3; +		break; + +	case AF_INET6: +		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; +		da->da_addrlen = sizeof(struct sockaddr_in6); +		match_netid = "tcp6"; +		match_netid_len = 4; +		startsep = "["; +		endsep = "]"; +		break; + +	default: +		dprintk("%s: unsupported address family: %u\n", +			__func__, da->da_addr.ss_family); +		goto out_free_da; +	} + +	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { +		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", +			__func__, netid, match_netid); +		goto out_free_da; +	} + +	/* save human readable address */ +	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; +	da->da_remotestr = kzalloc(len, gfp_flags); + +	/* NULL is ok, only used for dprintk */ +	if (da->da_remotestr) +		snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, +			 buf, endsep, ntohs(port)); + +	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); +	kfree(buf); +	kfree(netid); +	return da; + 
+out_free_da: +	kfree(da); +out_free_buf: +	dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); +	kfree(buf); +out_free_netid: +	kfree(netid); +out_err: +	return NULL; +} + +/* Decode opaque device data and return the result */ +static struct nfs4_file_layout_dsaddr* +decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) +{ +	int i; +	u32 cnt, num; +	u8 *indexp; +	__be32 *p; +	u8 *stripe_indices; +	u8 max_stripe_index; +	struct nfs4_file_layout_dsaddr *dsaddr = NULL; +	struct xdr_stream stream; +	struct xdr_buf buf; +	struct page *scratch; +	struct list_head dsaddrs; +	struct nfs4_pnfs_ds_addr *da; + +	/* set up xdr stream */ +	scratch = alloc_page(gfp_flags); +	if (!scratch) +		goto out_err; + +	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); +	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + +	/* Get the stripe count (number of stripe index) */ +	p = xdr_inline_decode(&stream, 4); +	if (unlikely(!p)) +		goto out_err_free_scratch; + +	cnt = be32_to_cpup(p); +	dprintk("%s stripe count  %d\n", __func__, cnt); +	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { +		printk(KERN_WARNING "NFS: %s: stripe count %d greater than " +		       "supported maximum %d\n", __func__, +			cnt, NFS4_PNFS_MAX_STRIPE_CNT); +		goto out_err_free_scratch; +	} + +	/* read stripe indices */ +	stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags); +	if (!stripe_indices) +		goto out_err_free_scratch; + +	p = xdr_inline_decode(&stream, cnt << 2); +	if (unlikely(!p)) +		goto out_err_free_stripe_indices; + +	indexp = &stripe_indices[0]; +	max_stripe_index = 0; +	for (i = 0; i < cnt; i++) { +		*indexp = be32_to_cpup(p++); +		max_stripe_index = max(max_stripe_index, *indexp); +		indexp++; +	} + +	/* Check the multipath list count */ +	p = xdr_inline_decode(&stream, 4); +	if (unlikely(!p)) +		goto out_err_free_stripe_indices; + +	num = be32_to_cpup(p); +	dprintk("%s ds_num %u\n", __func__, num); +	if (num > NFS4_PNFS_MAX_MULTI_CNT) { +		
printk(KERN_WARNING "NFS: %s: multipath count %d greater than " +			"supported maximum %d\n", __func__, +			num, NFS4_PNFS_MAX_MULTI_CNT); +		goto out_err_free_stripe_indices; +	} + +	/* validate stripe indices are all < num */ +	if (max_stripe_index >= num) { +		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n", +			__func__, max_stripe_index, num); +		goto out_err_free_stripe_indices; +	} + +	dsaddr = kzalloc(sizeof(*dsaddr) + +			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)), +			gfp_flags); +	if (!dsaddr) +		goto out_err_free_stripe_indices; + +	dsaddr->stripe_count = cnt; +	dsaddr->stripe_indices = stripe_indices; +	stripe_indices = NULL; +	dsaddr->ds_num = num; +	nfs4_init_deviceid_node(&dsaddr->id_node, +				NFS_SERVER(ino)->pnfs_curr_ld, +				NFS_SERVER(ino)->nfs_client, +				&pdev->dev_id); + +	INIT_LIST_HEAD(&dsaddrs); + +	for (i = 0; i < dsaddr->ds_num; i++) { +		int j; +		u32 mp_count; + +		p = xdr_inline_decode(&stream, 4); +		if (unlikely(!p)) +			goto out_err_free_deviceid; + +		mp_count = be32_to_cpup(p); /* multipath count */ +		for (j = 0; j < mp_count; j++) { +			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, +					    &stream, gfp_flags); +			if (da) +				list_add_tail(&da->da_node, &dsaddrs); +		} +		if (list_empty(&dsaddrs)) { +			dprintk("%s: no suitable DS addresses found\n", +				__func__); +			goto out_err_free_deviceid; +		} + +		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); +		if (!dsaddr->ds_list[i]) +			goto out_err_drain_dsaddrs; + +		/* If DS was already in cache, free ds addrs */ +		while (!list_empty(&dsaddrs)) { +			da = list_first_entry(&dsaddrs, +					      struct nfs4_pnfs_ds_addr, +					      da_node); +			list_del_init(&da->da_node); +			kfree(da->da_remotestr); +			kfree(da); +		} +	} + +	__free_page(scratch); +	return dsaddr; + +out_err_drain_dsaddrs: +	while (!list_empty(&dsaddrs)) { +		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, +				      da_node); +		
list_del_init(&da->da_node); +		kfree(da->da_remotestr); +		kfree(da); +	} +out_err_free_deviceid: +	nfs4_fl_free_deviceid(dsaddr); +	/* stripe_indices was part of dsaddr */ +	goto out_err_free_scratch; +out_err_free_stripe_indices: +	kfree(stripe_indices); +out_err_free_scratch: +	__free_page(scratch); +out_err: +	dprintk("%s ERROR: returning NULL\n", __func__); +	return NULL; +} + +/* + * Decode the opaque device specified in 'dev' and add it to the cache of + * available devices. + */ +static struct nfs4_file_layout_dsaddr * +decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) +{ +	struct nfs4_deviceid_node *d; +	struct nfs4_file_layout_dsaddr *n, *new; + +	new = decode_device(inode, dev, gfp_flags); +	if (!new) { +		printk(KERN_WARNING "NFS: %s: Could not decode or add device\n", +			__func__); +		return NULL; +	} + +	d = nfs4_insert_deviceid_node(&new->id_node); +	n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); +	if (n != new) { +		nfs4_fl_free_deviceid(new); +		return n; +	} + +	return new; +} + +/* + * Retrieve the information for dev_id, add it to the list + * of available devices, and return it. 
+ */ +struct nfs4_file_layout_dsaddr * +filelayout_get_device_info(struct inode *inode, +		struct nfs4_deviceid *dev_id, +		struct rpc_cred *cred, +		gfp_t gfp_flags) +{ +	struct pnfs_device *pdev = NULL; +	u32 max_resp_sz; +	int max_pages; +	struct page **pages = NULL; +	struct nfs4_file_layout_dsaddr *dsaddr = NULL; +	int rc, i; +	struct nfs_server *server = NFS_SERVER(inode); + +	/* +	 * Use the session max response size as the basis for setting +	 * GETDEVICEINFO's maxcount +	 */ +	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; +	max_pages = nfs_page_array_len(0, max_resp_sz); +	dprintk("%s inode %p max_resp_sz %u max_pages %d\n", +		__func__, inode, max_resp_sz, max_pages); + +	pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); +	if (pdev == NULL) +		return NULL; + +	pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); +	if (pages == NULL) { +		kfree(pdev); +		return NULL; +	} +	for (i = 0; i < max_pages; i++) { +		pages[i] = alloc_page(gfp_flags); +		if (!pages[i]) +			goto out_free; +	} + +	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); +	pdev->layout_type = LAYOUT_NFSV4_1_FILES; +	pdev->pages = pages; +	pdev->pgbase = 0; +	pdev->pglen = max_resp_sz; +	pdev->mincount = 0; +	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; + +	rc = nfs4_proc_getdeviceinfo(server, pdev, cred); +	dprintk("%s getdevice info returns %d\n", __func__, rc); +	if (rc) +		goto out_free; + +	/* +	 * Found new device, need to decode it and then add it to the +	 * list of known devices for this mountpoint. 
+	 */ +	dsaddr = decode_and_add_device(inode, pdev, gfp_flags); +out_free: +	for (i = 0; i < max_pages; i++) +		__free_page(pages[i]); +	kfree(pages); +	kfree(pdev); +	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); +	return dsaddr; +} + +void +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ +	nfs4_put_deviceid_node(&dsaddr->id_node); +} + +/* + * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit + * Then: ((res + fsi) % dsaddr->stripe_count) + */ +u32 +nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ +	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); +	u64 tmp; + +	tmp = offset - flseg->pattern_offset; +	do_div(tmp, flseg->stripe_unit); +	tmp += flseg->first_stripe_index; +	return do_div(tmp, flseg->dsaddr->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) +{ +	return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; +} + +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) +{ +	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); +	u32 i; + +	if (flseg->stripe_type == STRIPE_SPARSE) { +		if (flseg->num_fh == 1) +			i = 0; +		else if (flseg->num_fh == 0) +			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */ +			return NULL; +		else +			i = nfs4_fl_calc_ds_index(lseg, j); +	} else +		i = j; +	return flseg->fh_array[i]; +} + +static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +{ +	might_sleep(); +	wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, +			nfs_wait_bit_killable, TASK_KILLABLE); +} + +static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) +{ +	smp_mb__before_atomic(); +	clear_bit(NFS4DS_CONNECTING, &ds->ds_state); +	smp_mb__after_atomic(); +	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); +} + + +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ +	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; +	struct nfs4_pnfs_ds *ds = 
dsaddr->ds_list[ds_idx]; +	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); +	struct nfs4_pnfs_ds *ret = ds; + +	if (ds == NULL) { +		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", +			__func__, ds_idx); +		filelayout_mark_devid_invalid(devid); +		goto out; +	} +	smp_rmb(); +	if (ds->ds_clp) +		goto out_test_devid; + +	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { +		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); +		int err; + +		err = nfs4_ds_connect(s, ds); +		if (err) +			nfs4_mark_deviceid_unavailable(devid); +		nfs4_clear_ds_conn_bit(ds); +	} else { +		/* Either ds is connected, or ds is NULL */ +		nfs4_wait_ds_connect(ds); +	} +out_test_devid: +	if (filelayout_test_devid_unavailable(devid)) +		ret = NULL; +out: +	return ret; +} + +module_param(dataserver_retrans, uint, 0644); +MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client " +			"retries a request before it attempts further " +			" recovery  action."); +module_param(dataserver_timeo, uint, 0644); +MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " +			"NFSv4.1  client  waits for a response from a " +			" data server before it retries an NFS request."); diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 5b1006480bc..7cf2c4699b0 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,  	auxdata.ctime = nfsi->vfs_inode.i_ctime;  	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) -		auxdata.change_attr = nfsi->change_attr; +		auxdata.change_attr = nfsi->vfs_inode.i_version;  	if (bufmax > sizeof(auxdata))  		bufmax = sizeof(auxdata); @@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,  	auxdata.ctime = nfsi->vfs_inode.i_ctime;  	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) -		
auxdata.change_attr = nfsi->change_attr; +		auxdata.change_attr = nfsi->vfs_inode.i_version;  	if (memcmp(data, &auxdata, datalen) != 0)  		return FSCACHE_CHECKAUX_OBSOLETE; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ce153a6b3ae..3ef01f0ba0b 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)  	/* create a cache index for looking up filehandles */  	clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,  					      &nfs_fscache_server_index_def, -					      clp); +					      clp, true);  	dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",  		 clp, clp->fscache);  } @@ -64,23 +64,12 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)   * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent   * superblock across an automount point of some nature.   */ -void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, -				  struct nfs_clone_mount *mntdata) +void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen)  {  	struct nfs_fscache_key *key, *xkey;  	struct nfs_server *nfss = NFS_SB(sb);  	struct rb_node **p, *parent; -	int diff, ulen; - -	if (uniq) { -		ulen = strlen(uniq); -	} else if (mntdata) { -		struct nfs_server *mnt_s = NFS_SB(mntdata->sb); -		if (mnt_s->fscache_key) { -			uniq = mnt_s->fscache_key->key.uniquifier; -			ulen = mnt_s->fscache_key->key.uniq_len; -		} -	} +	int diff;  	if (!uniq) {  		uniq = ""; @@ -150,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,  	/* create a cache index for looking up filehandles */  	nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,  					       &nfs_fscache_super_index_def, -					       nfss); +					       nfss, true);  	dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",  		 nfss, nfss->fscache);  	return; @@ -189,164 +178,79 @@ void 
nfs_fscache_release_super_cookie(struct super_block *sb)  /*   * Initialise the per-inode cache cookie pointer for an NFS inode.   */ -void nfs_fscache_init_inode_cookie(struct inode *inode) -{ -	NFS_I(inode)->fscache = NULL; -	if (S_ISREG(inode->i_mode)) -		set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); -} - -/* - * Get the per-inode cache cookie for an NFS inode. - */ -static void nfs_fscache_enable_inode_cookie(struct inode *inode) +void nfs_fscache_init_inode(struct inode *inode)  { -	struct super_block *sb = inode->i_sb;  	struct nfs_inode *nfsi = NFS_I(inode); -	if (nfsi->fscache || !NFS_FSCACHE(inode)) +	nfsi->fscache = NULL; +	if (!S_ISREG(inode->i_mode))  		return; - -	if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) { -		nfsi->fscache = fscache_acquire_cookie( -			NFS_SB(sb)->fscache, -			&nfs_fscache_inode_object_def, -			nfsi); - -		dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n", -			 sb, nfsi, nfsi->fscache); -	} +	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, +					       &nfs_fscache_inode_object_def, +					       nfsi, false);  }  /*   * Release a per-inode cookie.   */ -void nfs_fscache_release_inode_cookie(struct inode *inode) -{ -	struct nfs_inode *nfsi = NFS_I(inode); - -	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", -		 nfsi, nfsi->fscache); - -	fscache_relinquish_cookie(nfsi->fscache, 0); -	nfsi->fscache = NULL; -} - -/* - * Retire a per-inode cookie, destroying the data attached to it. 
- */ -void nfs_fscache_zap_inode_cookie(struct inode *inode) +void nfs_fscache_clear_inode(struct inode *inode)  {  	struct nfs_inode *nfsi = NFS_I(inode); +	struct fscache_cookie *cookie = nfs_i_fscache(inode); -	dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n", -		 nfsi, nfsi->fscache); +	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); -	fscache_relinquish_cookie(nfsi->fscache, 1); +	fscache_relinquish_cookie(cookie, false);  	nfsi->fscache = NULL;  } -/* - * Turn off the cache with regard to a per-inode cookie if opened for writing, - * invalidating all the pages in the page cache relating to the associated - * inode to clear the per-page caching. - */ -static void nfs_fscache_disable_inode_cookie(struct inode *inode) +static bool nfs_fscache_can_enable(void *data)  { -	clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); - -	if (NFS_I(inode)->fscache) { -		dfprintk(FSCACHE, -			 "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); +	struct inode *inode = data; -		/* Need to invalidate any mapped pages that were read in before -		 * turning off the cache. 
-		 */ -		if (inode->i_mapping && inode->i_mapping->nrpages) -			invalidate_inode_pages2(inode->i_mapping); - -		nfs_fscache_zap_inode_cookie(inode); -	} -} - -/* - * wait_on_bit() sleep function for uninterruptible waiting - */ -static int nfs_fscache_wait_bit(void *flags) -{ -	schedule(); -	return 0; +	return !inode_is_open_for_write(inode);  }  /* - * Lock against someone else trying to also acquire or relinquish a cookie - */ -static inline void nfs_fscache_inode_lock(struct inode *inode) -{ -	struct nfs_inode *nfsi = NFS_I(inode); - -	while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags)) -		wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK, -			    nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE); -} - -/* - * Unlock cookie management lock - */ -static inline void nfs_fscache_inode_unlock(struct inode *inode) -{ -	struct nfs_inode *nfsi = NFS_I(inode); - -	smp_mb__before_clear_bit(); -	clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags); -	smp_mb__after_clear_bit(); -	wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK); -} - -/* - * Decide if we should enable or disable local caching for this inode. - * - For now, with NFS, only regular files that are open read-only will be able - *   to use the cache. - * - May be invoked multiple times in parallel by parallel nfs_open() functions. - */ -void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) -{ -	if (NFS_FSCACHE(inode)) { -		nfs_fscache_inode_lock(inode); -		if ((filp->f_flags & O_ACCMODE) != O_RDONLY) -			nfs_fscache_disable_inode_cookie(inode); -		else -			nfs_fscache_enable_inode_cookie(inode); -		nfs_fscache_inode_unlock(inode); -	} -} - -/* - * Replace a per-inode cookie due to revalidation detecting a file having - * changed on the server. + * Enable or disable caching for a file that is being opened as appropriate. + * The cookie is allocated when the inode is initialised, but is not enabled at + * that time.  
Enablement is deferred to file-open time to avoid stat() and + * access() thrashing the cache. + * + * For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * + * We enable the cache for an inode if we open it read-only and it isn't + * currently open for writing.  We disable the cache if the inode is open + * write-only. + * + * The caller uses the file struct to pin i_writecount on the inode before + * calling us when a file is opened for writing, so we can make use of that. + * + * Note that this may be invoked multiple times in parallel by parallel + * nfs_open() functions.   */ -void nfs_fscache_reset_inode_cookie(struct inode *inode) +void nfs_fscache_open_file(struct inode *inode, struct file *filp)  {  	struct nfs_inode *nfsi = NFS_I(inode); -	struct nfs_server *nfss = NFS_SERVER(inode); -	struct fscache_cookie *old = nfsi->fscache; +	struct fscache_cookie *cookie = nfs_i_fscache(inode); -	nfs_fscache_inode_lock(inode); -	if (nfsi->fscache) { -		/* retire the current fscache cache and get a new one */ -		fscache_relinquish_cookie(nfsi->fscache, 1); - -		nfsi->fscache = fscache_acquire_cookie( -			nfss->nfs_client->fscache, -			&nfs_fscache_inode_object_def, -			nfsi); +	if (!fscache_cookie_valid(cookie)) +		return; -		dfprintk(FSCACHE, -			 "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n", -			 nfss, nfsi, old, nfsi->fscache); +	if (inode_is_open_for_write(inode)) { +		dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); +		clear_bit(NFS_INO_FSCACHE, &nfsi->flags); +		fscache_disable_cookie(cookie, true); +		fscache_uncache_all_inode_pages(cookie, inode); +	} else { +		dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); +		fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); +		if (fscache_cookie_enabled(cookie)) +			set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);  	} -	nfs_fscache_inode_unlock(inode);  } +EXPORT_SYMBOL_GPL(nfs_fscache_open_file);  /*   * Release the caching state 
associated with a page, if the page isn't busy @@ -356,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)  int nfs_fscache_release_page(struct page *page, gfp_t gfp)  {  	if (PageFsCache(page)) { -		struct nfs_inode *nfsi = NFS_I(page->mapping->host); -		struct fscache_cookie *cookie = nfsi->fscache; +		struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host);  		BUG_ON(!cookie);  		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", -			 cookie, page, nfsi); +			 cookie, page, NFS_I(page->mapping->host));  		if (!fscache_maybe_release_page(cookie, page, gfp))  			return 0; @@ -379,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)   */  void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)  { -	struct nfs_inode *nfsi = NFS_I(inode); -	struct fscache_cookie *cookie = nfsi->fscache; +	struct fscache_cookie *cookie = nfs_i_fscache(inode);  	BUG_ON(!cookie);  	dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", -		 cookie, page, nfsi); +		 cookie, page, NFS_I(inode));  	fscache_wait_on_page_write(cookie, page); @@ -429,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,  	dfprintk(FSCACHE,  		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", -		 NFS_I(inode)->fscache, page, page->index, page->flags, inode); +		 nfs_i_fscache(inode), page, page->index, page->flags, inode); -	ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache, +	ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),  					 page,  					 nfs_readpage_from_fscache_complete,  					 ctx, @@ -471,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,  	int ret;  	dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", -		 NFS_I(inode)->fscache, npages, inode); +		 nfs_i_fscache(inode), npages, inode); -	ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache, +	ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),  					  mapping, 
pages, nr_pages,  					  nfs_readpage_from_fscache_complete,  					  ctx, @@ -518,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)  	dfprintk(FSCACHE,  		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", -		 NFS_I(inode)->fscache, page, page->index, page->flags, sync); +		 nfs_i_fscache(inode), page, page->index, page->flags, sync); -	ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL); +	ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);  	dfprintk(FSCACHE,  		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",  		 page, page->index, page->flags, ret);  	if (ret != 0) { -		fscache_uncache_page(NFS_I(inode)->fscache, page); +		fscache_uncache_page(nfs_i_fscache(inode), page);  		nfs_add_fscache_stats(inode,  				      NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);  		nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index b9c572d0679..d7fe3e799f2 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -73,16 +73,12 @@ extern void nfs_fscache_unregister(void);  extern void nfs_fscache_get_client_cookie(struct nfs_client *);  extern void nfs_fscache_release_client_cookie(struct nfs_client *); -extern void nfs_fscache_get_super_cookie(struct super_block *, -					 const char *, -					 struct nfs_clone_mount *); +extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);  extern void nfs_fscache_release_super_cookie(struct super_block *); -extern void nfs_fscache_init_inode_cookie(struct inode *); -extern void nfs_fscache_release_inode_cookie(struct inode *); -extern void nfs_fscache_zap_inode_cookie(struct inode *); -extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *); -extern void nfs_fscache_reset_inode_cookie(struct inode *); +extern void nfs_fscache_init_inode(struct inode *); +extern void nfs_fscache_clear_inode(struct inode *); +extern void nfs_fscache_open_file(struct 
inode *, struct file *);  extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);  extern int nfs_fscache_release_page(struct page *, gfp_t); @@ -155,6 +151,22 @@ static inline void nfs_readpage_to_fscache(struct inode *inode,  }  /* + * Invalidate the contents of fscache for this inode.  This will not sleep. + */ +static inline void nfs_fscache_invalidate(struct inode *inode) +{ +	fscache_invalidate(NFS_I(inode)->fscache); +} + +/* + * Wait for an object to finish being invalidated. + */ +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) +{ +	fscache_wait_on_invalidate(NFS_I(inode)->fscache); +} + +/*   * indicate the client caching state as readable text   */  static inline const char *nfs_server_fscache_state(struct nfs_server *server) @@ -164,7 +176,6 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)  	return "no ";  } -  #else /* CONFIG_NFS_FSCACHE */  static inline int nfs_fscache_register(void) { return 0; }  static inline void nfs_fscache_unregister(void) {} @@ -172,20 +183,12 @@ static inline void nfs_fscache_unregister(void) {}  static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}  static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} -static inline void nfs_fscache_get_super_cookie( -	struct super_block *sb, -	const char *uniq, -	struct nfs_clone_mount *mntdata) -{ -}  static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} -static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {} -static inline void nfs_fscache_set_inode_cookie(struct inode *inode, -						struct file *filp) {} -static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline void nfs_fscache_init_inode(struct inode *inode) {} +static inline void 
nfs_fscache_clear_inode(struct inode *inode) {} +static inline void nfs_fscache_open_file(struct inode *inode, +					 struct file *filp) {}  static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)  { @@ -213,6 +216,10 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,  static inline void nfs_readpage_to_fscache(struct inode *inode,  					   struct page *page, int sync) {} + +static inline void nfs_fscache_invalidate(struct inode *inode) {} +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {} +  static inline const char *nfs_server_fscache_state(struct nfs_server *server)  {  	return "no "; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index ac7b814ce16..b94f80420a5 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -23,20 +23,15 @@  #include <linux/sunrpc/stats.h>  #include <linux/nfs_fs.h>  #include <linux/nfs_mount.h> -#include <linux/nfs4_mount.h>  #include <linux/lockd/bind.h>  #include <linux/seq_file.h>  #include <linux/mount.h> -#include <linux/nfs_idmap.h>  #include <linux/vfs.h>  #include <linux/namei.h>  #include <linux/security.h> -#include <asm/system.h>  #include <asm/uaccess.h> -#include "nfs4_fs.h" -#include "delegation.h"  #include "internal.h"  #define NFSDBG_FACILITY		NFSDBG_CLIENT @@ -49,11 +44,9 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i  {  	/* The mntroot acts as the dummy root dentry for this superblock */  	if (sb->s_root == NULL) { -		sb->s_root = d_alloc_root(inode); -		if (sb->s_root == NULL) { -			iput(inode); +		sb->s_root = d_make_root(inode); +		if (sb->s_root == NULL)  			return -ENOMEM; -		}  		ihold(inode);  		/*  		 * Ensure that this dentry is invisible to d_find_alias(). @@ -63,9 +56,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i  		 * This again causes shrink_dcache_for_umount_subtree() to  		 * Oops, since the test for IS_ROOT() will fail.  		 
*/ -		spin_lock(&dcache_lock); -		list_del_init(&sb->s_root->d_alias); -		spin_unlock(&dcache_lock); +		spin_lock(&sb->s_root->d_inode->i_lock); +		spin_lock(&sb->s_root->d_lock); +		hlist_del_init(&sb->s_root->d_alias); +		spin_unlock(&sb->s_root->d_lock); +		spin_unlock(&sb->s_root->d_inode->i_lock);  	}  	return 0;  } @@ -73,18 +68,25 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i  /*   * get an NFS2/NFS3 root dentry from the root filehandle   */ -struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) +struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, +			    const char *devname)  {  	struct nfs_server *server = NFS_SB(sb);  	struct nfs_fsinfo fsinfo;  	struct dentry *ret;  	struct inode *inode; +	void *name = kstrdup(devname, GFP_KERNEL);  	int error; +	if (!name) +		return ERR_PTR(-ENOMEM); +  	/* get the actual root for this mount */  	fsinfo.fattr = nfs_alloc_fattr(); -	if (fsinfo.fattr == NULL) +	if (fsinfo.fattr == NULL) { +		kfree(name);  		return ERR_PTR(-ENOMEM); +	}  	error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);  	if (error < 0) { @@ -93,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)  		goto out;  	} -	inode = nfs_fhget(sb, mntfh, fsinfo.fattr); +	inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);  	if (IS_ERR(inode)) {  		dprintk("nfs_get_root: get root inode failed\n");  		ret = ERR_CAST(inode); @@ -117,121 +119,15 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)  	}  	security_d_instantiate(ret, inode); - -	if (ret->d_op == NULL) -		ret->d_op = server->nfs_client->rpc_ops->dentry_ops; -out: -	nfs_free_fattr(fsinfo.fattr); -	return ret; -} - -#ifdef CONFIG_NFS_V4 - -int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) -{ -	struct nfs_fsinfo fsinfo; -	int ret = -ENOMEM; - -	dprintk("--> nfs4_get_rootfh()\n"); - -	fsinfo.fattr = nfs_alloc_fattr(); -	if (fsinfo.fattr == 
NULL) -		goto out; - -	/* Start by getting the root filehandle from the server */ -	ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); -	if (ret < 0) { -		dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); -		goto out; -	} - -	if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) -			|| !S_ISDIR(fsinfo.fattr->mode)) { -		printk(KERN_ERR "nfs4_get_rootfh:" -		       " getroot encountered non-directory\n"); -		ret = -ENOTDIR; -		goto out; -	} - -	if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { -		printk(KERN_ERR "nfs4_get_rootfh:" -		       " getroot obtained referral\n"); -		ret = -EREMOTE; -		goto out; +	spin_lock(&ret->d_lock); +	if (IS_ROOT(ret) && !ret->d_fsdata && +	    !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { +		ret->d_fsdata = name; +		name = NULL;  	} - -	memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +	spin_unlock(&ret->d_lock);  out: +	kfree(name);  	nfs_free_fattr(fsinfo.fattr); -	dprintk("<-- nfs4_get_rootfh() = %d\n", ret);  	return ret;  } - -/* - * get an NFS4 root dentry from the root filehandle - */ -struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) -{ -	struct nfs_server *server = NFS_SB(sb); -	struct nfs_fattr *fattr = NULL; -	struct dentry *ret; -	struct inode *inode; -	int error; - -	dprintk("--> nfs4_get_root()\n"); - -	/* get the info about the server and filesystem */ -	error = nfs4_server_capabilities(server, mntfh); -	if (error < 0) { -		dprintk("nfs_get_root: getcaps error = %d\n", -			-error); -		return ERR_PTR(error); -	} - -	fattr = nfs_alloc_fattr(); -	if (fattr == NULL) -		return ERR_PTR(-ENOMEM);; - -	/* get the actual root for this mount */ -	error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); -	if (error < 0) { -		dprintk("nfs_get_root: getattr error = %d\n", -error); -		ret = ERR_PTR(error); -		goto out; -	} - -	inode = nfs_fhget(sb, mntfh, fattr); -	if (IS_ERR(inode)) { -		dprintk("nfs_get_root: get root inode failed\n"); -		ret = 
ERR_CAST(inode); -		goto out; -	} - -	error = nfs_superblock_set_dummy_root(sb, inode); -	if (error != 0) { -		ret = ERR_PTR(error); -		goto out; -	} - -	/* root dentries normally start off anonymous and get spliced in later -	 * if the dentry tree reaches them; however if the dentry already -	 * exists, we'll pick it up at this point and use it as the root -	 */ -	ret = d_obtain_alias(inode); -	if (IS_ERR(ret)) { -		dprintk("nfs_get_root: get root dentry failed\n"); -		goto out; -	} - -	security_d_instantiate(ret, inode); - -	if (ret->d_op == NULL) -		ret->d_op = server->nfs_client->rpc_ops->dentry_ops; - -out: -	nfs_free_fattr(fattr); -	dprintk("<-- nfs4_get_root()\n"); -	return ret; -} - -#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 4e2d9b6b138..567983d2c0e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -33,25 +33,146 @@   *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS   *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.   
*/ - -#ifdef CONFIG_NFS_USE_NEW_IDMAPPER - -#include <linux/slab.h> -#include <linux/cred.h> +#include <linux/types.h> +#include <linux/parser.h> +#include <linux/fs.h>  #include <linux/nfs_idmap.h> +#include <net/net_namespace.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/key.h>  #include <linux/keyctl.h>  #include <linux/key-type.h> -#include <linux/rcupdate.h> -#include <linux/kernel.h> -#include <linux/err.h> -  #include <keys/user-type.h> +#include <linux/module.h> + +#include "internal.h" +#include "netns.h" +#include "nfs4trace.h"  #define NFS_UINT_MAXLEN 11 -const struct cred *id_resolver_cache; +static const struct cred *id_resolver_cache; +static struct key_type key_type_id_resolver_legacy; + +struct idmap_legacy_upcalldata { +	struct rpc_pipe_msg pipe_msg; +	struct idmap_msg idmap_msg; +	struct key_construction	*key_cons; +	struct idmap *idmap; +}; + +struct idmap { +	struct rpc_pipe_dir_object idmap_pdo; +	struct rpc_pipe		*idmap_pipe; +	struct idmap_legacy_upcalldata *idmap_upcall_data; +	struct mutex		idmap_mutex; +}; + +/** + * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields + * @fattr: fully initialised struct nfs_fattr + * @owner_name: owner name string cache + * @group_name: group name string cache + */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, +		struct nfs4_string *owner_name, +		struct nfs4_string *group_name) +{ +	fattr->owner_name = owner_name; +	fattr->group_name = group_name; +} + +static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) +{ +	fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; +	kfree(fattr->owner_name->data); +} + +static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) +{ +	fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; +	kfree(fattr->group_name->data); +} + +static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ +	struct nfs4_string *owner = fattr->owner_name; +	kuid_t uid; + 
+	if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) +		return false; +	if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { +		fattr->uid = uid; +		fattr->valid |= NFS_ATTR_FATTR_OWNER; +	} +	return true; +} + +static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ +	struct nfs4_string *group = fattr->group_name; +	kgid_t gid; + +	if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) +		return false; +	if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { +		fattr->gid = gid; +		fattr->valid |= NFS_ATTR_FATTR_GROUP; +	} +	return true; +} + +/** + * nfs_fattr_free_names - free up the NFSv4 owner and group strings + * @fattr: a fully initialised nfs_fattr structure + */ +void nfs_fattr_free_names(struct nfs_fattr *fattr) +{ +	if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) +		nfs_fattr_free_owner_name(fattr); +	if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) +		nfs_fattr_free_group_name(fattr); +} + +/** + * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free + * @server: pointer to the filesystem nfs_server structure + * @fattr: a fully initialised nfs_fattr structure + * + * This helper maps the cached NFSv4 owner/group strings in fattr into + * their numeric uid/gid equivalents, and then frees the cached strings. 
+ */ +void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) +{ +	if (nfs_fattr_map_owner_name(server, fattr)) +		nfs_fattr_free_owner_name(fattr); +	if (nfs_fattr_map_group_name(server, fattr)) +		nfs_fattr_free_group_name(fattr); +} + +static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ +	unsigned long val; +	char buf[16]; + +	if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) +		return 0; +	memcpy(buf, name, namelen); +	buf[namelen] = '\0'; +	if (kstrtoul(buf, 0, &val) != 0) +		return 0; +	*res = val; +	return 1; +} + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ +	return snprintf(buf, buflen, "%u", id); +} -struct key_type key_type_id_resolver = { +static struct key_type key_type_id_resolver = {  	.name		= "id_resolver",  	.instantiate	= user_instantiate,  	.match		= user_match, @@ -61,40 +182,45 @@ struct key_type key_type_id_resolver = {  	.read		= user_read,  }; -int nfs_idmap_init(void) +static int nfs_idmap_init_keyring(void)  {  	struct cred *cred;  	struct key *keyring;  	int ret = 0; -	printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); +	printk(KERN_NOTICE "NFS: Registering the %s key type\n", +		key_type_id_resolver.name);  	cred = prepare_kernel_cred(NULL);  	if (!cred)  		return -ENOMEM; -	keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, -			     (KEY_POS_ALL & ~KEY_POS_SETATTR) | -			     KEY_USR_VIEW | KEY_USR_READ, -			     KEY_ALLOC_NOT_IN_QUOTA); +	keyring = keyring_alloc(".id_resolver", +				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, +				(KEY_POS_ALL & ~KEY_POS_SETATTR) | +				KEY_USR_VIEW | KEY_USR_READ, +				KEY_ALLOC_NOT_IN_QUOTA, NULL);  	if (IS_ERR(keyring)) {  		ret = PTR_ERR(keyring);  		goto failed_put_cred;  	} -	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); +	ret = register_key_type(&key_type_id_resolver);  	if (ret < 0)  		goto failed_put_key; -	ret = 
register_key_type(&key_type_id_resolver); +	ret = register_key_type(&key_type_id_resolver_legacy);  	if (ret < 0) -		goto failed_put_key; +		goto failed_reg_legacy; +	set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);  	cred->thread_keyring = keyring;  	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;  	id_resolver_cache = cred;  	return 0; +failed_reg_legacy: +	unregister_key_type(&key_type_id_resolver);  failed_put_key:  	key_put(keyring);  failed_put_cred: @@ -102,10 +228,11 @@ failed_put_cred:  	return ret;  } -void nfs_idmap_quit(void) +static void nfs_idmap_quit_keyring(void)  {  	key_revoke(id_resolver_cache->thread_keyring);  	unregister_key_type(&key_type_id_resolver); +	unregister_key_type(&key_type_id_resolver_legacy);  	put_cred(id_resolver_cache);  } @@ -137,23 +264,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,  	return desclen;  } -static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, -		const char *type, void *data, size_t data_size) +static struct key *nfs_idmap_request_key(const char *name, size_t namelen, +					 const char *type, struct idmap *idmap)  { -	const struct cred *saved_cred; -	struct key *rkey;  	char *desc; -	struct user_key_payload *payload; +	struct key *rkey;  	ssize_t ret;  	ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);  	if (ret <= 0) -		goto out; +		return ERR_PTR(ret); -	saved_cred = override_creds(id_resolver_cache);  	rkey = request_key(&key_type_id_resolver, desc, ""); -	revert_creds(saved_cred); +	if (IS_ERR(rkey)) { +		mutex_lock(&idmap->idmap_mutex); +		rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, +						desc, "", 0, idmap); +		mutex_unlock(&idmap->idmap_mutex); +	} +  	kfree(desc); +	return rkey; +} + +static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, +				 const char *type, void *data, +				 size_t data_size, struct idmap *idmap) +{ +	const struct cred *saved_cred; +	struct key *rkey; +	struct user_key_payload 
*payload; +	ssize_t ret; + +	saved_cred = override_creds(id_resolver_cache); +	rkey = nfs_idmap_request_key(name, namelen, type, idmap); +	revert_creds(saved_cred); +  	if (IS_ERR(rkey)) {  		ret = PTR_ERR(rkey);  		goto out; @@ -166,7 +312,7 @@ static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,  	if (ret < 0)  		goto out_up; -	payload = rcu_dereference(rkey->payload.data); +	payload = rcu_dereference(rkey->payload.rcudata);  	if (IS_ERR_OR_NULL(payload)) {  		ret = PTR_ERR(payload);  		goto out_up; @@ -185,162 +331,145 @@ out:  	return ret;  } -  /* ID -> Name */ -static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, +				     size_t buflen, struct idmap *idmap)  {  	char id_str[NFS_UINT_MAXLEN];  	int id_len;  	ssize_t ret;  	id_len = snprintf(id_str, sizeof(id_str), "%u", id); -	ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); +	ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap);  	if (ret < 0)  		return -EINVAL;  	return ret;  }  /* Name -> ID */ -static int nfs_idmap_lookup_id(const char *name, size_t namelen, -				const char *type, __u32 *id) +static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, +			       __u32 *id, struct idmap *idmap)  {  	char id_str[NFS_UINT_MAXLEN];  	long id_long;  	ssize_t data_size;  	int ret = 0; -	data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); +	data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap);  	if (data_size <= 0) {  		ret = -EINVAL;  	} else { -		ret = strict_strtol(id_str, 10, &id_long); +		ret = kstrtol(id_str, 10, &id_long);  		*id = (__u32)id_long;  	}  	return ret;  } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) -{ -	return nfs_idmap_lookup_id(name, namelen, "uid", uid); -} - -int nfs_map_group_to_gid(struct 
nfs_client *clp, const char *name, size_t namelen, __u32 *gid) -{ -	return nfs_idmap_lookup_id(name, namelen, "gid", gid); -} - -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) -{ -	return nfs_idmap_lookup_name(uid, "user", buf, buflen); -} -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) -{ -	return nfs_idmap_lookup_name(gid, "group", buf, buflen); -} - -#else  /* CONFIG_NFS_USE_IDMAPPER not defined */ - -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/sched.h> - -#include <linux/sunrpc/clnt.h> -#include <linux/workqueue.h> -#include <linux/sunrpc/rpc_pipe_fs.h> - -#include <linux/nfs_fs.h> +/* idmap classic begins here */ -#include <linux/nfs_idmap.h> -#include "nfs4_fs.h" - -#define IDMAP_HASH_SZ          128 - -/* Default cache timeout is 10 minutes */ -unsigned int nfs_idmap_cache_timeout = 600 * HZ; - -static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) -{ -	char *endp; -	int num = simple_strtol(val, &endp, 0); -	int jif = num * HZ; -	if (endp == val || *endp || num < 0 || jif < num) -		return -EINVAL; -	*((int *)kp->arg) = jif; -	return 0; -} - -module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, -		 &nfs_idmap_cache_timeout, 0644); - -struct idmap_hashent { -	unsigned long		ih_expires; -	__u32			ih_id; -	size_t			ih_namelen; -	char			ih_name[IDMAP_NAMESZ]; +enum { +	Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err  }; -struct idmap_hashtable { -	__u8			h_type; -	struct idmap_hashent	h_entries[IDMAP_HASH_SZ]; +static const match_table_t nfs_idmap_tokens = { +	{ Opt_find_uid, "uid:%s" }, +	{ Opt_find_gid, "gid:%s" }, +	{ Opt_find_user, "user:%s" }, +	{ Opt_find_group, "group:%s" }, +	{ Opt_find_err, NULL }  }; -struct idmap { -	struct dentry		*idmap_dentry; -	
wait_queue_head_t	idmap_wq; -	struct idmap_msg	idmap_im; -	struct mutex		idmap_lock;	/* Serializes upcalls */ -	struct mutex		idmap_im_lock;	/* Protects the hashtable */ -	struct idmap_hashtable	idmap_user_hash; -	struct idmap_hashtable	idmap_group_hash; -}; - -static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, -				 char __user *, size_t); +static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);  static ssize_t idmap_pipe_downcall(struct file *, const char __user *,  				   size_t); +static void idmap_release_pipe(struct inode *);  static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); -static unsigned int fnvhash32(const void *, size_t); -  static const struct rpc_pipe_ops idmap_upcall_ops = { -	.upcall		= idmap_pipe_upcall, +	.upcall		= rpc_pipe_generic_upcall,  	.downcall	= idmap_pipe_downcall, +	.release_pipe	= idmap_release_pipe,  	.destroy_msg	= idmap_pipe_destroy_msg,  }; +static struct key_type key_type_id_resolver_legacy = { +	.name		= "id_legacy", +	.instantiate	= user_instantiate, +	.match		= user_match, +	.revoke		= user_revoke, +	.destroy	= user_destroy, +	.describe	= user_describe, +	.read		= user_read, +	.request_key	= nfs_idmap_legacy_upcall, +}; + +static void nfs_idmap_pipe_destroy(struct dentry *dir, +		struct rpc_pipe_dir_object *pdo) +{ +	struct idmap *idmap = pdo->pdo_data; +	struct rpc_pipe *pipe = idmap->idmap_pipe; + +	if (pipe->dentry) { +		rpc_unlink(pipe->dentry); +		pipe->dentry = NULL; +	} +} + +static int nfs_idmap_pipe_create(struct dentry *dir, +		struct rpc_pipe_dir_object *pdo) +{ +	struct idmap *idmap = pdo->pdo_data; +	struct rpc_pipe *pipe = idmap->idmap_pipe; +	struct dentry *dentry; + +	dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); +	if (IS_ERR(dentry)) +		return PTR_ERR(dentry); +	pipe->dentry = dentry; +	return 0; +} + +static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { +	.create = nfs_idmap_pipe_create, +	.destroy = 
nfs_idmap_pipe_destroy, +}; +  int  nfs_idmap_new(struct nfs_client *clp)  {  	struct idmap *idmap; +	struct rpc_pipe *pipe;  	int error; -	BUG_ON(clp->cl_idmap != NULL); -  	idmap = kzalloc(sizeof(*idmap), GFP_KERNEL);  	if (idmap == NULL)  		return -ENOMEM; -	idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, -			"idmap", idmap, &idmap_upcall_ops, 0); -	if (IS_ERR(idmap->idmap_dentry)) { -		error = PTR_ERR(idmap->idmap_dentry); -		kfree(idmap); -		return error; +	rpc_init_pipe_dir_object(&idmap->idmap_pdo, +			&nfs_idmap_pipe_dir_object_ops, +			idmap); + +	pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); +	if (IS_ERR(pipe)) { +		error = PTR_ERR(pipe); +		goto err;  	} +	idmap->idmap_pipe = pipe; +	mutex_init(&idmap->idmap_mutex); -	mutex_init(&idmap->idmap_lock); -	mutex_init(&idmap->idmap_im_lock); -	init_waitqueue_head(&idmap->idmap_wq); -	idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; -	idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; +	error = rpc_add_pipe_dir_object(clp->cl_net, +			&clp->cl_rpcclient->cl_pipedir_objects, +			&idmap->idmap_pdo); +	if (error) +		goto err_destroy_pipe;  	clp->cl_idmap = idmap;  	return 0; +err_destroy_pipe: +	rpc_destroy_pipe_data(idmap->idmap_pipe); +err: +	kfree(idmap); +	return error;  }  void @@ -350,376 +479,309 @@ nfs_idmap_delete(struct nfs_client *clp)  	if (!idmap)  		return; -	rpc_unlink(idmap->idmap_dentry);  	clp->cl_idmap = NULL; +	rpc_remove_pipe_dir_object(clp->cl_net, +			&clp->cl_rpcclient->cl_pipedir_objects, +			&idmap->idmap_pdo); +	rpc_destroy_pipe_data(idmap->idmap_pipe);  	kfree(idmap);  } -/* - * Helper routines for manipulating the hashtable - */ -static inline struct idmap_hashent * -idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len) +int nfs_idmap_init(void)  { -	return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ]; +	int ret; +	ret = nfs_idmap_init_keyring(); +	if (ret != 0) +		goto out; +out: +	return ret;  } -static struct idmap_hashent * 
-idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) +void nfs_idmap_quit(void)  { -	struct idmap_hashent *he = idmap_name_hash(h, name, len); - -	if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) -		return NULL; -	if (time_after(jiffies, he->ih_expires)) -		return NULL; -	return he; +	nfs_idmap_quit_keyring();  } -static inline struct idmap_hashent * -idmap_id_hash(struct idmap_hashtable* h, __u32 id) +static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, +				     struct idmap_msg *im, +				     struct rpc_pipe_msg *msg)  { -	return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; -} +	substring_t substr; +	int token, ret; + +	im->im_type = IDMAP_TYPE_GROUP; +	token = match_token(desc, nfs_idmap_tokens, &substr); + +	switch (token) { +	case Opt_find_uid: +		im->im_type = IDMAP_TYPE_USER; +	case Opt_find_gid: +		im->im_conv = IDMAP_CONV_NAMETOID; +		ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); +		break; -static struct idmap_hashent * -idmap_lookup_id(struct idmap_hashtable *h, __u32 id) -{ -	struct idmap_hashent *he = idmap_id_hash(h, id); -	if (he->ih_id != id || he->ih_namelen == 0) -		return NULL; -	if (time_after(jiffies, he->ih_expires)) -		return NULL; -	return he; +	case Opt_find_user: +		im->im_type = IDMAP_TYPE_USER; +	case Opt_find_group: +		im->im_conv = IDMAP_CONV_IDTONAME; +		ret = match_int(&substr, &im->im_id); +		break; + +	default: +		ret = -EINVAL; +		goto out; +	} + +	msg->data = im; +	msg->len  = sizeof(struct idmap_msg); + +out: +	return ret;  } -/* - * Routines for allocating new entries in the hashtable. - * For now, we just have 1 entry per bucket, so it's all - * pretty trivial. 
- */ -static inline struct idmap_hashent * -idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len) +static bool +nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, +		struct idmap_legacy_upcalldata *data)  { -	return idmap_name_hash(h, name, len); +	if (idmap->idmap_upcall_data != NULL) { +		WARN_ON_ONCE(1); +		return false; +	} +	idmap->idmap_upcall_data = data; +	return true;  } -static inline struct idmap_hashent * -idmap_alloc_id(struct idmap_hashtable *h, __u32 id) +static void +nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)  { -	return idmap_id_hash(h, id); +	struct key_construction *cons = idmap->idmap_upcall_data->key_cons; + +	kfree(idmap->idmap_upcall_data); +	idmap->idmap_upcall_data = NULL; +	complete_request_key(cons, ret);  }  static void -idmap_update_entry(struct idmap_hashent *he, const char *name, -		size_t namelen, __u32 id) +nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)  { -	he->ih_id = id; -	memcpy(he->ih_name, name, namelen); -	he->ih_name[namelen] = '\0'; -	he->ih_namelen = namelen; -	he->ih_expires = jiffies + nfs_idmap_cache_timeout; +	if (idmap->idmap_upcall_data != NULL) +		nfs_idmap_complete_pipe_upcall_locked(idmap, ret);  } -/* - * Name -> ID - */ -static int -nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h, -		const char *name, size_t namelen, __u32 *id) +static int nfs_idmap_legacy_upcall(struct key_construction *cons, +				   const char *op, +				   void *aux)  { -	struct rpc_pipe_msg msg; +	struct idmap_legacy_upcalldata *data; +	struct rpc_pipe_msg *msg;  	struct idmap_msg *im; -	struct idmap_hashent *he; -	DECLARE_WAITQUEUE(wq, current); -	int ret = -EIO; - -	im = &idmap->idmap_im; - -	/* -	 * String sanity checks -	 * Note that the userland daemon expects NUL terminated strings -	 */ -	for (;;) { -		if (namelen == 0) -			return -EINVAL; -		if (name[namelen-1] != '\0') -			break; -		namelen--; -	} -	if (namelen >= IDMAP_NAMESZ) -		return -EINVAL; - -	
mutex_lock(&idmap->idmap_lock); -	mutex_lock(&idmap->idmap_im_lock); - -	he = idmap_lookup_name(h, name, namelen); -	if (he != NULL) { -		*id = he->ih_id; -		ret = 0; -		goto out; -	} - -	memset(im, 0, sizeof(*im)); -	memcpy(im->im_name, name, namelen); +	struct idmap *idmap = (struct idmap *)aux; +	struct key *key = cons->key; +	int ret = -ENOMEM; -	im->im_type = h->h_type; -	im->im_conv = IDMAP_CONV_NAMETOID; +	/* msg and im are freed in idmap_pipe_destroy_msg */ +	data = kzalloc(sizeof(*data), GFP_KERNEL); +	if (!data) +		goto out1; -	memset(&msg, 0, sizeof(msg)); -	msg.data = im; -	msg.len = sizeof(*im); +	msg = &data->pipe_msg; +	im = &data->idmap_msg; +	data->idmap = idmap; +	data->key_cons = cons; -	add_wait_queue(&idmap->idmap_wq, &wq); -	if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { -		remove_wait_queue(&idmap->idmap_wq, &wq); -		goto out; -	} +	ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); +	if (ret < 0) +		goto out2; -	set_current_state(TASK_UNINTERRUPTIBLE); -	mutex_unlock(&idmap->idmap_im_lock); -	schedule(); -	__set_current_state(TASK_RUNNING); -	remove_wait_queue(&idmap->idmap_wq, &wq); -	mutex_lock(&idmap->idmap_im_lock); +	ret = -EAGAIN; +	if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) +		goto out2; -	if (im->im_status & IDMAP_STATUS_SUCCESS) { -		*id = im->im_id; -		ret = 0; -	} +	ret = rpc_queue_upcall(idmap->idmap_pipe, msg); +	if (ret < 0) +		nfs_idmap_abort_pipe_upcall(idmap, ret); - out: -	memset(im, 0, sizeof(*im)); -	mutex_unlock(&idmap->idmap_im_lock); -	mutex_unlock(&idmap->idmap_lock); +	return ret; +out2: +	kfree(data); +out1: +	complete_request_key(cons, ret);  	return ret;  } -/* - * ID -> Name - */ -static int -nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, -		__u32 id, char *name) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen)  { -	struct rpc_pipe_msg msg; -	struct idmap_msg *im; -	struct idmap_hashent *he; -	
DECLARE_WAITQUEUE(wq, current); -	int ret = -EIO; -	unsigned int len; - -	im = &idmap->idmap_im; - -	mutex_lock(&idmap->idmap_lock); -	mutex_lock(&idmap->idmap_im_lock); - -	he = idmap_lookup_id(h, id); -	if (he) { -		memcpy(name, he->ih_name, he->ih_namelen); -		ret = he->ih_namelen; -		goto out; -	} - -	memset(im, 0, sizeof(*im)); -	im->im_type = h->h_type; -	im->im_conv = IDMAP_CONV_IDTONAME; -	im->im_id = id; - -	memset(&msg, 0, sizeof(msg)); -	msg.data = im; -	msg.len = sizeof(*im); - -	add_wait_queue(&idmap->idmap_wq, &wq); - -	if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { -		remove_wait_queue(&idmap->idmap_wq, &wq); -		goto out; -	} - -	set_current_state(TASK_UNINTERRUPTIBLE); -	mutex_unlock(&idmap->idmap_im_lock); -	schedule(); -	__set_current_state(TASK_RUNNING); -	remove_wait_queue(&idmap->idmap_wq, &wq); -	mutex_lock(&idmap->idmap_im_lock); - -	if (im->im_status & IDMAP_STATUS_SUCCESS) { -		if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) -			goto out; -		memcpy(name, im->im_name, len); -		ret = len; -	} - - out: -	memset(im, 0, sizeof(*im)); -	mutex_unlock(&idmap->idmap_im_lock); -	mutex_unlock(&idmap->idmap_lock); -	return ret; +	return key_instantiate_and_link(key, data, datalen, +					id_resolver_cache->thread_keyring, +					authkey);  } -/* RPC pipefs upcall/downcall routines */ -static ssize_t -idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, -		  char __user *dst, size_t buflen) +static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, +		struct idmap_msg *upcall, +		struct key *key, struct key *authkey)  { -	char *data = (char *)msg->data + msg->copied; -	size_t mlen = min(msg->len, buflen); -	unsigned long left; +	char id_str[NFS_UINT_MAXLEN]; +	size_t len; +	int ret = -ENOKEY; -	left = copy_to_user(dst, data, mlen); -	if (left == mlen) { -		msg->errno = -EFAULT; -		return -EFAULT; +	/* ret = -ENOKEY */ +	if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) +		goto out; +	switch 
(im->im_conv) { +	case IDMAP_CONV_NAMETOID: +		if (strcmp(upcall->im_name, im->im_name) != 0) +			break; +		/* Note: here we store the NUL terminator too */ +		len = sprintf(id_str, "%d", im->im_id) + 1; +		ret = nfs_idmap_instantiate(key, authkey, id_str, len); +		break; +	case IDMAP_CONV_IDTONAME: +		if (upcall->im_id != im->im_id) +			break; +		len = strlen(im->im_name); +		ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); +		break; +	default: +		ret = -EINVAL;  	} - -	mlen -= left; -	msg->copied += mlen; -	msg->errno = 0; -	return mlen; +out: +	return ret;  }  static ssize_t  idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)  { -	struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); +	struct rpc_inode *rpci = RPC_I(file_inode(filp));  	struct idmap *idmap = (struct idmap *)rpci->private; -	struct idmap_msg im_in, *im = &idmap->idmap_im; -	struct idmap_hashtable *h; -	struct idmap_hashent *he = NULL; +	struct key_construction *cons; +	struct idmap_msg im;  	size_t namelen_in; -	int ret; +	int ret = -ENOKEY; -	if (mlen != sizeof(im_in)) -		return -ENOSPC; - -	if (copy_from_user(&im_in, src, mlen) != 0) -		return -EFAULT; +	/* If instantiation is successful, anyone waiting for key construction +	 * will have been woken up and someone else may now have used +	 * idmap_key_cons - so after this point we may no longer touch it. 
+	 */ +	if (idmap->idmap_upcall_data == NULL) +		goto out_noupcall; -	mutex_lock(&idmap->idmap_im_lock); +	cons = idmap->idmap_upcall_data->key_cons; -	ret = mlen; -	im->im_status = im_in.im_status; -	/* If we got an error, terminate now, and wake up pending upcalls */ -	if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) { -		wake_up(&idmap->idmap_wq); +	if (mlen != sizeof(im)) { +		ret = -ENOSPC;  		goto out;  	} -	/* Sanity checking of strings */ -	ret = -EINVAL; -	namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ); -	if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) +	if (copy_from_user(&im, src, mlen) != 0) { +		ret = -EFAULT;  		goto out; +	} -	switch (im_in.im_type) { -		case IDMAP_TYPE_USER: -			h = &idmap->idmap_user_hash; -			break; -		case IDMAP_TYPE_GROUP: -			h = &idmap->idmap_group_hash; -			break; -		default: -			goto out; +	if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { +		ret = -ENOKEY; +		goto out;  	} -	switch (im_in.im_conv) { -	case IDMAP_CONV_IDTONAME: -		/* Did we match the current upcall? */ -		if (im->im_conv == IDMAP_CONV_IDTONAME -				&& im->im_type == im_in.im_type -				&& im->im_id == im_in.im_id) { -			/* Yes: copy string, including the terminating '\0'  */ -			memcpy(im->im_name, im_in.im_name, namelen_in); -			im->im_name[namelen_in] = '\0'; -			wake_up(&idmap->idmap_wq); -		} -		he = idmap_alloc_id(h, im_in.im_id); -		break; -	case IDMAP_CONV_NAMETOID: -		/* Did we match the current upcall? 
*/ -		if (im->im_conv == IDMAP_CONV_NAMETOID -				&& im->im_type == im_in.im_type -				&& strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in -				&& memcmp(im->im_name, im_in.im_name, namelen_in) == 0) { -			im->im_id = im_in.im_id; -			wake_up(&idmap->idmap_wq); -		} -		he = idmap_alloc_name(h, im_in.im_name, namelen_in); -		break; -	default: +	namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); +	if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { +		ret = -EINVAL;  		goto out; +} + +	ret = nfs_idmap_read_and_verify_message(&im, +			&idmap->idmap_upcall_data->idmap_msg, +			cons->key, cons->authkey); +	if (ret >= 0) { +		key_set_timeout(cons->key, nfs_idmap_cache_timeout); +		ret = mlen;  	} -	/* If the entry is valid, also copy it to the cache */ -	if (he != NULL) -		idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); -	ret = mlen;  out: -	mutex_unlock(&idmap->idmap_im_lock); +	nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +out_noupcall:  	return ret;  }  static void  idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)  { -	struct idmap_msg *im = msg->data; -	struct idmap *idmap = container_of(im, struct idmap, idmap_im);  +	struct idmap_legacy_upcalldata *data = container_of(msg, +			struct idmap_legacy_upcalldata, +			pipe_msg); +	struct idmap *idmap = data->idmap; -	if (msg->errno >= 0) -		return; -	mutex_lock(&idmap->idmap_im_lock); -	im->im_status = IDMAP_STATUS_LOOKUPFAIL; -	wake_up(&idmap->idmap_wq); -	mutex_unlock(&idmap->idmap_im_lock); +	if (msg->errno) +		nfs_idmap_abort_pipe_upcall(idmap, msg->errno);  } -/*  - * Fowler/Noll/Vo hash - *    http://www.isthe.com/chongo/tech/comp/fnv/ - */ - -#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */ -#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */ - -static unsigned int fnvhash32(const void *buf, size_t buflen) +static void +idmap_release_pipe(struct inode *inode)  { -	const unsigned char *p, *end = (const unsigned char *)buf + buflen; -	unsigned int hash = FNV_1_32; - -	for (p = 
buf; p < end; p++) { -		hash *= FNV_P_32; -		hash ^= (unsigned int)*p; -	} +	struct rpc_inode *rpci = RPC_I(inode); +	struct idmap *idmap = (struct idmap *)rpci->private; -	return hash; +	nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);  } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)  { -	struct idmap *idmap = clp->cl_idmap; +	struct idmap *idmap = server->nfs_client->cl_idmap; +	__u32 id = -1; +	int ret = 0; -	return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); +	if (!nfs_map_string_to_numeric(name, namelen, &id)) +		ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); +	if (ret == 0) { +		*uid = make_kuid(&init_user_ns, id); +		if (!uid_valid(*uid)) +			ret = -ERANGE; +	} +	trace_nfs4_map_name_to_uid(name, namelen, id, ret); +	return ret;  } -int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid)  { -	struct idmap *idmap = clp->cl_idmap; +	struct idmap *idmap = server->nfs_client->cl_idmap; +	__u32 id = -1; +	int ret = 0; -	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); +	if (!nfs_map_string_to_numeric(name, namelen, &id)) +		ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); +	if (ret == 0) { +		*gid = make_kgid(&init_user_ns, id); +		if (!gid_valid(*gid)) +			ret = -ERANGE; +	} +	trace_nfs4_map_group_to_gid(name, namelen, id, ret); +	return ret;  } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen)  { -	struct idmap *idmap = clp->cl_idmap; +	struct idmap *idmap = server->nfs_client->cl_idmap; +	int ret = -EINVAL; +	__u32 id; -	return nfs_idmap_name(idmap, 
&idmap->idmap_user_hash, uid, buf); +	id = from_kuid(&init_user_ns, uid); +	if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) +		ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); +	if (ret < 0) +		ret = nfs_map_numeric_to_string(id, buf, buflen); +	trace_nfs4_map_uid_to_name(buf, ret, id, ret); +	return ret;  } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)  { -	struct idmap *idmap = clp->cl_idmap; +	struct idmap *idmap = server->nfs_client->cl_idmap; +	int ret = -EINVAL; +	__u32 id; -	return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); +	id = from_kgid(&init_user_ns, gid); +	if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) +		ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); +	if (ret < 0) +		ret = nfs_map_numeric_to_string(id, buf, buflen); +	trace_nfs4_map_gid_to_group(buf, ret, id, ret); +	return ret;  } - -#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 314f5716460..9927913c97c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -32,13 +32,13 @@  #include <linux/lockd/bind.h>  #include <linux/seq_file.h>  #include <linux/mount.h> -#include <linux/nfs_idmap.h>  #include <linux/vfs.h>  #include <linux/inet.h>  #include <linux/nfs_xdr.h>  #include <linux/slab.h> +#include <linux/compat.h> +#include <linux/freezer.h> -#include <asm/system.h>  #include <asm/uaccess.h>  #include "nfs4_fs.h" @@ -47,15 +47,18 @@  #include "iostat.h"  #include "internal.h"  #include "fscache.h" -#include "dns_resolve.h"  #include "pnfs.h" +#include "nfs.h" +#include "netns.h" + +#include "nfstrace.h"  #define NFSDBG_FACILITY		NFSDBG_VFS  #define NFS_64_BIT_INODE_NUMBERS_ENABLED	1  /* Default is to see 64-bit inode numbers */ -static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; +static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;  static void 
nfs_invalidate_inode(struct inode *);  static int nfs_update_inode(struct inode *, struct nfs_fattr *); @@ -76,9 +79,10 @@ int nfs_wait_bit_killable(void *word)  {  	if (fatal_signal_pending(current))  		return -ERESTARTSYS; -	schedule(); +	freezable_schedule_unsafe();  	return 0;  } +EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);  /**   * nfs_compat_user_ino64 - returns the user-visible inode number @@ -89,7 +93,11 @@ int nfs_wait_bit_killable(void *word)   */  u64 nfs_compat_user_ino64(u64 fileid)  { -	int ino; +#ifdef CONFIG_COMPAT +	compat_ulong_t ino; +#else	 +	unsigned long ino; +#endif  	if (enable_ino64)  		return fileid; @@ -99,22 +107,29 @@ u64 nfs_compat_user_ino64(u64 fileid)  	return ino;  } -static void nfs_clear_inode(struct inode *inode) +int nfs_drop_inode(struct inode *inode) +{ +	return NFS_STALE(inode) || generic_drop_inode(inode); +} +EXPORT_SYMBOL_GPL(nfs_drop_inode); + +void nfs_clear_inode(struct inode *inode)  {  	/*  	 * The following should never happen...  	 */ -	BUG_ON(nfs_have_writebacks(inode)); -	BUG_ON(!list_empty(&NFS_I(inode)->open_files)); +	WARN_ON_ONCE(nfs_have_writebacks(inode)); +	WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));  	nfs_zap_acl_cache(inode);  	nfs_access_zap_cache(inode); -	nfs_fscache_release_inode_cookie(inode); +	nfs_fscache_clear_inode(inode);  } +EXPORT_SYMBOL_GPL(nfs_clear_inode);  void nfs_evict_inode(struct inode *inode)  { -	truncate_inode_pages(&inode->i_data, 0); -	end_writeback(inode); +	truncate_inode_pages_final(&inode->i_data); +	clear_inode(inode);  	nfs_clear_inode(inode);  } @@ -132,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping)  	return ret;  } +static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +{ +	struct nfs_inode *nfsi = NFS_I(inode); + +	if (inode->i_mapping->nrpages == 0) +		flags &= ~NFS_INO_INVALID_DATA; +	nfsi->cache_validity |= flags; +	if (flags & NFS_INO_INVALID_DATA) +		nfs_fscache_invalidate(inode); +} +  /*   * Invalidate the local 
caches   */ @@ -145,11 +171,19 @@ static void nfs_zap_caches_locked(struct inode *inode)  	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);  	nfsi->attrtimeo_timestamp = jiffies; -	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); -	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) -		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; -	else -		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; +	memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); +	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { +		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR +					| NFS_INO_INVALID_DATA +					| NFS_INO_INVALID_ACCESS +					| NFS_INO_INVALID_ACL +					| NFS_INO_REVAL_PAGECACHE); +	} else +		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR +					| NFS_INO_INVALID_ACCESS +					| NFS_INO_INVALID_ACL +					| NFS_INO_REVAL_PAGECACHE); +	nfs_zap_label_cache_locked(nfsi);  }  void nfs_zap_caches(struct inode *inode) @@ -163,7 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)  {  	if (mapping->nrpages != 0) {  		spin_lock(&inode->i_lock); -		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; +		nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);  		spin_unlock(&inode->i_lock);  	}  } @@ -179,13 +213,15 @@ void nfs_zap_acl_cache(struct inode *inode)  	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL;  	spin_unlock(&inode->i_lock);  } +EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);  void nfs_invalidate_atime(struct inode *inode)  {  	spin_lock(&inode->i_lock); -	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; +	nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);  	spin_unlock(&inode->i_lock);  } +EXPORT_SYMBOL_GPL(nfs_invalidate_atime);  /*   * Invalidate, but do not unhash, the inode. 
@@ -217,6 +253,8 @@ nfs_find_actor(struct inode *inode, void *opaque)  	if (NFS_FILEID(inode) != fattr->fileid)  		return 0; +	if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) +		return 0;  	if (nfs_compare_fh(NFS_FH(inode), fh))  		return 0;  	if (is_bad_inode(inode) || NFS_STALE(inode)) @@ -235,12 +273,74 @@ nfs_init_locked(struct inode *inode, void *opaque)  	return 0;  } +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static void nfs_clear_label_invalid(struct inode *inode) +{ +	spin_lock(&inode->i_lock); +	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; +	spin_unlock(&inode->i_lock); +} + +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, +					struct nfs4_label *label) +{ +	int error; + +	if (label == NULL) +		return; + +	if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { +		error = security_inode_notifysecctx(inode, label->label, +				label->len); +		if (error) +			printk(KERN_ERR "%s() %s %d " +					"security_inode_notifysecctx() %d\n", +					__func__, +					(char *)label->label, +					label->len, error); +		nfs_clear_label_invalid(inode); +	} +} + +struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) +{ +	struct nfs4_label *label = NULL; +	int minor_version = server->nfs_client->cl_minorversion; + +	if (minor_version < 2) +		return label; + +	if (!(server->caps & NFS_CAP_SECURITY_LABEL)) +		return label; + +	label = kzalloc(sizeof(struct nfs4_label), flags); +	if (label == NULL) +		return ERR_PTR(-ENOMEM); + +	label->label = kzalloc(NFS4_MAXLABELLEN, flags); +	if (label->label == NULL) { +		kfree(label); +		return ERR_PTR(-ENOMEM); +	} +	label->len = NFS4_MAXLABELLEN; + +	return label; +} +EXPORT_SYMBOL_GPL(nfs4_label_alloc); +#else +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, +					struct nfs4_label *label) +{ +} +#endif +EXPORT_SYMBOL_GPL(nfs_setsecurity); +  /*   * This is our front-end to iget that looks up inodes by file handle   * instead of inode number.   
*/  struct inode * -nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)  {  	struct nfs_find_desc desc = {  		.fh	= fh, @@ -249,7 +349,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)  	struct inode *inode = ERR_PTR(-ENOENT);  	unsigned long hash; -	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) +	nfs_attr_check_mountpoint(sb, fattr); + +	if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && +	    !nfs_attr_use_mounted_on_fileid(fattr))  		goto out_no_inode;  	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)  		goto out_no_inode; @@ -275,31 +378,28 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)  		inode->i_mode = fattr->mode;  		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0  				&& nfs_server_capable(inode, NFS_CAP_MODE)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_ACCESS -				| NFS_INO_INVALID_ACL; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		/* Why so? Because we want revalidate for devices/FIFOs, and  		 * that's precisely what we have in nfs_file_inode_operations.  		 
*/  		inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;  		if (S_ISREG(inode->i_mode)) { -			inode->i_fop = &nfs_file_operations; +			inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;  			inode->i_data.a_ops = &nfs_file_aops;  			inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;  		} else if (S_ISDIR(inode->i_mode)) {  			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;  			inode->i_fop = &nfs_dir_operations; -			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) -				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); +			inode->i_data.a_ops = &nfs_dir_aops;  			/* Deal with crossing mountpoints */ -			if ((fattr->valid & NFS_ATTR_FATTR_FSID) -					&& !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { +			if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || +					fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {  				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)  					inode->i_op = &nfs_referral_inode_operations;  				else  					inode->i_op = &nfs_mountpoint_inode_operations;  				inode->i_fop = NULL; -				set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); +				inode->i_flags |= S_AUTOMOUNT;  			}  		} else if (S_ISLNK(inode->i_mode))  			inode->i_op = &nfs_symlink_inode_operations; @@ -309,58 +409,51 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)  		memset(&inode->i_atime, 0, sizeof(inode->i_atime));  		memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));  		memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); -		nfsi->change_attr = 0; +		inode->i_version = 0;  		inode->i_size = 0; -		inode->i_nlink = 0; -		inode->i_uid = -2; -		inode->i_gid = -2; +		clear_nlink(inode); +		inode->i_uid = make_kuid(&init_user_ns, -2); +		inode->i_gid = make_kgid(&init_user_ns, -2);  		inode->i_blocks = 0;  		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); +		nfsi->write_io = 0; +		nfsi->read_io = 0;  		nfsi->read_cache_jiffies = fattr->time_start;  		nfsi->attr_gencount = fattr->gencount;  		if (fattr->valid 
& NFS_ATTR_FATTR_ATIME)  			inode->i_atime = fattr->atime;  		else if (nfs_server_capable(inode, NFS_CAP_ATIME)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_MTIME)  			inode->i_mtime = fattr->mtime;  		else if (nfs_server_capable(inode, NFS_CAP_MTIME)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_DATA; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_CTIME)  			inode->i_ctime = fattr->ctime;  		else if (nfs_server_capable(inode, NFS_CAP_CTIME)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_ACCESS -				| NFS_INO_INVALID_ACL; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_CHANGE) -			nfsi->change_attr = fattr->change_attr; +			inode->i_version = fattr->change_attr;  		else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_DATA; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_SIZE)  			inode->i_size = nfs_size_to_loff_t(fattr->size);  		else -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_DATA -				| NFS_INO_REVAL_PAGECACHE; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR +				| NFS_INO_REVAL_PAGECACHE);  		if (fattr->valid & NFS_ATTR_FATTR_NLINK) -			inode->i_nlink = fattr->nlink; +			set_nlink(inode, fattr->nlink);  		else if (nfs_server_capable(inode, NFS_CAP_NLINK)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_OWNER)  			inode->i_uid = fattr->uid;  		else if (nfs_server_capable(inode, NFS_CAP_OWNER)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_ACCESS -				| NFS_INO_INVALID_ACL; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & 
NFS_ATTR_FATTR_GROUP)  			inode->i_gid = fattr->gid;  		else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) -			nfsi->cache_validity |= NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_ACCESS -				| NFS_INO_INVALID_ACL; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);  		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)  			inode->i_blocks = fattr->du.nfs2.blocks;  		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -369,18 +462,22 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)  			 */  			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);  		} + +		nfs_setsecurity(inode, fattr, label); +  		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);  		nfsi->attrtimeo_timestamp = now;  		nfsi->access_cache = RB_ROOT; -		nfs_fscache_init_inode_cookie(inode); +		nfs_fscache_init_inode(inode);  		unlock_new_inode(inode);  	} else  		nfs_refresh_inode(inode, fattr); -	dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", +	dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n",  		inode->i_sb->s_id, -		(long long)NFS_FILEID(inode), +		(unsigned long long)NFS_FILEID(inode), +		nfs_display_fhandle_hash(fh),  		atomic_read(&inode->i_count));  out: @@ -390,8 +487,9 @@ out_no_inode:  	dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode));  	goto out;  } +EXPORT_SYMBOL_GPL(nfs_fhget); -#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)  int  nfs_setattr(struct dentry *dentry, struct iattr *attr) @@ -413,12 +511,16 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)  	/* Optimization: if the end result is no change, don't RPC */  	attr->ia_valid &= NFS_VALID_ATTRS; -	if ((attr->ia_valid & ~ATTR_FILE) == 0) +	if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)  		return 0; +	trace_nfs_setattr_enter(inode); +  	/* Write all dirty 
data */ -	if (S_ISREG(inode->i_mode)) +	if (S_ISREG(inode->i_mode)) { +		nfs_inode_dio_wait(inode);  		nfs_wb_all(inode); +	}  	fattr = nfs_alloc_fattr();  	if (fattr == NULL) @@ -427,14 +529,16 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)  	 * Return any delegations if we're going to change ACLs  	 */  	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) -		nfs_inode_return_delegation(inode); +		NFS_PROTO(inode)->return_delegation(inode);  	error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);  	if (error == 0) -		nfs_refresh_inode(inode, fattr); +		error = nfs_refresh_inode(inode, fattr);  	nfs_free_fattr(fattr);  out: +	trace_nfs_setattr_exit(inode, error);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_setattr);  /**   * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall @@ -447,7 +551,6 @@ out:   */  static int nfs_vmtruncate(struct inode * inode, loff_t offset)  { -	loff_t oldsize;  	int err;  	err = inode_newsize_ok(inode, offset); @@ -455,11 +558,13 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)  		goto out;  	spin_lock(&inode->i_lock); -	oldsize = inode->i_size;  	i_size_write(inode, offset); +	/* Optimisation */ +	if (offset == 0) +		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;  	spin_unlock(&inode->i_lock); -	truncate_pagecache(inode, oldsize, offset); +	truncate_pagecache(inode, offset);  out:  	return err;  } @@ -485,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)  			inode->i_uid = attr->ia_uid;  		if ((attr->ia_valid & ATTR_GID) != 0)  			inode->i_gid = attr->ia_gid; -		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; +		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS +				| NFS_INO_INVALID_ACL);  		spin_unlock(&inode->i_lock);  	}  	if ((attr->ia_valid & ATTR_SIZE) != 0) { @@ -493,6 +599,26 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)  		nfs_vmtruncate(inode, attr->ia_size);  	}  } 
+EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); + +static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +{ +	struct dentry *parent; + +	parent = dget_parent(dentry); +	nfs_force_use_readdirplus(parent->d_inode); +	dput(parent); +} + +static bool nfs_need_revalidate_inode(struct inode *inode) +{ +	if (NFS_I(inode)->cache_validity & +			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) +		return true; +	if (nfs_attribute_cache_expired(inode)) +		return true; +	return false; +}  int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)  { @@ -500,8 +626,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)  	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;  	int err; +	trace_nfs_getattr_enter(inode);  	/* Flush out writes to the server in order to update c/mtime.  */  	if (S_ISREG(inode->i_mode)) { +		nfs_inode_dio_wait(inode);  		err = filemap_write_and_wait(inode->i_mapping);  		if (err)  			goto out; @@ -520,45 +648,52 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)   	    ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))  		need_atime = 0; -	if (need_atime) -		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); -	else -		err = nfs_revalidate_inode(NFS_SERVER(inode), inode); +	if (need_atime || nfs_need_revalidate_inode(inode)) { +		struct nfs_server *server = NFS_SERVER(inode); + +		if (server->caps & NFS_CAP_READDIRPLUS) +			nfs_request_parent_use_readdirplus(dentry); +		err = __nfs_revalidate_inode(server, inode); +	}  	if (!err) {  		generic_fillattr(inode, stat);  		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));  	}  out: +	trace_nfs_getattr_exit(inode, err);  	return err;  } +EXPORT_SYMBOL_GPL(nfs_getattr);  static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)  {  	atomic_set(&l_ctx->count, 1); -	l_ctx->lockowner = current->files; -	l_ctx->pid = current->tgid; +	l_ctx->lockowner.l_owner = 
current->files; +	l_ctx->lockowner.l_pid = current->tgid;  	INIT_LIST_HEAD(&l_ctx->list); +	nfs_iocounter_init(&l_ctx->io_count);  }  static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)  { -	struct nfs_lock_context *pos; +	struct nfs_lock_context *head = &ctx->lock_context; +	struct nfs_lock_context *pos = head; -	list_for_each_entry(pos, &ctx->lock_context.list, list) { -		if (pos->lockowner != current->files) +	do { +		if (pos->lockowner.l_owner != current->files)  			continue; -		if (pos->pid != current->tgid) +		if (pos->lockowner.l_pid != current->tgid)  			continue;  		atomic_inc(&pos->count);  		return pos; -	} +	} while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head);  	return NULL;  }  struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)  {  	struct nfs_lock_context *res, *new = NULL; -	struct inode *inode = ctx->path.dentry->d_inode; +	struct inode *inode = ctx->dentry->d_inode;  	spin_lock(&inode->i_lock);  	res = __nfs_find_lock_context(ctx); @@ -566,7 +701,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)  		spin_unlock(&inode->i_lock);  		new = kmalloc(sizeof(*new), GFP_KERNEL);  		if (new == NULL) -			return NULL; +			return ERR_PTR(-ENOMEM);  		nfs_init_lock_context(new);  		spin_lock(&inode->i_lock);  		res = __nfs_find_lock_context(ctx); @@ -585,7 +720,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)  void nfs_put_lock_context(struct nfs_lock_context *l_ctx)  {  	struct nfs_open_context *ctx = l_ctx->open_context; -	struct inode *inode = ctx->path.dentry->d_inode; +	struct inode *inode = ctx->dentry->d_inode;  	if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))  		return; @@ -611,7 +746,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)  		return;  	if (!is_sync)  		return; -	inode = ctx->path.dentry->d_inode; +	inode = ctx->dentry->d_inode;  	if (!list_empty(&NFS_I(inode)->open_files))  
		return;  	server = NFS_SERVER(inode); @@ -619,27 +754,34 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)  		return;  	nfs_revalidate_inode(server, inode);  } +EXPORT_SYMBOL_GPL(nfs_close_context); -struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode) +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode)  {  	struct nfs_open_context *ctx; +	struct rpc_cred *cred = rpc_lookup_cred(); +	if (IS_ERR(cred)) +		return ERR_CAST(cred);  	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); -	if (ctx != NULL) { -		ctx->path = *path; -		path_get(&ctx->path); -		ctx->cred = get_rpccred(cred); -		ctx->state = NULL; -		ctx->mode = f_mode; -		ctx->flags = 0; -		ctx->error = 0; -		ctx->dir_cookie = 0; -		nfs_init_lock_context(&ctx->lock_context); -		ctx->lock_context.open_context = ctx; -		INIT_LIST_HEAD(&ctx->list); +	if (!ctx) { +		put_rpccred(cred); +		return ERR_PTR(-ENOMEM);  	} +	nfs_sb_active(dentry->d_sb); +	ctx->dentry = dget(dentry); +	ctx->cred = cred; +	ctx->state = NULL; +	ctx->mode = f_mode; +	ctx->flags = 0; +	ctx->error = 0; +	nfs_init_lock_context(&ctx->lock_context); +	ctx->lock_context.open_context = ctx; +	INIT_LIST_HEAD(&ctx->list); +	ctx->mdsthreshold = NULL;  	return ctx;  } +EXPORT_SYMBOL_GPL(alloc_nfs_open_context);  struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)  { @@ -647,10 +789,12 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)  		atomic_inc(&ctx->lock_context.count);  	return ctx;  } +EXPORT_SYMBOL_GPL(get_nfs_open_context);  static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)  { -	struct inode *inode = ctx->path.dentry->d_inode; +	struct inode *inode = ctx->dentry->d_inode; +	struct super_block *sb = ctx->dentry->d_sb;  	if (!list_empty(&ctx->list)) {  		if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) @@ -663,7 +807,9 @@ static void 
__put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)  		NFS_PROTO(inode)->close_context(ctx, is_sync);  	if (ctx->cred != NULL)  		put_rpccred(ctx->cred); -	path_put(&ctx->path); +	dput(ctx->dentry); +	nfs_sb_deactive(sb); +	kfree(ctx->mdsthreshold);  	kfree(ctx);  } @@ -671,21 +817,30 @@ void put_nfs_open_context(struct nfs_open_context *ctx)  {  	__put_nfs_open_context(ctx, 0);  } +EXPORT_SYMBOL_GPL(put_nfs_open_context);  /*   * Ensure that mmap has a recent RPC credential for use when writing out   * shared pages   */ -void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +void nfs_inode_attach_open_context(struct nfs_open_context *ctx)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = ctx->dentry->d_inode;  	struct nfs_inode *nfsi = NFS_I(inode); -	filp->private_data = get_nfs_open_context(ctx);  	spin_lock(&inode->i_lock);  	list_add(&ctx->list, &nfsi->open_files);  	spin_unlock(&inode->i_lock);  } +EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); + +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +{ +	filp->private_data = get_nfs_open_context(ctx); +	if (list_empty(&ctx->list)) +		nfs_inode_attach_open_context(ctx); +} +EXPORT_SYMBOL_GPL(nfs_file_set_open_context);  /*   * Given an inode, search for an open context with the desired characteristics @@ -710,10 +865,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c  static void nfs_file_clear_open_context(struct file *filp)  { -	struct inode *inode = filp->f_path.dentry->d_inode;  	struct nfs_open_context *ctx = nfs_file_open_context(filp);  	if (ctx) { +		struct inode *inode = ctx->dentry->d_inode; +  		filp->private_data = NULL;  		spin_lock(&inode->i_lock);  		list_move_tail(&ctx->list, &NFS_I(inode)->open_files); @@ -728,18 +884,13 @@ static void nfs_file_clear_open_context(struct file *filp)  int nfs_open(struct inode *inode, struct file *filp)  {  	struct 
nfs_open_context *ctx; -	struct rpc_cred *cred; -	cred = rpc_lookup_cred(); -	if (IS_ERR(cred)) -		return PTR_ERR(cred); -	ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode); -	put_rpccred(cred); -	if (ctx == NULL) -		return -ENOMEM; +	ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); +	if (IS_ERR(ctx)) +		return PTR_ERR(ctx);  	nfs_file_set_open_context(filp, ctx);  	put_nfs_open_context(ctx); -	nfs_fscache_set_inode_cookie(inode, filp); +	nfs_fscache_open_file(inode, filp);  	return 0;  } @@ -757,11 +908,14 @@ int  __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)  {  	int		 status = -ESTALE; +	struct nfs4_label *label = NULL;  	struct nfs_fattr *fattr = NULL;  	struct nfs_inode *nfsi = NFS_I(inode); -	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", -		inode->i_sb->s_id, (long long)NFS_FILEID(inode)); +	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", +		inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); + +	trace_nfs_revalidate_inode_enter(inode);  	if (is_bad_inode(inode))  		goto out; @@ -774,36 +928,48 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)  		goto out;  	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); -	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + +	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); +	if (IS_ERR(label)) { +		status = PTR_ERR(label); +		goto out; +	} + +	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);  	if (status != 0) { -		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", +		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",  			 inode->i_sb->s_id, -			 (long long)NFS_FILEID(inode), status); +			 (unsigned long long)NFS_FILEID(inode), status);  		if (status == -ESTALE) {  			nfs_zap_caches(inode);  			if (!S_ISDIR(inode->i_mode))  				set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);  		} -		goto out; +		goto err_out;  	}  	status = 
nfs_refresh_inode(inode, fattr);  	if (status) { -		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", +		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",  			 inode->i_sb->s_id, -			 (long long)NFS_FILEID(inode), status); -		goto out; +			 (unsigned long long)NFS_FILEID(inode), status); +		goto err_out;  	}  	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)  		nfs_zap_acl_cache(inode); -	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", +	nfs_setsecurity(inode, fattr, label); + +	dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",  		inode->i_sb->s_id, -		(long long)NFS_FILEID(inode)); +		(unsigned long long)NFS_FILEID(inode)); - out: +err_out: +	nfs4_label_free(label); +out:  	nfs_free_fattr(fattr); +	trace_nfs_revalidate_inode_exit(inode, status);  	return status;  } @@ -814,7 +980,7 @@ int nfs_attribute_timeout(struct inode *inode)  	return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);  } -static int nfs_attribute_cache_expired(struct inode *inode) +int nfs_attribute_cache_expired(struct inode *inode)  {  	if (nfs_have_delegated_attributes(inode))  		return 0; @@ -830,33 +996,50 @@ static int nfs_attribute_cache_expired(struct inode *inode)   */  int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)  { -	if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) -			&& !nfs_attribute_cache_expired(inode)) +	if (!nfs_need_revalidate_inode(inode))  		return NFS_STALE(inode) ? 
-ESTALE : 0;  	return __nfs_revalidate_inode(server, inode);  } +EXPORT_SYMBOL_GPL(nfs_revalidate_inode);  static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)  {  	struct nfs_inode *nfsi = NFS_I(inode); -	 +	int ret; +  	if (mapping->nrpages != 0) { -		int ret = invalidate_inode_pages2(mapping); +		if (S_ISREG(inode->i_mode)) { +			ret = nfs_sync_mapping(mapping); +			if (ret < 0) +				return ret; +		} +		ret = invalidate_inode_pages2(mapping);  		if (ret < 0)  			return ret;  	} -	spin_lock(&inode->i_lock); -	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; -	if (S_ISDIR(inode->i_mode)) +	if (S_ISDIR(inode->i_mode)) { +		spin_lock(&inode->i_lock);  		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); -	spin_unlock(&inode->i_lock); +		spin_unlock(&inode->i_lock); +	}  	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); -	nfs_fscache_reset_inode_cookie(inode); -	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", -			inode->i_sb->s_id, (long long)NFS_FILEID(inode)); +	nfs_fscache_wait_on_invalidate(inode); + +	dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", +			inode->i_sb->s_id, +			(unsigned long long)NFS_FILEID(inode));  	return 0;  } +static bool nfs_mapping_need_revalidate_inode(struct inode *inode) +{ +	if (nfs_have_delegated_attributes(inode)) +		return false; +	return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) +		|| nfs_attribute_timeout(inode) +		|| NFS_STALE(inode); +} +  /**   * nfs_revalidate_mapping - Revalidate the pagecache   * @inode - pointer to host inode @@ -865,50 +1048,100 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map  int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)  {  	struct nfs_inode *nfsi = NFS_I(inode); +	unsigned long *bitlock = &nfsi->flags;  	int ret = 0; -	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) -			|| nfs_attribute_cache_expired(inode) -			|| NFS_STALE(inode)) { +	/* swapfiles are not 
supposed to be shared. */ +	if (IS_SWAPFILE(inode)) +		goto out; + +	if (nfs_mapping_need_revalidate_inode(inode)) {  		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);  		if (ret < 0)  			goto out;  	} -	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) -		ret = nfs_invalidate_mapping(inode, mapping); + +	/* +	 * We must clear NFS_INO_INVALID_DATA first to ensure that +	 * invalidations that come in while we're shooting down the mappings +	 * are respected. But, that leaves a race window where one revalidator +	 * can clear the flag, and then another checks it before the mapping +	 * gets invalidated. Fix that by serializing access to this part of +	 * the function. +	 * +	 * At the same time, we need to allow other tasks to see whether we +	 * might be in the middle of invalidating the pages, so we only set +	 * the bit lock here if it looks like we're going to be doing that. +	 */ +	for (;;) { +		ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, +				  nfs_wait_bit_killable, TASK_KILLABLE); +		if (ret) +			goto out; +		spin_lock(&inode->i_lock); +		if (test_bit(NFS_INO_INVALIDATING, bitlock)) { +			spin_unlock(&inode->i_lock); +			continue; +		} +		if (nfsi->cache_validity & NFS_INO_INVALID_DATA) +			break; +		spin_unlock(&inode->i_lock); +		goto out; +	} + +	set_bit(NFS_INO_INVALIDATING, bitlock); +	smp_wmb(); +	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; +	spin_unlock(&inode->i_lock); +	trace_nfs_invalidate_mapping_enter(inode); +	ret = nfs_invalidate_mapping(inode, mapping); +	trace_nfs_invalidate_mapping_exit(inode, ret); + +	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); +	smp_mb__after_atomic(); +	wake_up_bit(bitlock, NFS_INO_INVALIDATING);  out:  	return ret;  } -static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)  {  	struct nfs_inode *nfsi = NFS_I(inode); +	unsigned long ret = 0;  	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)  		
	&& (fattr->valid & NFS_ATTR_FATTR_CHANGE) -			&& nfsi->change_attr == fattr->pre_change_attr) { -		nfsi->change_attr = fattr->change_attr; +			&& inode->i_version == fattr->pre_change_attr) { +		inode->i_version = fattr->change_attr;  		if (S_ISDIR(inode->i_mode)) -			nfsi->cache_validity |= NFS_INO_INVALID_DATA; +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); +		ret |= NFS_INO_INVALID_ATTR;  	}  	/* If we have atomic WCC data, we may update some attributes */  	if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)  			&& (fattr->valid & NFS_ATTR_FATTR_CTIME) -			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) -			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); +			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { +		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); +		ret |= NFS_INO_INVALID_ATTR; +	}  	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)  			&& (fattr->valid & NFS_ATTR_FATTR_MTIME)  			&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { -			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); -			if (S_ISDIR(inode->i_mode)) -				nfsi->cache_validity |= NFS_INO_INVALID_DATA; +		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); +		if (S_ISDIR(inode->i_mode)) +			nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); +		ret |= NFS_INO_INVALID_ATTR;  	}  	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)  			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)  			&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) -			&& nfsi->npages == 0) -			i_size_write(inode, nfs_size_to_loff_t(fattr->size)); +			&& nfsi->npages == 0) { +		i_size_write(inode, nfs_size_to_loff_t(fattr->size)); +		ret |= NFS_INO_INVALID_ATTR; +	} + +	return ret;  }  /** @@ -927,6 +1160,8 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat  	unsigned long invalid = 0; +	if (nfs_have_delegated_attributes(inode)) +		return 0;  	/* Has the inode gone and changed behind our back? 
*/  	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)  		return -EIO; @@ -934,12 +1169,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat  		return -EIO;  	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && -			nfsi->change_attr != fattr->change_attr) +			inode->i_version != fattr->change_attr)  		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;  	/* Verify a few of the more important attributes */  	if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) -		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; +		invalid |= NFS_INO_INVALID_ATTR;  	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {  		cur_size = i_size_read(inode); @@ -951,9 +1186,9 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat  	/* Have any file permissions changed? */  	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))  		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; -	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid) +	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))  		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; -	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid) +	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))  		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;  	/* Has the link count changed? 
*/ @@ -964,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat  		invalid |= NFS_INO_INVALID_ATIME;  	if (invalid != 0) -		nfsi->cache_validity |= invalid; +		nfs_set_cache_invalid(inode, invalid);  	nfsi->read_cache_jiffies = fattr->time_start;  	return 0; @@ -1001,7 +1236,10 @@ void nfs_fattr_init(struct nfs_fattr *fattr)  	fattr->valid = 0;  	fattr->time_start = jiffies;  	fattr->gencount = nfs_inc_attr_generation_counter(); +	fattr->owner_name = NULL; +	fattr->group_name = NULL;  } +EXPORT_SYMBOL_GPL(nfs_fattr_init);  struct nfs_fattr *nfs_alloc_fattr(void)  { @@ -1012,6 +1250,7 @@ struct nfs_fattr *nfs_alloc_fattr(void)  		nfs_fattr_init(fattr);  	return fattr;  } +EXPORT_SYMBOL_GPL(nfs_alloc_fattr);  struct nfs_fh *nfs_alloc_fhandle(void)  { @@ -1022,6 +1261,70 @@ struct nfs_fh *nfs_alloc_fhandle(void)  		fh->size = 0;  	return fh;  } +EXPORT_SYMBOL_GPL(nfs_alloc_fhandle); + +#ifdef NFS_DEBUG +/* + * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle + *                             in the same way that wireshark does + * + * @fh: file handle + * + * For debugging only. + */ +u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) +{ +	/* wireshark uses 32-bit AUTODIN crc and does a bitwise +	 * not on the result */ +	return nfs_fhandle_hash(fh); +} +EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); + +/* + * _nfs_display_fhandle - display an NFS file handle on the console + * + * @fh: file handle to display + * @caption: display caption + * + * For debugging only. 
+ */ +void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) +{ +	unsigned short i; + +	if (fh == NULL || fh->size == 0) { +		printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh); +		return; +	} + +	printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n", +	       caption, fh, fh->size, _nfs_display_fhandle_hash(fh)); +	for (i = 0; i < fh->size; i += 16) { +		__be32 *pos = (__be32 *)&fh->data[i]; + +		switch ((fh->size - i - 1) >> 2) { +		case 0: +			printk(KERN_DEFAULT " %08x\n", +				be32_to_cpup(pos)); +			break; +		case 1: +			printk(KERN_DEFAULT " %08x %08x\n", +				be32_to_cpup(pos), be32_to_cpup(pos + 1)); +			break; +		case 2: +			printk(KERN_DEFAULT " %08x %08x %08x\n", +				be32_to_cpup(pos), be32_to_cpup(pos + 1), +				be32_to_cpup(pos + 2)); +			break; +		default: +			printk(KERN_DEFAULT " %08x %08x %08x %08x\n", +				be32_to_cpup(pos), be32_to_cpup(pos + 1), +				be32_to_cpup(pos + 2), be32_to_cpup(pos + 3)); +		} +	} +} +EXPORT_SYMBOL_GPL(_nfs_display_fhandle); +#endif  /**   * nfs_inode_attrs_need_update - check if the inode attributes need updating @@ -1052,11 +1355,35 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n  		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);  } +/* + * Don't trust the change_attribute, mtime, ctime or size if + * a pnfs LAYOUTCOMMIT is outstanding + */ +static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, +		struct nfs_fattr *fattr) +{ +	if (pnfs_layoutcommit_outstanding(inode)) +		fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | +				NFS_ATTR_FATTR_MTIME | +				NFS_ATTR_FATTR_CTIME | +				NFS_ATTR_FATTR_SIZE); +} +  static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)  { +	int ret; + +	trace_nfs_refresh_inode_enter(inode); + +	nfs_inode_attrs_handle_layoutcommit(inode, fattr); +  	if (nfs_inode_attrs_need_update(inode, fattr)) -		return nfs_update_inode(inode, fattr); -	return 
nfs_check_inode_attributes(inode, fattr); +		ret = nfs_update_inode(inode, fattr); +	else +		ret = nfs_check_inode_attributes(inode, fattr); + +	trace_nfs_refresh_inode_exit(inode, ret); +	return ret;  }  /** @@ -1081,14 +1408,15 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)  	return status;  } +EXPORT_SYMBOL_GPL(nfs_refresh_inode);  static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)  { -	struct nfs_inode *nfsi = NFS_I(inode); +	unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; -	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;  	if (S_ISDIR(inode->i_mode)) -		nfsi->cache_validity |= NFS_INO_INVALID_DATA; +		invalid |= NFS_INO_INVALID_DATA; +	nfs_set_cache_invalid(inode, invalid);  	if ((fattr->valid & NFS_ATTR_FATTR) == 0)  		return 0;  	return nfs_refresh_inode_locked(inode, fattr); @@ -1115,8 +1443,10 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	spin_lock(&inode->i_lock);  	status = nfs_post_op_update_inode_locked(inode, fattr);  	spin_unlock(&inode->i_lock); +  	return status;  } +EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);  /**   * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache @@ -1145,7 +1475,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa  	}  	if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&  			(fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { -		fattr->pre_change_attr = NFS_I(inode)->change_attr; +		fattr->pre_change_attr = inode->i_version;  		fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;  	}  	if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && @@ -1168,6 +1498,7 @@ out_noforce:  	spin_unlock(&inode->i_lock);  	return status;  } +EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);  /*   * Many nfs protocol calls return the new file attributes after @@ -1190,24 +1521,37 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	
unsigned long now = jiffies;  	unsigned long save_cache_validity; -	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", +	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",  			__func__, inode->i_sb->s_id, inode->i_ino, +			nfs_display_fhandle_hash(NFS_FH(inode)),  			atomic_read(&inode->i_count), fattr->valid); -	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) -		goto out_fileid; +	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) { +		printk(KERN_ERR "NFS: server %s error: fileid changed\n" +			"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", +			NFS_SERVER(inode)->nfs_client->cl_hostname, +			inode->i_sb->s_id, (long long)nfsi->fileid, +			(long long)fattr->fileid); +		goto out_err; +	}  	/*  	 * Make sure the inode's type hasn't changed.  	 */ -	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) -		goto out_changed; +	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { +		/* +		* Big trouble! The inode has become a different object. +		*/ +		printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", +				__func__, inode->i_ino, inode->i_mode, fattr->mode); +		goto out_err; +	}  	server = NFS_SERVER(inode);  	/* Update the fsid? 
*/  	if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&  			!nfs_fsid_equal(&server->fsid, &fattr->fsid) && -			!test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) +			!IS_AUTOMOUNT(inode))  		server->fsid = fattr->fsid;  	/* @@ -1222,54 +1566,37 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			| NFS_INO_REVAL_PAGECACHE);  	/* Do atomic weak cache consistency updates */ -	nfs_wcc_update_inode(inode, fattr); +	invalid |= nfs_wcc_update_inode(inode, fattr);  	/* More cache consistency checks */  	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { -		if (nfsi->change_attr != fattr->change_attr) { +		if (inode->i_version != fattr->change_attr) {  			dprintk("NFS: change_attr change on server for file %s/%ld\n",  					inode->i_sb->s_id, inode->i_ino); -			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; +			invalid |= NFS_INO_INVALID_ATTR +				| NFS_INO_INVALID_DATA +				| NFS_INO_INVALID_ACCESS +				| NFS_INO_INVALID_ACL +				| NFS_INO_REVAL_PAGECACHE;  			if (S_ISDIR(inode->i_mode))  				nfs_force_lookup_revalidate(inode); -			nfsi->change_attr = fattr->change_attr; +			inode->i_version = fattr->change_attr;  		}  	} else if (server->caps & NFS_CAP_CHANGE_ATTR) -		invalid |= save_cache_validity; +		nfsi->cache_validity |= save_cache_validity;  	if (fattr->valid & NFS_ATTR_FATTR_MTIME) { -		/* NFSv2/v3: Check if the mtime agrees */ -		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { -			dprintk("NFS: mtime change on server for file %s/%ld\n", -					inode->i_sb->s_id, inode->i_ino); -			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; -			if (S_ISDIR(inode->i_mode)) -				nfs_force_lookup_revalidate(inode); -			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); -		} +		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));  	} else if (server->caps & NFS_CAP_MTIME) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_DATA -				
| NFS_INO_REVAL_PAGECACHE +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_CTIME) { -		/* If ctime has changed we should definitely clear access+acl caches */ -		if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { -			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; -			/* and probably clear data for a directory too as utimes can cause -			 * havoc with our cache. -			 */ -			if (S_ISDIR(inode->i_mode)) { -				invalid |= NFS_INO_INVALID_DATA; -				nfs_force_lookup_revalidate(inode); -			} -			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); -		} +		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));  	} else if (server->caps & NFS_CAP_CTIME) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR -				| NFS_INO_INVALID_ACCESS -				| NFS_INO_INVALID_ACL +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_REVAL_FORCED);  	/* Check if our cached file size is stale */ @@ -1279,15 +1606,21 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  		if (new_isize != cur_isize) {  			/* Do we perhaps have any outstanding writes, or has  			 * the file grown beyond our last write? 
*/ -			if (nfsi->npages == 0 || new_isize > cur_isize) { +			if ((nfsi->npages == 0) || new_isize > cur_isize) {  				i_size_write(inode, new_isize);  				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; +				invalid &= ~NFS_INO_REVAL_PAGECACHE;  			} -			dprintk("NFS: isize change on server for file %s/%ld\n", -					inode->i_sb->s_id, inode->i_ino); +			dprintk("NFS: isize change on server for file %s/%ld " +					"(%Ld to %Ld)\n", +					inode->i_sb->s_id, +					inode->i_ino, +					(long long)cur_isize, +					(long long)new_isize);  		}  	} else -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_REVAL_PAGECACHE  				| NFS_INO_REVAL_FORCED); @@ -1295,7 +1628,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	if (fattr->valid & NFS_ATTR_FATTR_ATIME)  		memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));  	else if (server->caps & NFS_CAP_ATIME) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATIME  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_MODE) { @@ -1306,29 +1640,32 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;  		}  	} else if (server->caps & NFS_CAP_MODE) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_INVALID_ACCESS  				| NFS_INO_INVALID_ACL  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_OWNER) { -		if (inode->i_uid != fattr->uid) { +		if (!uid_eq(inode->i_uid, fattr->uid)) {  			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;  			inode->i_uid = fattr->uid;  		}  	} else if (server->caps & NFS_CAP_OWNER) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +	
	nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_INVALID_ACCESS  				| NFS_INO_INVALID_ACL  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_GROUP) { -		if (inode->i_gid != fattr->gid) { +		if (!gid_eq(inode->i_gid, fattr->gid)) {  			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;  			inode->i_gid = fattr->gid;  		}  	} else if (server->caps & NFS_CAP_OWNER_GROUP) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_INVALID_ACCESS  				| NFS_INO_INVALID_ACL  				| NFS_INO_REVAL_FORCED); @@ -1338,10 +1675,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  			invalid |= NFS_INO_INVALID_ATTR;  			if (S_ISDIR(inode->i_mode))  				invalid |= NFS_INO_INVALID_DATA; -			inode->i_nlink = fattr->nlink; +			set_nlink(inode, fattr->nlink);  		}  	} else if (server->caps & NFS_CAP_NLINK) -		invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR +		nfsi->cache_validity |= save_cache_validity & +				(NFS_INO_INVALID_ATTR  				| NFS_INO_REVAL_FORCED);  	if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -1371,17 +1709,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)  				|| S_ISLNK(inode->i_mode)))  		invalid &= ~NFS_INO_INVALID_DATA; -	if (!nfs_have_delegation(inode, FMODE_READ) || +	if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) ||  			(save_cache_validity & NFS_INO_REVAL_FORCED)) -		nfsi->cache_validity |= invalid; +		nfs_set_cache_invalid(inode, invalid);  	return 0; - out_changed: -	/* -	 * Big trouble! The inode has become a different object. 
-	 */ -	printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", -			__func__, inode->i_ino, inode->i_mode, fattr->mode);   out_err:  	/*  	 * No need to worry about unhashing the dentry, as the @@ -1390,34 +1722,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	 */  	nfs_invalidate_inode(inode);  	return -ESTALE; - - out_fileid: -	printk(KERN_ERR "NFS: server %s error: fileid changed\n" -		"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", -		NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, -		(long long)nfsi->fileid, (long long)fattr->fileid); -	goto out_err; -} - - -#ifdef CONFIG_NFS_V4 - -/* - * Clean out any remaining NFSv4 state that might be left over due - * to open() calls that passed nfs_atomic_lookup, but failed to call - * nfs_open(). - */ -void nfs4_evict_inode(struct inode *inode) -{ -	truncate_inode_pages(&inode->i_data, 0); -	end_writeback(inode); -	pnfs_destroy_layout(NFS_I(inode)); -	/* If we are holding a delegation, return it! 
*/ -	nfs_inode_return_delegation_noreclaim(inode); -	/* First call standard NFS clear_inode() code */ -	nfs_clear_inode(inode);  } -#endif  struct inode *nfs_alloc_inode(struct super_block *sb)  { @@ -1427,24 +1732,28 @@ struct inode *nfs_alloc_inode(struct super_block *sb)  		return NULL;  	nfsi->flags = 0UL;  	nfsi->cache_validity = 0UL; -#ifdef CONFIG_NFS_V3_ACL -	nfsi->acl_access = ERR_PTR(-EAGAIN); -	nfsi->acl_default = ERR_PTR(-EAGAIN); -#endif -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4)  	nfsi->nfs4_acl = NULL;  #endif /* CONFIG_NFS_V4 */  	return &nfsi->vfs_inode;  } +EXPORT_SYMBOL_GPL(nfs_alloc_inode); -void nfs_destroy_inode(struct inode *inode) +static void nfs_i_callback(struct rcu_head *head)  { +	struct inode *inode = container_of(head, struct inode, i_rcu);  	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));  } +void nfs_destroy_inode(struct inode *inode) +{ +	call_rcu(&inode->i_rcu, nfs_i_callback); +} +EXPORT_SYMBOL_GPL(nfs_destroy_inode); +  static inline void nfs4_init_once(struct nfs_inode *nfsi)  { -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4)  	INIT_LIST_HEAD(&nfsi->open_states);  	nfsi->delegation = NULL;  	nfsi->delegation_state = 0; @@ -1461,9 +1770,10 @@ static void init_once(void *foo)  	INIT_LIST_HEAD(&nfsi->open_files);  	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);  	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); -	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); +	INIT_LIST_HEAD(&nfsi->commit_info.list);  	nfsi->npages = 0; -	nfsi->ncommit = 0; +	nfsi->commit_info.ncommit = 0; +	atomic_set(&nfsi->commit_info.rpcs_out, 0);  	atomic_set(&nfsi->silly_count, 1);  	INIT_HLIST_HEAD(&nfsi->silly_list);  	init_waitqueue_head(&nfsi->waitqueue); @@ -1485,10 +1795,16 @@ static int __init nfs_init_inodecache(void)  static void nfs_destroy_inodecache(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier();  	kmem_cache_destroy(nfs_inode_cachep);  }  struct workqueue_struct *nfsiod_workqueue; +EXPORT_SYMBOL_GPL(nfsiod_workqueue);  /*   * start up the nfsiod workqueue @@ -1497,7 +1813,7 @@ static int nfsiod_start(void)  {  	struct workqueue_struct *wq;  	dprintk("RPC:       creating workqueue nfsiod\n"); -	wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); +	wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);  	if (wq == NULL)  		return -ENOMEM;  	nfsiod_workqueue = wq; @@ -1518,6 +1834,27 @@ static void nfsiod_stop(void)  	destroy_workqueue(wq);  } +int nfs_net_id; +EXPORT_SYMBOL_GPL(nfs_net_id); + +static int nfs_net_init(struct net *net) +{ +	nfs_clients_init(net); +	return 0; +} + +static void nfs_net_exit(struct net *net) +{ +	nfs_cleanup_cb_ident_idr(net); +} + +static struct pernet_operations nfs_net_ops = { +	.init = nfs_net_init, +	.exit = nfs_net_exit, +	.id   = &nfs_net_id, +	.size = sizeof(struct nfs_net), +}; +  /*   * Initialize NFS   */ @@ -1525,75 +1862,70 @@ static int __init init_nfs_fs(void)  {  	int err; -	err = nfs_idmap_init(); +	err = register_pernet_subsys(&nfs_net_ops);  	if (err < 0)  		goto out9; -	err = nfs_dns_resolver_init(); -	if (err < 0) -		goto out8; -  	err = nfs_fscache_register();  	if (err < 0) -		goto out7; +		goto out8;  	err = nfsiod_start();  	if (err) -		goto out6; +		goto out7;  	err = nfs_fs_proc_init();  	if (err) -		goto out5; +		goto out6;  	err = nfs_init_nfspagecache();  	if (err) -		goto out4; +		goto out5;  	err = nfs_init_inodecache();  	if (err) -		goto out3; +		goto out4;  	err = nfs_init_readpagecache();  	if (err) -		goto out2; +		goto out3;  	err = nfs_init_writepagecache();  	if (err) -		goto out1; +		goto out2;  	err = nfs_init_directcache();  	if (err) -		goto out0; +		goto out1;  #ifdef CONFIG_PROC_FS -	rpc_proc_register(&nfs_rpcstat); +	rpc_proc_register(&init_net, &nfs_rpcstat);  #endif  	if ((err = register_nfs_fs()) != 0) -		goto out; +		goto out0; +  	return 0; -out: +out0:  #ifdef 
CONFIG_PROC_FS -	rpc_proc_unregister("nfs"); +	rpc_proc_unregister(&init_net, "nfs");  #endif  	nfs_destroy_directcache(); -out0: -	nfs_destroy_writepagecache();  out1: -	nfs_destroy_readpagecache(); +	nfs_destroy_writepagecache();  out2: -	nfs_destroy_inodecache(); +	nfs_destroy_readpagecache();  out3: -	nfs_destroy_nfspagecache(); +	nfs_destroy_inodecache();  out4: -	nfs_fs_proc_exit(); +	nfs_destroy_nfspagecache();  out5: -	nfsiod_stop(); +	nfs_fs_proc_exit();  out6: -	nfs_fscache_unregister(); +	nfsiod_stop();  out7: -	nfs_dns_resolver_destroy(); +	nfs_fscache_unregister();  out8: -	nfs_idmap_quit(); +	unregister_pernet_subsys(&nfs_net_ops);  out9:  	return err;  } @@ -1606,10 +1938,9 @@ static void __exit exit_nfs_fs(void)  	nfs_destroy_inodecache();  	nfs_destroy_nfspagecache();  	nfs_fscache_unregister(); -	nfs_dns_resolver_destroy(); -	nfs_idmap_quit(); +	unregister_pernet_subsys(&nfs_net_ops);  #ifdef CONFIG_PROC_FS -	rpc_proc_unregister("nfs"); +	rpc_proc_unregister(&init_net, "nfs");  #endif  	unregister_nfs_fs();  	nfs_fs_proc_exit(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index db08ff3ff45..f415cbf9f6c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -5,6 +5,7 @@  #include "nfs4_fs.h"  #include <linux/mount.h>  #include <linux/security.h> +#include <linux/crc32.h>  #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) @@ -18,25 +19,21 @@ struct nfs_string;   */  #define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1) -/* - * Determine if sessions are in use. 
- */ -static inline int nfs4_has_session(const struct nfs_client *clp) +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)  { -#ifdef CONFIG_NFS_V4_1 -	if (clp->cl_session) -		return 1; -#endif /* CONFIG_NFS_V4_1 */ -	return 0; +	if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) +		fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;  } -static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)  { -#ifdef CONFIG_NFS_V4_1 -	if (nfs4_has_session(clp)) -		return (clp->cl_session->flags & SESSION4_PERSIST); -#endif /* CONFIG_NFS_V4_1 */ -	return 0; +	if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || +	    (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && +	     ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) +		return 0; + +	fattr->fileid = fattr->mounted_on_fileid; +	return 1;  }  struct nfs_clone_mount { @@ -68,24 +65,36 @@ struct nfs_clone_mount {   */  #define NFS_MAX_READDIR_PAGES 8 +struct nfs_client_initdata { +	unsigned long init_flags; +	const char *hostname; +	const struct sockaddr *addr; +	size_t addrlen; +	struct nfs_subversion *nfs_mod; +	int proto; +	u32 minorversion; +	struct net *net; +}; +  /*   * In-kernel mount arguments   */  struct nfs_parsed_mount_data {  	int			flags; -	int			rsize, wsize; -	int			timeo, retrans; -	int			acregmin, acregmax, +	unsigned int		rsize, wsize; +	unsigned int		timeo, retrans; +	unsigned int		acregmin, acregmax,  				acdirmin, acdirmax; -	int			namlen; +	unsigned int		namlen;  	unsigned int		options;  	unsigned int		bsize; -	unsigned int		auth_flavor_len; -	rpc_authflavor_t	auth_flavors[1]; +	struct nfs_auth_info	auth_info; +	rpc_authflavor_t	selected_flavor;  	char			*client_address;  	unsigned int		version;  	unsigned int		minorversion;  	char			*fscache_uniq; +	bool			need_mount;  	struct {  		struct sockaddr_storage	address; @@ -106,6 +115,7 @@ struct 
nfs_parsed_mount_data {  	} nfs_server;  	struct security_mnt_opts lsm_opts; +	struct net		*net;  };  /* mount_clnt.c */ @@ -120,31 +130,68 @@ struct nfs_mount_request {  	int			noresvport;  	unsigned int		*auth_flav_len;  	rpc_authflavor_t	*auth_flavs; +	struct net		*net; +}; + +struct nfs_mount_info { +	void (*fill_super)(struct super_block *, struct nfs_mount_info *); +	int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); +	struct nfs_parsed_mount_data *parsed; +	struct nfs_clone_mount *cloned; +	struct nfs_fh *mntfh;  };  extern int nfs_mount(struct nfs_mount_request *info);  extern void nfs_umount(const struct nfs_mount_request *info);  /* client.c */ -extern struct rpc_program nfs_program; - +extern const struct rpc_program nfs_program; +extern void nfs_clients_init(struct net *net); +extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); +int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, +				  const struct rpc_timeout *, const char *, +				  rpc_authflavor_t); +int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); +void nfs_server_insert_lists(struct nfs_server *); +void nfs_server_remove_lists(struct nfs_server *); +void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); +int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, +		rpc_authflavor_t); +struct nfs_server *nfs_alloc_server(void); +void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); + +extern void nfs_cleanup_cb_ident_idr(struct net *);  extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); -extern struct nfs_client *nfs_find_client_next(struct nfs_client *); -extern struct nfs_server *nfs_create_server( -					const struct nfs_parsed_mount_data *, -					
struct nfs_fh *); +extern void nfs_free_client(struct nfs_client *); +extern struct nfs_client *nfs4_find_client_ident(struct net *, int); +extern struct nfs_client * +nfs4_find_client_sessionid(struct net *, const struct sockaddr *, +				struct nfs4_sessionid *, u32); +extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, +					struct nfs_subversion *);  extern struct nfs_server *nfs4_create_server( -					const struct nfs_parsed_mount_data *, -					struct nfs_fh *); +					struct nfs_mount_info *, +					struct nfs_subversion *);  extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,  						      struct nfs_fh *); +extern int nfs4_update_server(struct nfs_server *server, const char *hostname, +					struct sockaddr *sap, size_t salen, +					struct net *net);  extern void nfs_free_server(struct nfs_server *server);  extern struct nfs_server *nfs_clone_server(struct nfs_server *,  					   struct nfs_fh *, -					   struct nfs_fattr *); +					   struct nfs_fattr *, +					   rpc_authflavor_t); +extern int nfs_wait_client_init_complete(const struct nfs_client *clp);  extern void nfs_mark_client_ready(struct nfs_client *clp, int state); -extern int nfs4_check_client_ready(struct nfs_client *clp); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +					     const struct sockaddr *ds_addr, +					     int ds_addrlen, int ds_proto, +					     unsigned int ds_timeo, +					     unsigned int ds_retrans); +extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, +						struct inode *);  #ifdef CONFIG_PROC_FS  extern int __init nfs_fs_proc_init(void);  extern void nfs_fs_proc_exit(void); @@ -158,21 +205,22 @@ static inline void nfs_fs_proc_exit(void)  }  #endif -/* nfs4namespace.c */ -#ifdef CONFIG_NFS_V4 -extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); -#else -static inline -struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct 
dentry *dentry) -{ -	return ERR_PTR(-ENOENT); -} +#ifdef CONFIG_NFS_V4_1 +int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); +#endif + +/* nfs3client.c */ +#if IS_ENABLED(CONFIG_NFS_V3) +struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); +struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, +				     struct nfs_fattr *, rpc_authflavor_t);  #endif  /* callback_xdr.c */  extern struct svc_version nfs4_callback_version1;  extern struct svc_version nfs4_callback_version4; +struct nfs_pageio_descriptor;  /* pagelist.c */  extern int __init nfs_init_nfspagecache(void);  extern void nfs_destroy_nfspagecache(void); @@ -183,55 +231,146 @@ extern void nfs_destroy_writepagecache(void);  extern int __init nfs_init_directcache(void);  extern void nfs_destroy_directcache(void); +extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, +			      struct nfs_pgio_header *hdr, +			      void (*release)(struct nfs_pgio_header *hdr)); +void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); +int nfs_iocounter_wait(struct nfs_io_counter *c); + +extern const struct nfs_pageio_ops nfs_pgio_rw_ops; +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); +void nfs_rw_header_free(struct nfs_pgio_header *); +void nfs_pgio_data_release(struct nfs_pgio_data *); +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, +		      const struct rpc_call_ops *, int, int); +void nfs_free_request(struct nfs_page *req); + +static inline void nfs_iocounter_init(struct nfs_io_counter *c) +{ +	c->flags = 0; +	atomic_set(&c->io_count, 0); +}  /* nfs2xdr.c */ -extern int nfs_stat_to_errno(int);  extern struct rpc_procinfo nfs_procedures[]; -extern __be32 *nfs_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs2_decode_dirent(struct xdr_stream 
*, +				struct nfs_entry *, int);  /* nfs3xdr.c */  extern struct rpc_procinfo nfs3_procedures[]; -extern __be32 *nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +extern int nfs3_decode_dirent(struct xdr_stream *, +				struct nfs_entry *, int);  /* nfs4xdr.c */ -#ifdef CONFIG_NFS_V4 -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); +#if IS_ENABLED(CONFIG_NFS_V4) +extern int nfs4_decode_dirent(struct xdr_stream *, +				struct nfs_entry *, int);  #endif  #ifdef CONFIG_NFS_V4_1  extern const u32 nfs41_maxread_overhead;  extern const u32 nfs41_maxwrite_overhead; +extern const u32 nfs41_maxgetdevinfo_overhead;  #endif  /* nfs4proc.c */ -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4)  extern struct rpc_procinfo nfs4_procedures[];  #endif +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline void nfs4_label_free(struct nfs4_label *label) +{ +	if (label) { +		kfree(label->label); +		kfree(label); +	} +	return; +} + +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ +	if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) +		nfsi->cache_validity |= NFS_INO_INVALID_LABEL; +} +#else +static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } +static inline void nfs4_label_free(void *label) {} +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ +} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ +  /* proc.c */  void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern struct nfs_client *nfs_init_client(struct nfs_client *clp, +			   const struct rpc_timeout *timeparms, +			   const char *ip_addr);  /* dir.c */ -extern int nfs_access_cache_shrinker(struct shrinker *shrink, -					int nr_to_scan, gfp_t gfp_mask); +extern void nfs_force_use_readdirplus(struct inode *dir); +extern unsigned 
long nfs_access_cache_count(struct shrinker *shrink, +					    struct shrink_control *sc); +extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, +					   struct shrink_control *sc); +struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); +int nfs_create(struct inode *, struct dentry *, umode_t, bool); +int nfs_mkdir(struct inode *, struct dentry *, umode_t); +int nfs_rmdir(struct inode *, struct dentry *); +int nfs_unlink(struct inode *, struct dentry *); +int nfs_symlink(struct inode *, struct dentry *, const char *); +int nfs_link(struct dentry *, struct inode *, struct dentry *); +int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); + +/* file.c */ +int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); +loff_t nfs_file_llseek(struct file *, loff_t, int); +int nfs_file_flush(struct file *, fl_owner_t); +ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); +ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, +			     size_t, unsigned int); +int nfs_file_mmap(struct file *, struct vm_area_struct *); +ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); +int nfs_file_release(struct inode *, struct file *); +int nfs_lock(struct file *, int, struct file_lock *); +int nfs_flock(struct file *, int, struct file_lock *); +int nfs_check_flags(int); +int nfs_setlease(struct file *, long, struct file_lock **);  /* inode.c */  extern struct workqueue_struct *nfsiod_workqueue;  extern struct inode *nfs_alloc_inode(struct super_block *sb);  extern void nfs_destroy_inode(struct inode *);  extern int nfs_write_inode(struct inode *, struct writeback_control *); +extern int nfs_drop_inode(struct inode *); +extern void nfs_clear_inode(struct inode *);  extern void nfs_evict_inode(struct inode *); -#ifdef CONFIG_NFS_V4 -extern void nfs4_evict_inode(struct inode *); -#endif  void nfs_zap_acl_cache(struct inode 
*inode);  extern int nfs_wait_bit_killable(void *word);  /* super.c */ +extern const struct super_operations nfs_sops; +extern struct file_system_type nfs_fs_type;  extern struct file_system_type nfs_xdev_fs_type; -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4)  extern struct file_system_type nfs4_xdev_fs_type;  extern struct file_system_type nfs4_referral_fs_type;  #endif +bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); +struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, +			struct nfs_subversion *); +void nfs_initialise_sb(struct super_block *); +int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); +int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); +struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *, +				   struct nfs_mount_info *, struct nfs_subversion *); +struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *); +struct dentry * nfs_xdev_mount_common(struct file_system_type *, int, +		const char *, struct nfs_mount_info *); +void nfs_kill_super(struct super_block *); +void nfs_fill_super(struct super_block *, struct nfs_mount_info *);  extern struct rpc_stat nfs_rpcstat; @@ -241,52 +380,127 @@ extern void nfs_sb_active(struct super_block *sb);  extern void nfs_sb_deactive(struct super_block *sb);  /* namespace.c */ -extern char *nfs_path(const char *base, -		      const struct dentry *droot, -		      const struct dentry *dentry, -		      char *buffer, ssize_t buflen); +#define NFS_PATH_CANONICAL 1 +extern char *nfs_path(char **p, struct dentry *dentry, +		      char *buffer, ssize_t buflen, unsigned flags); +extern struct vfsmount *nfs_d_automount(struct path *path); +struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *, +			      struct nfs_fh *, struct nfs_fattr *); +struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *, +				 struct nfs_fattr *, 
rpc_authflavor_t);  /* getroot.c */ -extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); -#ifdef CONFIG_NFS_V4 -extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); +extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, +				   const char *); +#if IS_ENABLED(CONFIG_NFS_V4) +extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, +				    const char *); -extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); +extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool);  #endif +struct nfs_pgio_completion_ops;  /* read.c */ +extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, +			struct inode *inode, bool force_mds, +			const struct nfs_pgio_completion_ops *compl_ops);  extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); + +/* super.c */ +void nfs_clone_super(struct super_block *, struct nfs_mount_info *); +void nfs_umount_begin(struct super_block *); +int  nfs_statfs(struct dentry *, struct kstatfs *); +int  nfs_show_options(struct seq_file *, struct dentry *); +int  nfs_show_devname(struct seq_file *, struct dentry *); +int  nfs_show_path(struct seq_file *, struct dentry *); +int  nfs_show_stats(struct seq_file *, struct dentry *); +void nfs_put_super(struct super_block *); +int nfs_remount(struct super_block *sb, int *flags, char *raw_data);  /* write.c */ +extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, +			struct inode *inode, int ioflags, bool force_mds, +			const struct nfs_pgio_completion_ops *compl_ops); +extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_commit_free(struct nfs_commit_data *p);  extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); +extern int 
nfs_initiate_commit(struct rpc_clnt *clnt, +			       struct nfs_commit_data *data, +			       const struct rpc_call_ops *call_ops, +			       int how, int flags); +extern void nfs_init_commit(struct nfs_commit_data *data, +			    struct list_head *head, +			    struct pnfs_layout_segment *lseg, +			    struct nfs_commit_info *cinfo); +int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, +			 struct nfs_commit_info *cinfo, int max); +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); +int nfs_scan_commit(struct inode *inode, struct list_head *dst, +		    struct nfs_commit_info *cinfo); +void nfs_mark_request_commit(struct nfs_page *req, +			     struct pnfs_layout_segment *lseg, +			     struct nfs_commit_info *cinfo); +int nfs_generic_commit_list(struct inode *inode, struct list_head *head, +			    int how, struct nfs_commit_info *cinfo); +void nfs_retry_commit(struct list_head *page_list, +		      struct pnfs_layout_segment *lseg, +		      struct nfs_commit_info *cinfo); +void nfs_commitdata_release(struct nfs_commit_data *data); +void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, +				 struct nfs_commit_info *cinfo); +void nfs_request_remove_commit_list(struct nfs_page *req, +				    struct nfs_commit_info *cinfo); +void nfs_init_cinfo(struct nfs_commit_info *cinfo, +		    struct inode *inode, +		    struct nfs_direct_req *dreq); +int nfs_key_timeout_notify(struct file *filp, struct inode *inode); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +  #ifdef CONFIG_MIGRATION  extern int nfs_migrate_page(struct address_space *, -		struct page *, struct page *); +		struct page *, struct page *, enum migrate_mode);  #else  #define nfs_migrate_page NULL  #endif +/* unlink.c */ +extern struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, +		 struct dentry *old_dentry, struct dentry *new_dentry, +		 void (*complete)(struct rpc_task *, struct nfs_renamedata *)); +extern int 
nfs_sillyrename(struct inode *dir, struct dentry *dentry); + +/* direct.c */ +void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, +			      struct nfs_direct_req *dreq); +static inline void nfs_inode_dio_wait(struct inode *inode) +{ +	inode_dio_wait(inode); +} +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +  /* nfs4proc.c */ -extern int _nfs4_call_sync(struct nfs_server *server, -			   struct rpc_message *msg, -			   struct nfs4_sequence_args *args, -			   struct nfs4_sequence_res *res, -			   int cache_reply); -extern int _nfs4_call_sync_session(struct nfs_server *server, -				   struct rpc_message *msg, -				   struct nfs4_sequence_args *args, -				   struct nfs4_sequence_res *res, -				   int cache_reply); +extern void __nfs4_read_done_cb(struct nfs_pgio_data *); +extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, +			    const struct rpc_timeout *timeparms, +			    const char *ip_addr); +extern int nfs40_walk_client_list(struct nfs_client *clp, +				struct nfs_client **result, +				struct rpc_cred *cred); +extern int nfs41_walk_client_list(struct nfs_client *clp, +				struct nfs_client **result, +				struct rpc_cred *cred);  /*   * Determine the device name as a string   */ -static inline char *nfs_devname(const struct vfsmount *mnt_parent, -				const struct dentry *dentry, +static inline char *nfs_devname(struct dentry *dentry,  				char *buffer, ssize_t buflen)  { -	return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, -			dentry, buffer, buflen); +	char *dummy; +	return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL);  }  /* @@ -349,19 +563,29 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)  static inline  unsigned int nfs_page_length(struct page *page)  { -	loff_t i_size = i_size_read(page->mapping->host); +	loff_t i_size = i_size_read(page_file_mapping(page)->host);  	if (i_size > 0) { +		pgoff_t page_index = page_file_index(page);  		pgoff_t end_index = (i_size - 
1) >> PAGE_CACHE_SHIFT; -		if (page->index < end_index) +		if (page_index < end_index)  			return PAGE_CACHE_SIZE; -		if (page->index == end_index) +		if (page_index == end_index)  			return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;  	}  	return 0;  }  /* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ +	return (mode >> 12) & 15; +} + +/*   * Determine the number of pages in an array of length 'len' and   * with a base offset of 'base'   */ @@ -373,12 +597,33 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)  }  /* - * Helper for restarting RPC calls in the possible presence of NFSv4.1 - * sessions. + * Convert a struct timespec into a 64-bit change attribute + * + * This does approximately the same thing as timespec_to_ns(), + * but for calculation efficiency, we multiply the seconds by + * 1024*1024*1024.   */ -static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) +static inline +u64 nfs_timespec_to_change_attr(const struct timespec *ts)  { -	if (nfs4_has_session(clp)) -		return rpc_restart_call_prepare(task); -	return rpc_restart_call(task); +	return ((u64)ts->tv_sec << 30) + ts->tv_nsec;  } + +#ifdef CONFIG_CRC32 +/** + * nfs_fhandle_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". 
+ */ +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ +	return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); +} +#else +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ +	return 0; +} +#endif diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index eceafe74f47..99a45283b9e 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -16,7 +16,7 @@  #include <linux/nfs_fs.h>  #include "internal.h" -#ifdef RPC_DEBUG +#ifdef NFS_DEBUG  # define NFSDBG_FACILITY	NFSDBG_MOUNT  #endif @@ -67,7 +67,7 @@ enum {  	MOUNTPROC3_EXPORT	= 5,  }; -static struct rpc_program	mnt_program; +static const struct rpc_program mnt_program;  /*   * Defined by OpenGroup XNFS Version 3W, chapter 8 @@ -139,7 +139,10 @@ struct mnt_fhstatus {   * nfs_mount - Obtain an NFS file handle for the given host and path   * @info: pointer to mount request arguments   * - * Uses default timeout parameters specified by underlying transport. + * Uses default timeout parameters specified by underlying transport. On + * successful return, the auth_flavs list and auth_flav_len will be populated + * with the list from the server or a faked-up list if the server didn't + * provide one.   */  int nfs_mount(struct nfs_mount_request *info)  { @@ -153,7 +156,7 @@ int nfs_mount(struct nfs_mount_request *info)  		.rpc_resp	= &result,  	};  	struct rpc_create_args args = { -		.net		= &init_net, +		.net		= info->net,  		.protocol	= info->protocol,  		.address	= info->sap,  		.addrsize	= info->salen, @@ -169,6 +172,9 @@ int nfs_mount(struct nfs_mount_request *info)  		(info->hostname ? 
info->hostname : "server"),  			info->dirpath); +	if (strlen(info->dirpath) > MNTPATHLEN) +		return -ENAMETOOLONG; +  	if (info->noresvport)  		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; @@ -181,7 +187,7 @@ int nfs_mount(struct nfs_mount_request *info)  	else  		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; -	status = rpc_call_sync(mnt_clnt, &msg, 0); +	status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT);  	rpc_shutdown_client(mnt_clnt);  	if (status < 0) @@ -192,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)  	dprintk("NFS: MNT request succeeded\n");  	status = 0; +	/* +	 * If the server didn't provide a flavor list, allow the +	 * client to try any flavor. +	 */ +	if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) { +		dprintk("NFS: Faking up auth_flavs list\n"); +		info->auth_flavs[0] = RPC_AUTH_NULL; +		*info->auth_flav_len = 1; +	}  out:  	return status; @@ -225,7 +240,7 @@ void nfs_umount(const struct nfs_mount_request *info)  		.to_retries = 2,  	};  	struct rpc_create_args args = { -		.net		= &init_net, +		.net		= info->net,  		.protocol	= IPPROTO_UDP,  		.address	= info->sap,  		.addrsize	= info->salen, @@ -236,19 +251,20 @@ void nfs_umount(const struct nfs_mount_request *info)  		.authflavor	= RPC_AUTH_UNIX,  		.flags		= RPC_CLNT_CREATE_NOPING,  	}; -	struct mountres	result;  	struct rpc_message msg	= {  		.rpc_argp	= info->dirpath, -		.rpc_resp	= &result,  	};  	struct rpc_clnt *clnt;  	int status; +	if (strlen(info->dirpath) > MNTPATHLEN) +		return; +  	if (info->noresvport)  		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;  	clnt = rpc_create(&args); -	if (unlikely(IS_ERR(clnt))) +	if (IS_ERR(clnt))  		goto out_clnt_err;  	dprintk("NFS: sending UMNT request for %s:%s\n", @@ -280,29 +296,19 @@ out_call_err:   * XDR encode/decode functions for MOUNT   */ -static int encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) +static void encode_mntdirpath(struct xdr_stream *xdr, const char 
*pathname)  {  	const u32 pathname_len = strlen(pathname);  	__be32 *p; -	if (unlikely(pathname_len > MNTPATHLEN)) -		return -EIO; - -	p = xdr_reserve_space(xdr, sizeof(u32) + pathname_len); -	if (unlikely(p == NULL)) -		return -EIO; +	p = xdr_reserve_space(xdr, 4 + pathname_len);  	xdr_encode_opaque(p, pathname, pathname_len); - -	return 0;  } -static int mnt_enc_dirpath(struct rpc_rqst *req, __be32 *p, -			   const char *dirpath) +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, +				const char *dirpath)  { -	struct xdr_stream xdr; - -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	return encode_mntdirpath(&xdr, dirpath); +	encode_mntdirpath(xdr, dirpath);  }  /* @@ -320,10 +326,10 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)  	u32 status;  	__be32 *p; -	p = xdr_inline_decode(xdr, sizeof(status)); +	p = xdr_inline_decode(xdr, 4);  	if (unlikely(p == NULL))  		return -EIO; -	status = ntohl(*p); +	status = be32_to_cpup(p);  	for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {  		if (mnt_errtbl[i].status == status) { @@ -351,18 +357,16 @@ static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res)  	return 0;  } -static int mnt_dec_mountres(struct rpc_rqst *req, __be32 *p, -			    struct mountres *res) +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, +				struct xdr_stream *xdr, +				struct mountres *res)  { -	struct xdr_stream xdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - -	status = decode_status(&xdr, res); +	status = decode_status(xdr, res);  	if (unlikely(status != 0 || res->errno != 0))  		return status; -	return decode_fhandle(&xdr, res); +	return decode_fhandle(xdr, res);  }  static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) @@ -371,10 +375,10 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)  	u32 status;  	__be32 *p; -	p = xdr_inline_decode(xdr, sizeof(status)); +	p = xdr_inline_decode(xdr, 4);  	if (unlikely(p == 
NULL))  		return -EIO; -	status = ntohl(*p); +	status = be32_to_cpup(p);  	for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {  		if (mnt3_errtbl[i].status == status) { @@ -394,11 +398,11 @@ static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res)  	u32 size;  	__be32 *p; -	p = xdr_inline_decode(xdr, sizeof(size)); +	p = xdr_inline_decode(xdr, 4);  	if (unlikely(p == NULL))  		return -EIO; -	size = ntohl(*p++); +	size = be32_to_cpup(p);  	if (size > NFS3_FHSIZE || size == 0)  		return -EIO; @@ -421,15 +425,15 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)  	if (*count == 0)  		return 0; -	p = xdr_inline_decode(xdr, sizeof(entries)); +	p = xdr_inline_decode(xdr, 4);  	if (unlikely(p == NULL))  		return -EIO; -	entries = ntohl(*p); +	entries = be32_to_cpup(p);  	dprintk("NFS: received %u auth flavors\n", entries);  	if (entries > NFS_MAX_SECFLAVORS)  		entries = NFS_MAX_SECFLAVORS; -	p = xdr_inline_decode(xdr, sizeof(u32) * entries); +	p = xdr_inline_decode(xdr, 4 * entries);  	if (unlikely(p == NULL))  		return -EIO; @@ -437,7 +441,7 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)  		entries = *count;  	for (i = 0; i < entries; i++) { -		flavors[i] = ntohl(*p++); +		flavors[i] = be32_to_cpup(p++);  		dprintk("NFS:   auth flavor[%u]: %d\n", i, flavors[i]);  	}  	*count = i; @@ -445,30 +449,28 @@ static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res)  	return 0;  } -static int mnt_dec_mountres3(struct rpc_rqst *req, __be32 *p, -			     struct mountres *res) +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, +				 struct xdr_stream *xdr, +				 struct mountres *res)  { -	struct xdr_stream xdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - -	status = decode_fhs_status(&xdr, res); +	status = decode_fhs_status(xdr, res);  	if (unlikely(status != 0 || res->errno != 0))  		return status; -	status = decode_fhandle3(&xdr, res); +	status = decode_fhandle3(xdr, 
res);  	if (unlikely(status != 0)) {  		res->errno = -EBADHANDLE;  		return 0;  	} -	return decode_auth_flavors(&xdr, res); +	return decode_auth_flavors(xdr, res);  }  static struct rpc_procinfo mnt_procedures[] = {  	[MOUNTPROC_MNT] = {  		.p_proc		= MOUNTPROC_MNT, -		.p_encode	= (kxdrproc_t)mnt_enc_dirpath, -		.p_decode	= (kxdrproc_t)mnt_dec_mountres, +		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath, +		.p_decode	= (kxdrdproc_t)mnt_xdr_dec_mountres,  		.p_arglen	= MNT_enc_dirpath_sz,  		.p_replen	= MNT_dec_mountres_sz,  		.p_statidx	= MOUNTPROC_MNT, @@ -476,7 +478,7 @@ static struct rpc_procinfo mnt_procedures[] = {  	},  	[MOUNTPROC_UMNT] = {  		.p_proc		= MOUNTPROC_UMNT, -		.p_encode	= (kxdrproc_t)mnt_enc_dirpath, +		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,  		.p_arglen	= MNT_enc_dirpath_sz,  		.p_statidx	= MOUNTPROC_UMNT,  		.p_name		= "UMOUNT", @@ -486,8 +488,8 @@ static struct rpc_procinfo mnt_procedures[] = {  static struct rpc_procinfo mnt3_procedures[] = {  	[MOUNTPROC3_MNT] = {  		.p_proc		= MOUNTPROC3_MNT, -		.p_encode	= (kxdrproc_t)mnt_enc_dirpath, -		.p_decode	= (kxdrproc_t)mnt_dec_mountres3, +		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath, +		.p_decode	= (kxdrdproc_t)mnt_xdr_dec_mountres3,  		.p_arglen	= MNT_enc_dirpath_sz,  		.p_replen	= MNT_dec_mountres3_sz,  		.p_statidx	= MOUNTPROC3_MNT, @@ -495,7 +497,7 @@ static struct rpc_procinfo mnt3_procedures[] = {  	},  	[MOUNTPROC3_UMNT] = {  		.p_proc		= MOUNTPROC3_UMNT, -		.p_encode	= (kxdrproc_t)mnt_enc_dirpath, +		.p_encode	= (kxdreproc_t)mnt_xdr_enc_dirpath,  		.p_arglen	= MNT_enc_dirpath_sz,  		.p_statidx	= MOUNTPROC3_UMNT,  		.p_name		= "UMOUNT", @@ -503,19 +505,19 @@ static struct rpc_procinfo mnt3_procedures[] = {  }; -static struct rpc_version mnt_version1 = { +static const struct rpc_version mnt_version1 = {  	.number		= 1, -	.nrprocs	= 2, +	.nrprocs	= ARRAY_SIZE(mnt_procedures),  	.procs		= mnt_procedures,  }; -static struct rpc_version mnt_version3 = { +static const struct rpc_version 
mnt_version3 = {  	.number		= 3, -	.nrprocs	= 2, +	.nrprocs	= ARRAY_SIZE(mnt3_procedures),  	.procs		= mnt3_procedures,  }; -static struct rpc_version *mnt_version[] = { +static const struct rpc_version *mnt_version[] = {  	NULL,  	&mnt_version1,  	NULL, @@ -524,7 +526,7 @@ static struct rpc_version *mnt_version[] = {  static struct rpc_stat mnt_stats; -static struct rpc_program mnt_program = { +static const struct rpc_program mnt_program = {  	.name		= "mount",  	.number		= NFS_MNT_PROGRAM,  	.nrvers		= ARRAY_SIZE(mnt_version), diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index db6aa3673cf..b5a0afc3ee1 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -7,6 +7,7 @@   * NFS namespace   */ +#include <linux/module.h>  #include <linux/dcache.h>  #include <linux/gfp.h>  #include <linux/mount.h> @@ -15,6 +16,7 @@  #include <linux/string.h>  #include <linux/sunrpc/clnt.h>  #include <linux/vfs.h> +#include <linux/sunrpc/gss_api.h>  #include "internal.h"  #define NFSDBG_FACILITY		NFSDBG_VFS @@ -25,37 +27,45 @@ static LIST_HEAD(nfs_automount_list);  static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);  int nfs_mountpoint_expiry_timeout = 500 * HZ; -static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, -					const struct dentry *dentry, -					struct nfs_fh *fh, -					struct nfs_fattr *fattr); -  /*   * nfs_path - reconstruct the path given an arbitrary dentry - * @base - arbitrary string to prepend to the path - * @droot - pointer to root dentry for mountpoint + * @base - used to return pointer to the end of devname part of path   * @dentry - pointer to dentry   * @buffer - result buffer   * @buflen - length of buffer + * @flags - options (see below)   * - * Helper function for constructing the path from the - * root dentry to an arbitrary hashed dentry. + * Helper function for constructing the server pathname + * by arbitrary hashed dentry.   
*   * This is mainly for use in figuring out the path on the - * server side when automounting on top of an existing partition. + * server side when automounting on top of an existing partition + * and in generating /proc/mounts and friends. + * + * Supported flags: + * NFS_PATH_CANONICAL: ensure there is exactly one slash after + *		       the original device (export) name + *		       (if unset, the original name is returned verbatim)   */ -char *nfs_path(const char *base, -	       const struct dentry *droot, -	       const struct dentry *dentry, -	       char *buffer, ssize_t buflen) +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, +	       unsigned flags)  { -	char *end = buffer+buflen; +	char *end;  	int namelen; +	unsigned seq; +	const char *base; +rename_retry: +	end = buffer+buflen;  	*--end = '\0';  	buflen--; -	spin_lock(&dcache_lock); -	while (!IS_ROOT(dentry) && dentry != droot) { + +	seq = read_seqbegin(&rename_lock); +	rcu_read_lock(); +	while (1) { +		spin_lock(&dentry->d_lock); +		if (IS_ROOT(dentry)) +			break;  		namelen = dentry->d_name.len;  		buflen -= namelen + 1;  		if (buflen < 0) @@ -63,34 +73,60 @@ char *nfs_path(const char *base,  		end -= namelen;  		memcpy(end, dentry->d_name.name, namelen);  		*--end = '/'; +		spin_unlock(&dentry->d_lock);  		dentry = dentry->d_parent;  	} -	spin_unlock(&dcache_lock); -	if (*end != '/') { -		if (--buflen < 0) +	if (read_seqretry(&rename_lock, seq)) { +		spin_unlock(&dentry->d_lock); +		rcu_read_unlock(); +		goto rename_retry; +	} +	if ((flags & NFS_PATH_CANONICAL) && *end != '/') { +		if (--buflen < 0) { +			spin_unlock(&dentry->d_lock); +			rcu_read_unlock();  			goto Elong; +		}  		*--end = '/';  	} +	*p = end; +	base = dentry->d_fsdata; +	if (!base) { +		spin_unlock(&dentry->d_lock); +		rcu_read_unlock(); +		WARN_ON(1); +		return end; +	}  	namelen = strlen(base); -	/* Strip off excess slashes in base string */ -	while (namelen > 0 && base[namelen - 1] == '/') -		
namelen--; +	if (flags & NFS_PATH_CANONICAL) { +		/* Strip off excess slashes in base string */ +		while (namelen > 0 && base[namelen - 1] == '/') +			namelen--; +	}  	buflen -= namelen; -	if (buflen < 0) +	if (buflen < 0) { +		spin_unlock(&dentry->d_lock); +		rcu_read_unlock();  		goto Elong; +	}  	end -= namelen;  	memcpy(end, base, namelen); +	spin_unlock(&dentry->d_lock); +	rcu_read_unlock();  	return end;  Elong_unlock: -	spin_unlock(&dcache_lock); +	spin_unlock(&dentry->d_lock); +	rcu_read_unlock(); +	if (read_seqretry(&rename_lock, seq)) +		goto rename_retry;  Elong:  	return ERR_PTR(-ENAMETOOLONG);  } +EXPORT_SYMBOL_GPL(nfs_path);  /* - * nfs_follow_mountpoint - handle crossing a mountpoint on the server - * @dentry - dentry of mountpoint - * @nd - nameidata info + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint   *   * When we encounter a mountpoint on the server, we want to set up   * a mountpoint on the client too, to prevent inode numbers from @@ -100,87 +136,72 @@ Elong:   * situation, and that different filesystems may want to use   * different security flavours.   
*/ -static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *nfs_d_automount(struct path *path)  {  	struct vfsmount *mnt; -	struct nfs_server *server = NFS_SERVER(dentry->d_inode); -	struct dentry *parent; +	struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);  	struct nfs_fh *fh = NULL;  	struct nfs_fattr *fattr = NULL; -	int err; -	dprintk("--> nfs_follow_mountpoint()\n"); +	dprintk("--> nfs_d_automount()\n"); -	err = -ESTALE; -	if (IS_ROOT(dentry)) -		goto out_err; +	mnt = ERR_PTR(-ESTALE); +	if (IS_ROOT(path->dentry)) +		goto out_nofree; -	err = -ENOMEM; +	mnt = ERR_PTR(-ENOMEM);  	fh = nfs_alloc_fhandle();  	fattr = nfs_alloc_fattr();  	if (fh == NULL || fattr == NULL) -		goto out_err; +		goto out;  	dprintk("%s: enter\n", __func__); -	dput(nd->path.dentry); -	nd->path.dentry = dget(dentry); -	/* Look it up again */ -	parent = dget_parent(nd->path.dentry); -	err = server->nfs_client->rpc_ops->lookup(parent->d_inode, -						  &nd->path.dentry->d_name, -						  fh, fattr); -	dput(parent); -	if (err != 0) -		goto out_err; - -	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) -		mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); -	else -		mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, -				      fattr); -	err = PTR_ERR(mnt); +	mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr);  	if (IS_ERR(mnt)) -		goto out_err; +		goto out; -	mntget(mnt); -	err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, -			   &nfs_automount_list); -	if (err < 0) { -		mntput(mnt); -		if (err == -EBUSY) -			goto out_follow; -		goto out_err; -	} -	path_put(&nd->path); -	nd->path.mnt = mnt; -	nd->path.dentry = dget(mnt->mnt_root); +	dprintk("%s: done, success\n", __func__); +	mntget(mnt); /* prevent immediate expiration */ +	mnt_set_expiry(mnt, &nfs_automount_list);  	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +  out:  	nfs_free_fattr(fattr);  	
nfs_free_fhandle(fh); -	dprintk("%s: done, returned %d\n", __func__, err); +out_nofree: +	if (IS_ERR(mnt)) +		dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); +	else +		dprintk("<-- %s() = %p\n", __func__, mnt); +	return mnt; +} -	dprintk("<-- nfs_follow_mountpoint() = %d\n", err); -	return ERR_PTR(err); -out_err: -	path_put(&nd->path); -	goto out; -out_follow: -	while (d_mountpoint(nd->path.dentry) && -	       follow_down(&nd->path)) -		; -	err = 0; -	goto out; +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ +	if (NFS_FH(dentry->d_inode)->size != 0) +		return nfs_getattr(mnt, dentry, stat); +	generic_fillattr(dentry->d_inode, stat); +	return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ +	if (NFS_FH(dentry->d_inode)->size != 0) +		return nfs_setattr(dentry, attr); +	return -EACCES;  }  const struct inode_operations nfs_mountpoint_inode_operations = { -	.follow_link	= nfs_follow_mountpoint,  	.getattr	= nfs_getattr, +	.setattr	= nfs_setattr,  };  const struct inode_operations nfs_referral_inode_operations = { -	.follow_link	= nfs_follow_mountpoint, +	.getattr	= nfs_namespace_getattr, +	.setattr	= nfs_namespace_setattr,  };  static void nfs_expire_automounts(struct work_struct *work) @@ -205,40 +226,26 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,  					   const char *devname,  					   struct nfs_clone_mount *mountdata)  { -#ifdef CONFIG_NFS_V4 -	struct vfsmount *mnt = ERR_PTR(-EINVAL); -	switch (server->nfs_client->rpc_ops->version) { -		case 2: -		case 3: -			mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); -			break; -		case 4: -			mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata); -	} -	return mnt; -#else  	return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); -#endif  }  /**   * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @mnt_parent - mountpoint of parent 
directory   * @dentry - parent directory   * @fh - filehandle for new root dentry   * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount   *   */ -static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, -					const struct dentry *dentry, -					struct nfs_fh *fh, -					struct nfs_fattr *fattr) +struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, +				 struct nfs_fattr *fattr, rpc_authflavor_t authflavor)  {  	struct nfs_clone_mount mountdata = { -		.sb = mnt_parent->mnt_sb, +		.sb = dentry->d_sb,  		.dentry = dentry,  		.fh = fh,  		.fattr = fattr, +		.authflavor = authflavor,  	};  	struct vfsmount *mnt = ERR_PTR(-ENOMEM);  	char *page = (char *) __get_free_page(GFP_USER); @@ -246,16 +253,15 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,  	dprintk("--> nfs_do_submount()\n"); -	dprintk("%s: submounting on %s/%s\n", __func__, -			dentry->d_parent->d_name.name, -			dentry->d_name.name); +	dprintk("%s: submounting on %pd2\n", __func__, +			dentry);  	if (page == NULL)  		goto out; -	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); +	devname = nfs_devname(dentry, page, PAGE_SIZE);  	mnt = (struct vfsmount *)devname;  	if (IS_ERR(devname))  		goto free_page; -	mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata); +	mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);  free_page:  	free_page((unsigned long)page);  out: @@ -264,3 +270,20 @@ out:  	dprintk("<-- nfs_do_submount() = %p\n", mnt);  	return mnt;  } +EXPORT_SYMBOL_GPL(nfs_do_submount); + +struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, +			      struct nfs_fh *fh, struct nfs_fattr *fattr) +{ +	int err; +	struct dentry *parent = dget_parent(dentry); + +	/* Look it up again to get its attributes */ +	err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL); +	dput(parent); +	
if (err != 0) +		return ERR_PTR(err); + +	return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor); +} +EXPORT_SYMBOL_GPL(nfs_submount); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h new file mode 100644 index 00000000000..8ee1fab8326 --- /dev/null +++ b/fs/nfs/netns.h @@ -0,0 +1,36 @@ +/* + * NFS-private data for each "struct net".  Accessed with net_generic(). + */ + +#ifndef __NFS_NETNS_H__ +#define __NFS_NETNS_H__ + +#include <linux/nfs4.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +struct bl_dev_msg { +	int32_t status; +	uint32_t major, minor; +}; + +struct nfs_net { +	struct cache_detail *nfs_dns_resolve; +	struct rpc_pipe *bl_device_pipe; +	struct bl_dev_msg bl_mount_reply; +	wait_queue_head_t bl_wq; +	struct list_head nfs_client_list; +	struct list_head nfs_volume_list; +#if IS_ENABLED(CONFIG_NFS_V4) +	struct idr cb_ident_idr; /* Protected by nfs_client_lock */ +	unsigned short nfs_callback_tcpport; +	unsigned short nfs_callback_tcpport6; +	int cb_users[NFS4_MAX_MINOR_VERSION + 1]; +#endif +	spinlock_t nfs_client_lock; +	struct timespec boot_time; +}; + +extern int nfs_net_id; + +#endif diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h new file mode 100644 index 00000000000..43679df56cd --- /dev/null +++ b/fs/nfs/nfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + * + * Function and structures exported by the NFS module + * for use by NFS version-specific modules. 
+ */ +#ifndef __LINUX_INTERNAL_NFS_H +#define __LINUX_INTERNAL_NFS_H + +#include <linux/fs.h> +#include <linux/sunrpc/sched.h> +#include <linux/nfs_xdr.h> + +struct nfs_subversion { +	struct module *owner;	/* THIS_MODULE pointer */ +	struct file_system_type *nfs_fs;	/* NFS filesystem type */ +	const struct rpc_version *rpc_vers;	/* NFS version information */ +	const struct nfs_rpc_ops *rpc_ops;	/* NFS operations */ +	const struct super_operations *sops;	/* NFS Super operations */ +	const struct xattr_handler **xattr;	/* NFS xattr handlers */ +	struct list_head list;		/* List of NFS versions */ +}; + +struct nfs_subversion *get_nfs_version(unsigned int); +void put_nfs_version(struct nfs_subversion *); +void register_nfs_version(struct nfs_subversion *); +void unregister_nfs_version(struct nfs_subversion *); + +#endif /* __LINUX_INTERNAL_NFS_H */ diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c new file mode 100644 index 00000000000..0a9782c9171 --- /dev/null +++ b/fs/nfs/nfs2super.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. 
+ */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include "internal.h" +#include "nfs.h" + +static struct nfs_subversion nfs_v2 = { +	.owner = THIS_MODULE, +	.nfs_fs   = &nfs_fs_type, +	.rpc_vers = &nfs_version2, +	.rpc_ops  = &nfs_v2_clientops, +	.sops     = &nfs_sops, +}; + +static int __init init_nfs_v2(void) +{ +	register_nfs_version(&nfs_v2); +	return 0; +} + +static void __exit exit_nfs_v2(void) +{ +	unregister_nfs_version(&nfs_v2); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v2); +module_exit(exit_nfs_v2); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index e6bf45710cc..5f61b83f4a1 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -61,582 +61,998 @@  #define NFS_readdirres_sz	(1)  #define NFS_statfsres_sz	(1+NFS_info_sz) +static int nfs_stat_to_errno(enum nfs_stat); +  /* - * Common NFS XDR functions as inlines + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache.   */ -static inline __be32 * -xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle) +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, +				 unsigned int base, unsigned int len, +				 unsigned int bufsize)  { -	memcpy(p, fhandle->data, NFS2_FHSIZE); -	return p + XDR_QUADLEN(NFS2_FHSIZE); -} +	struct rpc_auth	*auth = req->rq_cred->cr_auth; +	unsigned int replen; -static inline __be32 * -xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle) -{ -	/* NFSv2 handles have a fixed length */ -	fhandle->size = NFS2_FHSIZE; -	memcpy(fhandle->data, p, NFS2_FHSIZE); -	return p + XDR_QUADLEN(NFS2_FHSIZE); +	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; +	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len);  } -static inline __be32* -xdr_encode_time(__be32 *p, struct timespec *timep) +/* + * Handle decode buffer overflows out-of-line. 
+ */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)  { -	*p++ = htonl(timep->tv_sec); -	/* Convert nanoseconds into microseconds */ -	*p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0); -	return p; +	dprintk("NFS: %s prematurely hit the end of our receive buffer. " +		"Remaining buffer length is %tu words.\n", +		func, xdr->end - xdr->p);  } -static inline __be32* -xdr_encode_current_server_time(__be32 *p, struct timespec *timep) + +/* + * Encode/decode NFSv2 basic data types + * + * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: + * "NFS: Network File System Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions.  For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +/* + *	typedef opaque	nfsdata<>; + */ +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)  { -	/* -	 * Passing the invalid value useconds=1000000 is a -	 * Sun convention for "set to current server time". -	 * It's needed to make permissions checks for the -	 * "touch" program across v2 mounts to Solaris and -	 * Irix boxes work correctly. See description of -	 * sattr in section 6.1 of "NFS Illustrated" by -	 * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 -	 */ -	*p++ = htonl(timep->tv_sec); -	*p++ = htonl(1000000); -	return p; +	u32 recvd, count; +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	count = be32_to_cpup(p); +	recvd = xdr_read_pages(xdr, count); +	if (unlikely(count > recvd)) +		goto out_cheating; +out: +	result->eof = 0;	/* NFSv2 does not pass EOF flag on the wire. 
*/ +	result->count = count; +	return count; +out_cheating: +	dprintk("NFS: server cheating in read result: " +		"count %u > recvd %u\n", count, recvd); +	count = recvd; +	goto out; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO;  } -static inline __be32* -xdr_decode_time(__be32 *p, struct timespec *timep) +/* + *	enum stat { + *		NFS_OK = 0, + *		NFSERR_PERM = 1, + *		NFSERR_NOENT = 2, + *		NFSERR_IO = 5, + *		NFSERR_NXIO = 6, + *		NFSERR_ACCES = 13, + *		NFSERR_EXIST = 17, + *		NFSERR_NODEV = 19, + *		NFSERR_NOTDIR = 20, + *		NFSERR_ISDIR = 21, + *		NFSERR_FBIG = 27, + *		NFSERR_NOSPC = 28, + *		NFSERR_ROFS = 30, + *		NFSERR_NAMETOOLONG = 63, + *		NFSERR_NOTEMPTY = 66, + *		NFSERR_DQUOT = 69, + *		NFSERR_STALE = 70, + *		NFSERR_WFLUSH = 99 + *	}; + */ +static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status)  { -	timep->tv_sec = ntohl(*p++); -	/* Convert microseconds into nanoseconds */ -	timep->tv_nsec = ntohl(*p++) * 1000; -	return p; +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	*status = be32_to_cpup(p); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO;  } -static __be32 * -xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) +/* + * 2.3.2.  
ftype + * + *	enum ftype { + *		NFNON = 0, + *		NFREG = 1, + *		NFDIR = 2, + *		NFBLK = 3, + *		NFCHR = 4, + *		NFLNK = 5 + *	}; + * + */ +static __be32 *xdr_decode_ftype(__be32 *p, u32 *type)  { -	u32 rdev, type; -	type = ntohl(*p++); -	fattr->mode = ntohl(*p++); -	fattr->nlink = ntohl(*p++); -	fattr->uid = ntohl(*p++); -	fattr->gid = ntohl(*p++); -	fattr->size = ntohl(*p++); -	fattr->du.nfs2.blocksize = ntohl(*p++); -	rdev = ntohl(*p++); -	fattr->du.nfs2.blocks = ntohl(*p++); -	fattr->fsid.major = ntohl(*p++); -	fattr->fsid.minor = 0; -	fattr->fileid = ntohl(*p++); -	p = xdr_decode_time(p, &fattr->atime); -	p = xdr_decode_time(p, &fattr->mtime); -	p = xdr_decode_time(p, &fattr->ctime); -	fattr->valid |= NFS_ATTR_FATTR_V2; -	fattr->rdev = new_decode_dev(rdev); -	if (type == NFCHR && rdev == NFS2_FIFO_DEV) { -		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; -		fattr->rdev = 0; -	} +	*type = be32_to_cpup(p++); +	if (unlikely(*type > NF2FIFO)) +		*type = NFBAD;  	return p;  } -static inline __be32 * -xdr_encode_sattr(__be32 *p, struct iattr *attr) +/* + * 2.3.3.  fhandle + * + *	typedef opaque fhandle[FHSIZE]; + */ +static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)  { -	const __be32 not_set = __constant_htonl(0xFFFFFFFF); - -	*p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; -	*p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; -	*p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set; -	*p++ = (attr->ia_valid & ATTR_SIZE) ? 
htonl(attr->ia_size) : not_set; - -	if (attr->ia_valid & ATTR_ATIME_SET) { -		p = xdr_encode_time(p, &attr->ia_atime); -	} else if (attr->ia_valid & ATTR_ATIME) { -		p = xdr_encode_current_server_time(p, &attr->ia_atime); -	} else { -		*p++ = not_set; -		*p++ = not_set; -	} +	__be32 *p; -	if (attr->ia_valid & ATTR_MTIME_SET) { -		p = xdr_encode_time(p, &attr->ia_mtime); -	} else if (attr->ia_valid & ATTR_MTIME) { -		p = xdr_encode_current_server_time(p, &attr->ia_mtime); -	} else { -		*p++ = not_set;	 -		*p++ = not_set; -	} -  	return p; +	p = xdr_reserve_space(xdr, NFS2_FHSIZE); +	memcpy(p, fh->data, NFS2_FHSIZE);  } -/* - * NFS encode functions - */ -/* - * Encode file handle argument - * GETATTR, READLINK, STATFS - */ -static int -nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) +static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)  { -	p = xdr_encode_fhandle(p, fh); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +	__be32 *p; + +	p = xdr_inline_decode(xdr, NFS2_FHSIZE); +	if (unlikely(p == NULL)) +		goto out_overflow; +	fh->size = NFS2_FHSIZE; +	memcpy(fh->data, p, NFS2_FHSIZE);  	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO;  }  /* - * Encode SETATTR arguments + * 2.3.4.  
timeval + * + *	struct timeval { + *		unsigned int seconds; + *		unsigned int useconds; + *	};   */ -static int -nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args) +static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep)  { -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_sattr(p, args->sattr); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	*p++ = cpu_to_be32(timep->tv_sec); +	if (timep->tv_nsec != 0) +		*p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); +	else +		*p++ = cpu_to_be32(0); +	return p;  }  /* - * Encode directory ops argument - * LOOKUP, RMDIR + * Passing the invalid value useconds=1000000 is a Sun convention for + * "set to current server time".  It's needed to make permissions checks + * for the "touch" program across v2 mounts to Solaris and Irix servers + * work correctly.  See description of sattr in section 6.1 of "NFS + * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5.   */ -static int -nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args) +static __be32 *xdr_encode_current_server_time(__be32 *p, +					      const struct timespec *timep)  { -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_array(p, args->name, args->len); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	*p++ = cpu_to_be32(timep->tv_sec); +	*p++ = cpu_to_be32(1000000); +	return p;  } -/* - * Encode REMOVE argument - */ -static int -nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep)  { -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_array(p, args->name.name, args->name.len); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	timep->tv_sec = be32_to_cpup(p++); +	timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; +	return p;  }  /* - * Arguments to a READ call. 
Since we read data directly into the page - * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page we want to fetch. + * 2.3.5.  fattr + * + *	struct fattr { + *		ftype		type; + *		unsigned int	mode; + *		unsigned int	nlink; + *		unsigned int	uid; + *		unsigned int	gid; + *		unsigned int	size; + *		unsigned int	blocksize; + *		unsigned int	rdev; + *		unsigned int	blocks; + *		unsigned int	fsid; + *		unsigned int	fileid; + *		timeval		atime; + *		timeval		mtime; + *		timeval		ctime; + *	}; + *   */ -static int -nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)  { -	struct rpc_auth	*auth = req->rq_cred->cr_auth; -	unsigned int replen; -	u32 offset = (u32)args->offset; -	u32 count = args->count; +	u32 rdev, type; +	__be32 *p; -	p = xdr_encode_fhandle(p, args->fh); -	*p++ = htonl(offset); -	*p++ = htonl(count); -	*p++ = htonl(count); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +	p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); +	if (unlikely(p == NULL)) +		goto out_overflow; + +	fattr->valid |= NFS_ATTR_FATTR_V2; + +	p = xdr_decode_ftype(p, &type); + +	fattr->mode = be32_to_cpup(p++); +	fattr->nlink = be32_to_cpup(p++); +	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); +	if (!uid_valid(fattr->uid)) +		goto out_uid; +	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); +	if (!gid_valid(fattr->gid)) +		goto out_gid; +		 +	fattr->size = be32_to_cpup(p++); +	fattr->du.nfs2.blocksize = be32_to_cpup(p++); + +	rdev = be32_to_cpup(p++); +	fattr->rdev = new_decode_dev(rdev); +	if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) { +		fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; +		fattr->rdev = 0; +	} + +	fattr->du.nfs2.blocks = be32_to_cpup(p++); +	fattr->fsid.major = be32_to_cpup(p++); +	fattr->fsid.minor = 0; +	fattr->fileid = be32_to_cpup(p++); + +	p = xdr_decode_time(p, &fattr->atime); +	p = xdr_decode_time(p, 
&fattr->mtime); +	xdr_decode_time(p, &fattr->ctime); +	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); -	/* Inline the page array */ -	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; -	xdr_inline_pages(&req->rq_rcv_buf, replen, -			 args->pages, args->pgbase, count); -	req->rq_rcv_buf.flags |= XDRBUF_READ;  	return 0; +out_uid: +	dprintk("NFS: returned invalid uid\n"); +	return -EINVAL; +out_gid: +	dprintk("NFS: returned invalid gid\n"); +	return -EINVAL; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO;  }  /* - * Decode READ reply + * 2.3.6.  sattr + * + *	struct sattr { + *		unsigned int	mode; + *		unsigned int	uid; + *		unsigned int	gid; + *		unsigned int	size; + *		timeval		atime; + *		timeval		mtime; + *	};   */ -static int -nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) -{ -	struct kvec *iov = req->rq_rcv_buf.head; -	size_t hdrlen; -	u32 count, recvd; -	int status; -	if ((status = ntohl(*p++))) -		return nfs_stat_to_errno(status); -	p = xdr_decode_fattr(p, res->fattr); +#define NFS2_SATTR_NOT_SET	(0xffffffff) -	count = ntohl(*p++); -	res->eof = 0; -	hdrlen = (u8 *) p - (u8 *) iov->iov_base; -	if (iov->iov_len < hdrlen) { -		dprintk("NFS: READ reply header overflowed:" -				"length %Zu > %Zu\n", hdrlen, iov->iov_len); -		return -errno_NFSERR_IO; -	} else if (iov->iov_len != hdrlen) { -		dprintk("NFS: READ header is short. 
iovec will be shifted.\n"); -		xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); -	} +static __be32 *xdr_time_not_set(__be32 *p) +{ +	*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); +	*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); +	return p; +} -	recvd = req->rq_rcv_buf.len - hdrlen; -	if (count > recvd) { -		dprintk("NFS: server cheating in read reply: " -			"count %u > recvd %u\n", count, recvd); -		count = recvd; -	} +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) +{ +	__be32 *p; -	dprintk("RPC:      readres OK count %u\n", count); -	if (count < res->count) -		res->count = count; +	p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); -	return count; -} +	if (attr->ia_valid & ATTR_MODE) +		*p++ = cpu_to_be32(attr->ia_mode); +	else +		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); +	if (attr->ia_valid & ATTR_UID) +		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); +	else +		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); +	if (attr->ia_valid & ATTR_GID) +		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); +	else +		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); +	if (attr->ia_valid & ATTR_SIZE) +		*p++ = cpu_to_be32((u32)attr->ia_size); +	else +		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); +	if (attr->ia_valid & ATTR_ATIME_SET) +		p = xdr_encode_time(p, &attr->ia_atime); +	else if (attr->ia_valid & ATTR_ATIME) +		p = xdr_encode_current_server_time(p, &attr->ia_atime); +	else +		p = xdr_time_not_set(p); +	if (attr->ia_valid & ATTR_MTIME_SET) +		xdr_encode_time(p, &attr->ia_mtime); +	else if (attr->ia_valid & ATTR_MTIME) +		xdr_encode_current_server_time(p, &attr->ia_mtime); +	else +		xdr_time_not_set(p); +}  /* - * Write arguments. Splice the buffer to be written into the iovec. + * 2.3.7.  
filename + * + *	typedef string filename<MAXNAMLEN>;   */ -static int -nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_filename(struct xdr_stream *xdr, +			    const char *name, u32 length)  { -	struct xdr_buf *sndbuf = &req->rq_snd_buf; -	u32 offset = (u32)args->offset; -	u32 count = args->count; +	__be32 *p; + +	WARN_ON_ONCE(length > NFS2_MAXNAMLEN); +	p = xdr_reserve_space(xdr, 4 + length); +	xdr_encode_opaque(p, name, length); +} -	p = xdr_encode_fhandle(p, args->fh); -	*p++ = htonl(offset); -	*p++ = htonl(offset); -	*p++ = htonl(count); -	*p++ = htonl(count); -	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); +static int decode_filename_inline(struct xdr_stream *xdr, +				  const char **name, u32 *length) +{ +	__be32 *p; +	u32 count; -	/* Copy the page array */ -	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); -	sndbuf->flags |= XDRBUF_WRITE; +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	count = be32_to_cpup(p); +	if (count > NFS3_MAXNAMLEN) +		goto out_nametoolong; +	p = xdr_inline_decode(xdr, count); +	if (unlikely(p == NULL)) +		goto out_overflow; +	*name = (const char *)p; +	*length = count;  	return 0; +out_nametoolong: +	dprintk("NFS: returned filename too long: %u\n", count); +	return -ENAMETOOLONG; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO;  }  /* - * Encode create arguments - * CREATE, MKDIR + * 2.3.8.  
path + * + *	typedef string path<MAXPATHLEN>;   */ -static int -nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args) +static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)  { -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_array(p, args->name, args->len); -	p = xdr_encode_sattr(p, args->sattr); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +	__be32 *p; + +	p = xdr_reserve_space(xdr, 4); +	*p = cpu_to_be32(length); +	xdr_write_pages(xdr, pages, 0, length); +} + +static int decode_path(struct xdr_stream *xdr) +{ +	u32 length, recvd; +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	length = be32_to_cpup(p); +	if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) +		goto out_size; +	recvd = xdr_read_pages(xdr, length); +	if (unlikely(length > recvd)) +		goto out_cheating; +	xdr_terminate_string(xdr->buf, length);  	return 0; +out_size: +	dprintk("NFS: returned pathname too long: %u\n", length); +	return -ENAMETOOLONG; +out_cheating: +	dprintk("NFS: server cheating in pathname result: " +		"length %u > received %u\n", length, recvd); +	return -EIO; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO;  }  /* - * Encode RENAME arguments + * 2.3.9.  
attrstat + * + *	union attrstat switch (stat status) { + *	case NFS_OK: + *		fattr attributes; + *	default: + *		void; + *	};   */ -static int -nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)  { -	p = xdr_encode_fhandle(p, args->old_dir); -	p = xdr_encode_array(p, args->old_name->name, args->old_name->len); -	p = xdr_encode_fhandle(p, args->new_dir); -	p = xdr_encode_array(p, args->new_name->name, args->new_name->len); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	enum nfs_stat status; +	int error; + +	error = decode_stat(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS_OK) +		goto out_default; +	error = decode_fattr(xdr, result); +out: +	return error; +out_default: +	return nfs_stat_to_errno(status);  }  /* - * Encode LINK arguments + * 2.3.10.  diropargs + * + *	struct diropargs { + *		fhandle  dir; + *		filename name; + *	};   */ -static int -nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args) +static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, +			     const char *name, u32 length)  { -	p = xdr_encode_fhandle(p, args->fromfh); -	p = xdr_encode_fhandle(p, args->tofh); -	p = xdr_encode_array(p, args->toname, args->tolen); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	encode_fhandle(xdr, fh); +	encode_filename(xdr, name, length);  }  /* - * Encode SYMLINK arguments + * 2.3.11.  
diropres + * + *	union diropres switch (stat status) { + *	case NFS_OK: + *		struct { + *			fhandle file; + *			fattr   attributes; + *		} diropok; + *	default: + *		void; + *	};   */ -static int -nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args) +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result)  { -	struct xdr_buf *sndbuf = &req->rq_snd_buf; -	size_t pad; +	int error; + +	error = decode_fhandle(xdr, result->fh); +	if (unlikely(error)) +		goto out; +	error = decode_fattr(xdr, result->fattr); +out: +	return error; +} -	p = xdr_encode_fhandle(p, args->fromfh); -	p = xdr_encode_array(p, args->fromname, args->fromlen); -	*p++ = htonl(args->pathlen); -	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_stat(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS_OK) +		goto out_default; +	error = decode_diropok(xdr, result); +out: +	return error; +out_default: +	return nfs_stat_to_errno(status); +} -	xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen); -	/* -	 * xdr_encode_pages may have added a few bytes to ensure the -	 * pathname ends on a 4-byte boundary.  Start encoding the -	 * attributes after the pad bytes. -	 */ -	pad = sndbuf->tail->iov_len; -	if (pad > 0) -		p++; -	p = xdr_encode_sattr(p, args->sattr); -	sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad; -	return 0; +/* + * NFSv2 XDR encode functions + * + * NFSv2 argument types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". + */ + +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, +				 struct xdr_stream *xdr, +				 const struct nfs_fh *fh) +{ +	encode_fhandle(xdr, fh);  }  /* - * Encode arguments to readdir call + * 2.2.3.  
sattrargs + * + *	struct sattrargs { + *		fhandle file; + *		sattr attributes; + *	};   */ -static int -nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   const struct nfs_sattrargs *args)  { -	struct rpc_auth	*auth = req->rq_cred->cr_auth; -	unsigned int replen; -	u32 count = args->count; +	encode_fhandle(xdr, args->fh); +	encode_sattr(xdr, args->sattr); +} -	p = xdr_encode_fhandle(p, args->fh); -	*p++ = htonl(args->cookie); -	*p++ = htonl(count); /* see above */ -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   const struct nfs_diropargs *args) +{ +	encode_diropargs(xdr, args->fh, args->name, args->len); +} -	/* Inline the page array */ -	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2; -	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); -	return 0; +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      const struct nfs_readlinkargs *args) +{ +	encode_fhandle(xdr, args->fh); +	prepare_reply_buffer(req, args->pages, args->pgbase, +					args->pglen, NFS_readlinkres_sz);  }  /* - * Decode the result of a readdir call. - * We're not really decoding anymore, we just leave the buffer untouched - * and only check that it is syntactically correct. - * The real decoding happens in nfs_decode_entry below, called directly - * from nfs_readdir for each entry. + * 2.2.7.  
readargs + * + *	struct readargs { + *		fhandle file; + *		unsigned offset; + *		unsigned count; + *		unsigned totalcount; + *	};   */ -static int -nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) +static void encode_readargs(struct xdr_stream *xdr, +			    const struct nfs_pgio_args *args)  { -	struct xdr_buf *rcvbuf = &req->rq_rcv_buf; -	struct kvec *iov = rcvbuf->head; -	struct page **page; -	size_t hdrlen; -	unsigned int pglen, recvd; -	int status, nr = 0; - -	if ((status = ntohl(*p++))) -		return nfs_stat_to_errno(status); +	u32 offset = args->offset; +	u32 count = args->count; +	__be32 *p; -	hdrlen = (u8 *) p - (u8 *) iov->iov_base; -	if (iov->iov_len < hdrlen) { -		dprintk("NFS: READDIR reply header overflowed:" -				"length %Zu > %Zu\n", hdrlen, iov->iov_len); -		return -errno_NFSERR_IO; -	} else if (iov->iov_len != hdrlen) { -		dprintk("NFS: READDIR header is short. iovec will be shifted.\n"); -		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); -	} +	encode_fhandle(xdr, args->fh); -	pglen = rcvbuf->page_len; -	recvd = rcvbuf->len - hdrlen; -	if (pglen > recvd) -		pglen = recvd; -	page = rcvbuf->pages; -	return nr; +	p = xdr_reserve_space(xdr, 4 + 4 + 4); +	*p++ = cpu_to_be32(offset); +	*p++ = cpu_to_be32(count); +	*p = cpu_to_be32(count);  } -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, +				  struct xdr_stream *xdr, +				  const struct nfs_pgio_args *args)  { -	dprintk("nfs: %s: prematurely hit end of receive buffer. " -		"Remaining buffer length is %tu words.\n", -		func, xdr->end - xdr->p); +	encode_readargs(xdr, args); +	prepare_reply_buffer(req, args->pages, args->pgbase, +					args->count, NFS_readres_sz); +	req->rq_rcv_buf.flags |= XDRBUF_READ;  } -__be32 * -nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) +/* + * 2.2.9.  
writeargs + * + *	struct writeargs { + *		fhandle file; + *		unsigned beginoffset; + *		unsigned offset; + *		unsigned totalcount; + *		nfsdata data; + *	}; + */ +static void encode_writeargs(struct xdr_stream *xdr, +			     const struct nfs_pgio_args *args)  { +	u32 offset = args->offset; +	u32 count = args->count;  	__be32 *p; -	p = xdr_inline_decode(xdr, 4); -	if (unlikely(!p)) -		goto out_overflow; -	if (!ntohl(*p++)) { -		p = xdr_inline_decode(xdr, 4); -		if (unlikely(!p)) -			goto out_overflow; -		if (!ntohl(*p++)) -			return ERR_PTR(-EAGAIN); -		entry->eof = 1; -		return ERR_PTR(-EBADCOOKIE); -	} -	p = xdr_inline_decode(xdr, 8); -	if (unlikely(!p)) -		goto out_overflow; +	encode_fhandle(xdr, args->fh); -	entry->ino	  = ntohl(*p++); -	entry->len	  = ntohl(*p++); +	p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); +	*p++ = cpu_to_be32(offset); +	*p++ = cpu_to_be32(offset); +	*p++ = cpu_to_be32(count); -	p = xdr_inline_decode(xdr, entry->len + 4); -	if (unlikely(!p)) -		goto out_overflow; -	entry->name	  = (const char *) p; -	p		 += XDR_QUADLEN(entry->len); -	entry->prev_cookie	  = entry->cookie; -	entry->cookie	  = ntohl(*p++); - -	p = xdr_inline_peek(xdr, 8); -	if (p != NULL) -		entry->eof = !p[0] && p[1]; -	else -		entry->eof = 0; - -	return p; +	/* nfsdata */ +	*p = cpu_to_be32(count); +	xdr_write_pages(xdr, args->pages, args->pgbase, count); +} -out_overflow: -	print_overflow_msg(__func__, xdr); -	return ERR_PTR(-EIO); +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   const struct nfs_pgio_args *args) +{ +	encode_writeargs(xdr, args); +	xdr->buf->flags |= XDRBUF_WRITE;  }  /* - * NFS XDR decode functions + * 2.2.10.  
createargs + * + *	struct createargs { + *		diropargs where; + *		sattr attributes; + *	};   */ +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    const struct nfs_createargs *args) +{ +	encode_diropargs(xdr, args->fh, args->name, args->len); +	encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    const struct nfs_removeargs *args) +{ +	encode_diropargs(xdr, args->fh, args->name.name, args->name.len); +} +  /* - * Decode simple status reply + * 2.2.12.  renameargs + * + *	struct renameargs { + *		diropargs from; + *		diropargs to; + *	};   */ -static int -nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy) +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    const struct nfs_renameargs *args)  { -	int	status; +	const struct qstr *old = args->old_name; +	const struct qstr *new = args->new_name; -	if ((status = ntohl(*p++)) != 0) -		status = nfs_stat_to_errno(status); -	return status; +	encode_diropargs(xdr, args->old_dir, old->name, old->len); +	encode_diropargs(xdr, args->new_dir, new->name, new->len);  }  /* - * Decode attrstat reply - * GETATTR, SETATTR, WRITE + * 2.2.13.  linkargs + * + *	struct linkargs { + *		fhandle from; + *		diropargs to; + *	};   */ -static int -nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, +				  struct xdr_stream *xdr, +				  const struct nfs_linkargs *args)  { -	int	status; +	encode_fhandle(xdr, args->fromfh); +	encode_diropargs(xdr, args->tofh, args->toname, args->tolen); +} -	if ((status = ntohl(*p++))) -		return nfs_stat_to_errno(status); -	xdr_decode_fattr(p, fattr); -	return 0; +/* + * 2.2.14.  
symlinkargs + * + *	struct symlinkargs { + *		diropargs from; + *		path to; + *		sattr attributes; + *	}; + */ +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs_symlinkargs *args) +{ +	encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); +	encode_path(xdr, args->pages, args->pathlen); +	encode_sattr(xdr, args->sattr);  }  /* - * Decode diropres reply - * LOOKUP, CREATE, MKDIR + * 2.2.17.  readdirargs + * + *	struct readdirargs { + *		fhandle dir; + *		nfscookie cookie; + *		unsigned count; + *	};   */ -static int -nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) +static void encode_readdirargs(struct xdr_stream *xdr, +			       const struct nfs_readdirargs *args)  { -	int	status; +	__be32 *p; -	if ((status = ntohl(*p++))) -		return nfs_stat_to_errno(status); -	p = xdr_decode_fhandle(p, res->fh); -	xdr_decode_fattr(p, res->fattr); -	return 0; +	encode_fhandle(xdr, args->fh); + +	p = xdr_reserve_space(xdr, 4 + 4); +	*p++ = cpu_to_be32(args->cookie); +	*p = cpu_to_be32(args->count); +} + +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs_readdirargs *args) +{ +	encode_readdirargs(xdr, args); +	prepare_reply_buffer(req, args->pages, 0, +					args->count, NFS_readdirres_sz);  }  /* - * Encode READLINK args + * NFSv2 XDR decode functions + * + * NFSv2 result types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification".   
*/ -static int -nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) + +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, +			     void *__unused)  { -	struct rpc_auth	*auth = req->rq_cred->cr_auth; -	unsigned int replen; +	enum nfs_stat status; +	int error; + +	error = decode_stat(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS_OK) +		goto out_default; +out: +	return error; +out_default: +	return nfs_stat_to_errno(status); +} -	p = xdr_encode_fhandle(p, args->fh); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, +				 struct nfs_fattr *result) +{ +	return decode_attrstat(xdr, result); +} -	/* Inline the page array */ -	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2; -	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); -	return 0; +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, +				 struct nfs_diropok *result) +{ +	return decode_diropres(xdr, result); +} + +/* + * 2.2.6.  readlinkres + * + *	union readlinkres switch (stat status) { + *	case NFS_OK: + *		path data; + *	default: + *		void; + *	}; + */ +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, +				    struct xdr_stream *xdr, void *__unused) +{ +	enum nfs_stat status; +	int error; + +	error = decode_stat(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS_OK) +		goto out_default; +	error = decode_path(xdr); +out: +	return error; +out_default: +	return nfs_stat_to_errno(status);  }  /* - * Decode READLINK reply + * 2.2.7.  
readres + * + *	union readres switch (stat status) { + *	case NFS_OK: + *		fattr attributes; + *		nfsdata data; + *	default: + *		void; + *	};   */ -static int -nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, +				struct nfs_pgio_res *result)  { -	struct xdr_buf *rcvbuf = &req->rq_rcv_buf; -	struct kvec *iov = rcvbuf->head; -	size_t hdrlen; -	u32 len, recvd; -	int	status; +	enum nfs_stat status; +	int error; + +	error = decode_stat(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS_OK) +		goto out_default; +	error = decode_fattr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	error = decode_nfsdata(xdr, result); +out: +	return error; +out_default: +	return nfs_stat_to_errno(status); +} -	if ((status = ntohl(*p++))) -		return nfs_stat_to_errno(status); -	/* Convert length of symlink */ -	len = ntohl(*p++); -	if (len >= rcvbuf->page_len) { -		dprintk("nfs: server returned giant symlink!\n"); -		return -ENAMETOOLONG; -	} -	hdrlen = (u8 *) p - (u8 *) iov->iov_base; -	if (iov->iov_len < hdrlen) { -		dprintk("NFS: READLINK reply header overflowed:" -				"length %Zu > %Zu\n", hdrlen, iov->iov_len); -		return -errno_NFSERR_IO; -	} else if (iov->iov_len != hdrlen) { -		dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); -		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); -	} -	recvd = req->rq_rcv_buf.len - hdrlen; -	if (recvd < len) { -		dprintk("NFS: server cheating in readlink reply: " -				"count %u > recvd %u\n", len, recvd); -		return -EIO; +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, +				 struct nfs_pgio_res *result) +{ +	/* All NFSv2 writes are "file sync" writes */ +	result->verf->committed = NFS_FILE_SYNC; +	return decode_attrstat(xdr, result->fattr); +} + +/** + * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in + *                      the local page cache. 
+ * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 2.2.17.  entry + * + *	struct entry { + *		unsigned	fileid; + *		filename	name; + *		nfscookie	cookie; + *		entry		*nextentry; + *	}; + */ +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, +		       int plus) +{ +	__be32 *p; +	int error; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	if (*p++ == xdr_zero) { +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(p == NULL)) +			goto out_overflow; +		if (*p++ == xdr_zero) +			return -EAGAIN; +		entry->eof = 1; +		return -EBADCOOKIE;  	} -	xdr_terminate_string(rcvbuf, len); +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	entry->ino = be32_to_cpup(p); + +	error = decode_filename_inline(xdr, &entry->name, &entry->len); +	if (unlikely(error)) +		return error; + +	/* +	 * The type (size and byte order) of nfscookie isn't defined in +	 * RFC 1094.  This implementation assumes that it's an XDR uint32. +	 */ +	entry->prev_cookie = entry->cookie; +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	entry->cookie = be32_to_cpup(p); + +	entry->d_type = DT_UNKNOWN; +  	return 0; + +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EAGAIN;  }  /* - * Decode WRITE reply + * 2.2.17.  readdirres + * + *	union readdirres switch (stat status) { + *	case NFS_OK: + *		struct { + *			entry *entries; + *			bool eof; + *		} readdirok; + *	default: + *		void; + *	}; + * + * Read the directory contents into the page cache, but don't + * touch them.  
The actual decoding is done by nfs2_decode_dirent() + * during subsequent nfs_readdir() calls.   */ -static int -nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) +static int decode_readdirok(struct xdr_stream *xdr)  { -	res->verf->committed = NFS_FILE_SYNC; -	return nfs_xdr_attrstat(req, p, res->fattr); +	return xdr_read_pages(xdr, xdr->buf->page_len); +} + +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, +				   struct xdr_stream *xdr, void *__unused) +{ +	enum nfs_stat status; +	int error; + +	error = decode_stat(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS_OK) +		goto out_default; +	error = decode_readdirok(xdr); +out: +	return error; +out_default: +	return nfs_stat_to_errno(status);  }  /* - * Decode STATFS reply + * 2.2.18.  statfsres + * + *	union statfsres (stat status) { + *	case NFS_OK: + *		struct { + *			unsigned tsize; + *			unsigned bsize; + *			unsigned blocks; + *			unsigned bfree; + *			unsigned bavail; + *		} info; + *	default: + *		void; + *	};   */ -static int -nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res) +static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result)  { -	int	status; - -	if ((status = ntohl(*p++))) -		return nfs_stat_to_errno(status); +	__be32 *p; -	res->tsize  = ntohl(*p++); -	res->bsize  = ntohl(*p++); -	res->blocks = ntohl(*p++); -	res->bfree  = ntohl(*p++); -	res->bavail = ntohl(*p++); +	p = xdr_inline_decode(xdr, NFS_info_sz << 2); +	if (unlikely(p == NULL)) +		goto out_overflow; +	result->tsize  = be32_to_cpup(p++); +	result->bsize  = be32_to_cpup(p++); +	result->blocks = be32_to_cpup(p++); +	result->bfree  = be32_to_cpup(p++); +	result->bavail = be32_to_cpup(p);  	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO;  } +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, +				  struct nfs2_fsstat *result) +{ +	enum nfs_stat status; +	int error; + +	error = 
decode_stat(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS_OK) +		goto out_default; +	error = decode_info(xdr, result); +out: +	return error; +out_default: +	return nfs_stat_to_errno(status); +} + +  /*   * We need to translate between nfs status return values and   * the local errno values which may not be the same.   */ -static struct { +static const struct {  	int stat;  	int errno;  } nfs_errtbl[] = { @@ -676,28 +1092,30 @@ static struct {  	{ -1,			-EIO		}  }; -/* - * Convert an NFS error code to a local one. - * This one is used jointly by NFSv2 and NFSv3. +/** + * nfs_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized.  This function is used jointly by NFSv2 and NFSv3.   */ -int -nfs_stat_to_errno(int stat) +static int nfs_stat_to_errno(enum nfs_stat status)  {  	int i;  	for (i = 0; nfs_errtbl[i].stat != -1; i++) { -		if (nfs_errtbl[i].stat == stat) +		if (nfs_errtbl[i].stat == (int)status)  			return nfs_errtbl[i].errno;  	} -	dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); +	dprintk("NFS: Unrecognized nfs status value: %u\n", status);  	return nfs_errtbl[i].errno;  }  #define PROC(proc, argtype, restype, timer)				\  [NFSPROC_##proc] = {							\  	.p_proc	    =  NFSPROC_##proc,					\ -	.p_encode   =  (kxdrproc_t) nfs_xdr_##argtype,			\ -	.p_decode   =  (kxdrproc_t) nfs_xdr_##restype,			\ +	.p_encode   =  (kxdreproc_t)nfs2_xdr_enc_##argtype,		\ +	.p_decode   =  (kxdrdproc_t)nfs2_xdr_dec_##restype,		\  	.p_arglen   =  NFS_##argtype##_sz,				\  	.p_replen   =  NFS_##restype##_sz,				\  	.p_timer    =  timer,						\ @@ -705,24 +1123,24 @@ nfs_stat_to_errno(int stat)  	.p_name     =  #proc,						\  	}  struct rpc_procinfo	nfs_procedures[] = { -    PROC(GETATTR,	fhandle,	attrstat, 1), -    PROC(SETATTR,	sattrargs,	attrstat, 0), -    PROC(LOOKUP,	diropargs,	diropres, 2), -    
PROC(READLINK,	readlinkargs,	readlinkres, 3), -    PROC(READ,		readargs,	readres, 3), -    PROC(WRITE,		writeargs,	writeres, 4), -    PROC(CREATE,	createargs,	diropres, 0), -    PROC(REMOVE,	removeargs,	stat, 0), -    PROC(RENAME,	renameargs,	stat, 0), -    PROC(LINK,		linkargs,	stat, 0), -    PROC(SYMLINK,	symlinkargs,	stat, 0), -    PROC(MKDIR,		createargs,	diropres, 0), -    PROC(RMDIR,		diropargs,	stat, 0), -    PROC(READDIR,	readdirargs,	readdirres, 3), -    PROC(STATFS,	fhandle,	statfsres, 0), +	PROC(GETATTR,	fhandle,	attrstat,	1), +	PROC(SETATTR,	sattrargs,	attrstat,	0), +	PROC(LOOKUP,	diropargs,	diropres,	2), +	PROC(READLINK,	readlinkargs,	readlinkres,	3), +	PROC(READ,	readargs,	readres,	3), +	PROC(WRITE,	writeargs,	writeres,	4), +	PROC(CREATE,	createargs,	diropres,	0), +	PROC(REMOVE,	removeargs,	stat,		0), +	PROC(RENAME,	renameargs,	stat,		0), +	PROC(LINK,	linkargs,	stat,		0), +	PROC(SYMLINK,	symlinkargs,	stat,		0), +	PROC(MKDIR,	createargs,	diropres,	0), +	PROC(RMDIR,	diropargs,	stat,		0), +	PROC(READDIR,	readdirargs,	readdirres,	3), +	PROC(STATFS,	fhandle,	statfsres,	0),  }; -struct rpc_version		nfs_version2 = { +const struct rpc_version nfs_version2 = {  	.number			= 2,  	.nrprocs		= ARRAY_SIZE(nfs_procedures),  	.procs			= nfs_procedures diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9f88c5f4c7e..8f854dde415 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -10,179 +10,7 @@  #define NFSDBG_FACILITY	NFSDBG_PROC -ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ -	struct inode *inode = dentry->d_inode; -	struct posix_acl *acl; -	int pos=0, len=0; - -#	define output(s) do {						\ -			if (pos + sizeof(s) <= size) {			\ -				memcpy(buffer + pos, s, sizeof(s));	\ -				pos += sizeof(s);			\ -			}						\ -			len += sizeof(s);				\ -		} while(0) - -	acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); -	if (IS_ERR(acl)) -		return PTR_ERR(acl); -	if (acl) { -		output("system.posix_acl_access"); -		posix_acl_release(acl); -	} - 
-	if (S_ISDIR(inode->i_mode)) { -		acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); -		if (IS_ERR(acl)) -			return PTR_ERR(acl); -		if (acl) { -			output("system.posix_acl_default"); -			posix_acl_release(acl); -		} -	} - -#	undef output - -	if (!buffer || len <= size) -		return len; -	return -ERANGE; -} - -ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, -		void *buffer, size_t size) -{ -	struct inode *inode = dentry->d_inode; -	struct posix_acl *acl; -	int type, error = 0; - -	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) -		type = ACL_TYPE_ACCESS; -	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) -		type = ACL_TYPE_DEFAULT; -	else -		return -EOPNOTSUPP; - -	acl = nfs3_proc_getacl(inode, type); -	if (IS_ERR(acl)) -		return PTR_ERR(acl); -	else if (acl) { -		if (type == ACL_TYPE_ACCESS && acl->a_count == 0) -			error = -ENODATA; -		else -			error = posix_acl_to_xattr(acl, buffer, size); -		posix_acl_release(acl); -	} else -		error = -ENODATA; - -	return error; -} - -int nfs3_setxattr(struct dentry *dentry, const char *name, -	     const void *value, size_t size, int flags) -{ -	struct inode *inode = dentry->d_inode; -	struct posix_acl *acl; -	int type, error; - -	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) -		type = ACL_TYPE_ACCESS; -	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) -		type = ACL_TYPE_DEFAULT; -	else -		return -EOPNOTSUPP; - -	acl = posix_acl_from_xattr(value, size); -	if (IS_ERR(acl)) -		return PTR_ERR(acl); -	error = nfs3_proc_setacl(inode, type, acl); -	posix_acl_release(acl); - -	return error; -} - -int nfs3_removexattr(struct dentry *dentry, const char *name) -{ -	struct inode *inode = dentry->d_inode; -	int type; - -	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) -		type = ACL_TYPE_ACCESS; -	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) -		type = ACL_TYPE_DEFAULT; -	else -		return -EOPNOTSUPP; - -	return nfs3_proc_setacl(inode, type, NULL); -} - -static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) 
-{ -	if (!IS_ERR(nfsi->acl_access)) { -		posix_acl_release(nfsi->acl_access); -		nfsi->acl_access = ERR_PTR(-EAGAIN); -	} -	if (!IS_ERR(nfsi->acl_default)) { -		posix_acl_release(nfsi->acl_default); -		nfsi->acl_default = ERR_PTR(-EAGAIN); -	} -} - -void nfs3_forget_cached_acls(struct inode *inode) -{ -	dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, -		inode->i_ino); -	spin_lock(&inode->i_lock); -	__nfs3_forget_cached_acls(NFS_I(inode)); -	spin_unlock(&inode->i_lock); -} - -static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) -{ -	struct nfs_inode *nfsi = NFS_I(inode); -	struct posix_acl *acl = ERR_PTR(-EINVAL); - -	spin_lock(&inode->i_lock); -	switch(type) { -		case ACL_TYPE_ACCESS: -			acl = nfsi->acl_access; -			break; - -		case ACL_TYPE_DEFAULT: -			acl = nfsi->acl_default; -			break; - -		default: -			goto out; -	} -	if (IS_ERR(acl)) -		acl = ERR_PTR(-EAGAIN); -	else -		acl = posix_acl_dup(acl); -out: -	spin_unlock(&inode->i_lock); -	dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, -		inode->i_ino, type, acl); -	return acl; -} - -static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, -		    struct posix_acl *dfacl) -{ -	struct nfs_inode *nfsi = NFS_I(inode); - -	dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, -		inode->i_ino, acl, dfacl); -	spin_lock(&inode->i_lock); -	__nfs3_forget_cached_acls(NFS_I(inode)); -	if (!IS_ERR(acl)) -		nfsi->acl_access = posix_acl_dup(acl); -	if (!IS_ERR(dfacl)) -		nfsi->acl_default = posix_acl_dup(dfacl); -	spin_unlock(&inode->i_lock); -} - -struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type)  {  	struct nfs_server *server = NFS_SERVER(inode);  	struct page *pages[NFSACL_MAXPAGES] = { }; @@ -192,13 +20,12 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)  		.pages = pages,  	};  	struct nfs3_getaclres res = { -		0 +		NULL,  	};  	
struct rpc_message msg = {  		.rpc_argp	= &args,  		.rpc_resp	= &res,  	}; -	struct posix_acl *acl;  	int status, count;  	if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)  	status = nfs_revalidate_inode(server, inode);  	if (status < 0)  		return ERR_PTR(status); -	acl = nfs3_get_cached_acl(inode, type); -	if (acl != ERR_PTR(-EAGAIN)) -		return acl; -	acl = NULL;  	/*  	 * Only get the access acl when explicitly requested: We don't @@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)  	}  	if (res.acl_access != NULL) { -		if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { +		if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) || +		    res.acl_access->a_count == 0) {  			posix_acl_release(res.acl_access);  			res.acl_access = NULL;  		}  	} -	nfs3_cache_acls(inode, -		(res.mask & NFS_ACL)   ? res.acl_access  : ERR_PTR(-EINVAL), -		(res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL)); -	switch(type) { -		case ACL_TYPE_ACCESS: -			acl = res.acl_access; -			res.acl_access = NULL; -			break; +	if (res.mask & NFS_ACL) +		set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); +	else +		forget_cached_acl(inode, ACL_TYPE_ACCESS); -		case ACL_TYPE_DEFAULT: -			acl = res.acl_default; -			res.acl_default = NULL; +	if (res.mask & NFS_DFACL) +		set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); +	else +		forget_cached_acl(inode, ACL_TYPE_DEFAULT); + +	nfs_free_fattr(res.fattr); +	if (type == ACL_TYPE_ACCESS) { +		posix_acl_release(res.acl_default); +		return res.acl_access; +	} else { +		posix_acl_release(res.acl_access); +		return res.acl_default;  	}  getout:  	posix_acl_release(res.acl_access);  	posix_acl_release(res.acl_default);  	nfs_free_fattr(res.fattr); - -	if (status != 0) { -		posix_acl_release(acl); -		acl = ERR_PTR(status); -	} -	return acl; +	return ERR_PTR(status);  } -static int nfs3_proc_setacls(struct inode *inode, struct 
posix_acl *acl, -		  struct posix_acl *dfacl) +static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, +		struct posix_acl *dfacl)  {  	struct nfs_server *server = NFS_SERVER(inode);  	struct nfs_fattr *fattr; @@ -311,8 +135,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,  	if (!nfs_server_capable(inode, NFS_CAP_ACLS))  		goto out; -	/* We are doing this here, because XDR marshalling can only -	   return -ENOMEM. */ +	/* We are doing this here because XDR marshalling does not +	 * return any results, it BUGs. */  	status = -ENOSPC;  	if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)  		goto out; @@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,  	switch (status) {  		case 0:  			status = nfs_refresh_inode(inode, fattr); -			nfs3_cache_acls(inode, acl, dfacl); +			set_cached_acl(inode, ACL_TYPE_ACCESS, acl); +			set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);  			break;  		case -EPFNOSUPPORT:  		case -EPROTONOSUPPORT: @@ -373,40 +198,43 @@ out:  	return status;  } -int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, +		struct posix_acl *dfacl) +{ +	int ret; +	ret = __nfs3_proc_setacls(inode, acl, dfacl); +	return (ret == -EOPNOTSUPP) ? 
0 : ret; + +} + +int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)  {  	struct posix_acl *alloc = NULL, *dfacl = NULL;  	int status;  	if (S_ISDIR(inode->i_mode)) {  		switch(type) { -			case ACL_TYPE_ACCESS: -				alloc = dfacl = nfs3_proc_getacl(inode, -						ACL_TYPE_DEFAULT); -				if (IS_ERR(alloc)) -					goto fail; -				break; - -			case ACL_TYPE_DEFAULT: -				dfacl = acl; -				alloc = acl = nfs3_proc_getacl(inode, -						ACL_TYPE_ACCESS); -				if (IS_ERR(alloc)) -					goto fail; -				break; - -			default: -				return -EINVAL; +		case ACL_TYPE_ACCESS: +			alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); +			if (IS_ERR(alloc)) +				goto fail; +			break; + +		case ACL_TYPE_DEFAULT: +			dfacl = acl; +			alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); +			if (IS_ERR(alloc)) +				goto fail; +			break;  		} -	} else if (type != ACL_TYPE_ACCESS) -			return -EINVAL; +	}  	if (acl == NULL) {  		alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);  		if (IS_ERR(alloc))  			goto fail;  	} -	status = nfs3_proc_setacls(inode, acl, dfacl); +	status = __nfs3_proc_setacls(inode, acl, dfacl);  	posix_acl_release(alloc);  	return status; @@ -414,31 +242,51 @@ fail:  	return PTR_ERR(alloc);  } -int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, -		mode_t mode) +const struct xattr_handler *nfs3_xattr_handlers[] = { +	&posix_acl_access_xattr_handler, +	&posix_acl_default_xattr_handler, +	NULL, +}; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, +		size_t size, ssize_t *result)  { -	struct posix_acl *dfacl, *acl; -	int error = 0; +	struct posix_acl *acl; +	char *p = data + *result; -	dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); -	if (IS_ERR(dfacl)) { -		error = PTR_ERR(dfacl); -		return (error == -EOPNOTSUPP) ? 
0 : error; -	} -	if (!dfacl) -		return 0; -	acl = posix_acl_clone(dfacl, GFP_KERNEL); -	error = -ENOMEM; +	acl = get_acl(inode, type);  	if (!acl) -		goto out_release_dfacl; -	error = posix_acl_create_masq(acl, &mode); -	if (error < 0) -		goto out_release_acl; -	error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? -						      dfacl : NULL); -out_release_acl: +		return 0; +  	posix_acl_release(acl); -out_release_dfacl: -	posix_acl_release(dfacl); -	return error; + +	*result += strlen(name); +	*result += 1; +	if (!size) +		return 0; +	if (*result > size) +		return -ERANGE; + +	strcpy(p, name); +	return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ +	struct inode *inode = dentry->d_inode; +	ssize_t result = 0; +	int error; + +	error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, +			POSIX_ACL_XATTR_ACCESS, data, size, &result); +	if (error) +		return error; + +	error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, +			POSIX_ACL_XATTR_DEFAULT, data, size, &result); +	if (error) +		return error; +	return result;  } diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c new file mode 100644 index 00000000000..b3fc65ef39c --- /dev/null +++ b/fs/nfs/nfs3client.c @@ -0,0 +1,65 @@ +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include "internal.h" + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program }; +static const struct rpc_version *nfsacl_version[] = { +	[3]			= &nfsacl_version3, +}; + +const struct rpc_program nfsacl_program = { +	.name			= "nfsacl", +	.number			= NFS_ACL_PROGRAM, +	.nrvers			= ARRAY_SIZE(nfsacl_version), +	.version		= nfsacl_version, +	.stats			= &nfsacl_rpcstat, +}; + +/* + * Initialise an NFSv3 ACL client connection + */ +static void nfs_init_server_aclclient(struct nfs_server *server) +{ +	if (server->flags & NFS_MOUNT_NOACL) +		goto out_noacl; + +	server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); +	if (IS_ERR(server->client_acl)) +	
	goto out_noacl; + +	/* No errors! Assume that Sun nfsacls are supported */ +	server->caps |= NFS_CAP_ACLS; +	return; + +out_noacl: +	server->caps &= ~NFS_CAP_ACLS; +} +#else +static inline void nfs_init_server_aclclient(struct nfs_server *server) +{ +	server->flags &= ~NFS_MOUNT_NOACL; +	server->caps &= ~NFS_CAP_ACLS; +} +#endif + +struct nfs_server *nfs3_create_server(struct nfs_mount_info *mount_info, +				      struct nfs_subversion *nfs_mod) +{ +	struct nfs_server *server = nfs_create_server(mount_info, nfs_mod); +	/* Create a client RPC handle for the NFS v3 ACL management interface */ +	if (!IS_ERR(server)) +		nfs_init_server_aclclient(server); +	return server; +} + +struct nfs_server *nfs3_clone_server(struct nfs_server *source, +				     struct nfs_fh *fh, +				     struct nfs_fattr *fattr, +				     rpc_authflavor_t flavor) +{ +	struct nfs_server *server = nfs_clone_server(source, fh, fattr, flavor); +	if (!IS_ERR(server) && !IS_ERR(source->client_acl)) +		nfs_init_server_aclclient(server); +	return server; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ce939c062a5..f0afa291fd5 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -17,22 +17,24 @@  #include <linux/nfs_page.h>  #include <linux/lockd/bind.h>  #include <linux/nfs_mount.h> +#include <linux/freezer.h> +#include <linux/xattr.h>  #include "iostat.h"  #include "internal.h"  #define NFSDBG_FACILITY		NFSDBG_PROC -/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ +/* A wrapper to handle the EJUKEBOX error messages */  static int  nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)  {  	int res;  	do {  		res = rpc_call_sync(clnt, msg, flags); -		if (res != -EJUKEBOX && res != -EKEYEXPIRED) +		if (res != -EJUKEBOX)  			break; -		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); +		freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);  		res = -ERESTARTSYS;  	} while (!fatal_signal_pending(current));  	return res; @@ -43,7 
+45,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)  static int  nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)  { -	if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) +	if (task->tk_status != -EJUKEBOX)  		return 0;  	if (task->tk_status == -EJUKEBOX)  		nfs_inc_stats(inode, NFSIOS_DELAY); @@ -68,7 +70,7 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,  	nfs_fattr_init(info->fattr);  	status = rpc_call_sync(client, &msg, 0);  	dprintk("%s: reply fsinfo: %d\n", __func__, status); -	if (!(info->fattr->valid & NFS_ATTR_FATTR)) { +	if (status == 0 && !(info->fattr->valid & NFS_ATTR_FATTR)) {  		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];  		msg.rpc_resp = info->fattr;  		status = rpc_call_sync(client, &msg, 0); @@ -97,7 +99,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,   */  static int  nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, -		struct nfs_fattr *fattr) +		struct nfs_fattr *fattr, struct nfs4_label *label)  {  	struct rpc_message msg = {  		.rpc_proc	= &nfs3_procedures[NFS3PROC_GETATTR], @@ -142,7 +144,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,  static int  nfs3_proc_lookup(struct inode *dir, struct qstr *name, -		 struct nfs_fh *fhandle, struct nfs_fattr *fattr) +		 struct nfs_fh *fhandle, struct nfs_fattr *fattr, +		 struct nfs4_label *label)  {  	struct nfs3_diropargs	arg = {  		.fh		= NFS_FH(dir), @@ -299,7 +302,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_  	status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);  	nfs_post_op_update_inode(dir, data->res.dir_attr);  	if (status == 0) -		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); +		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);  	return status;  } @@ -313,13 +316,13 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)   */  static 
int  nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, -		 int flags, struct nfs_open_context *ctx) +		 int flags)  { +	struct posix_acl *default_acl, *acl;  	struct nfs3_createdata *data; -	mode_t mode = sattr->ia_mode;  	int status = -ENOMEM; -	dprintk("NFS call  create %s\n", dentry->d_name.name); +	dprintk("NFS call  create %pd\n", dentry);  	data = nfs3_alloc_createdata();  	if (data == NULL) @@ -334,11 +337,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	data->arg.create.createmode = NFS3_CREATE_UNCHECKED;  	if (flags & O_EXCL) {  		data->arg.create.createmode  = NFS3_CREATE_EXCLUSIVE; -		data->arg.create.verifier[0] = jiffies; -		data->arg.create.verifier[1] = current->pid; +		data->arg.create.verifier[0] = cpu_to_be32(jiffies); +		data->arg.create.verifier[1] = cpu_to_be32(current->pid);  	} -	sattr->ia_mode &= ~current_umask(); +	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); +	if (status) +		goto out;  	for (;;) {  		status = nfs3_do_create(dir, dentry, data); @@ -364,7 +369,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	}  	if (status != 0) -		goto out; +		goto out_release_acls;  	/* When we created the file with exclusive semantics, make  	 * sure we set the attributes afterwards. 
*/ @@ -383,9 +388,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);  		dprintk("NFS reply setattr (post-create): %d\n", status);  		if (status != 0) -			goto out; +			goto out_release_acls;  	} -	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + +	status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: +	posix_acl_release(acl); +	posix_acl_release(default_acl);  out:  	nfs3_free_createdata(data);  	dprintk("NFS reply create: %d\n", status); @@ -397,8 +407,7 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)  {  	struct nfs_removeargs arg = {  		.fh = NFS_FH(dir), -		.name.len = name->len, -		.name.name = name->name, +		.name = *name,  	};  	struct nfs_removeres res;  	struct rpc_message msg = { @@ -427,6 +436,11 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];  } +static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ +	rpc_call_start(task); +} +  static int  nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)  { @@ -444,6 +458,11 @@ nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];  } +static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ +	rpc_call_start(task); +} +  static int  nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  		      struct inode *new_dir) @@ -460,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  }  static int -nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, -		 struct inode *new_dir, struct qstr *new_name) -{ -	struct nfs_renameargs	arg = { -		.old_dir	= NFS_FH(old_dir), -		.old_name	= old_name, -		.new_dir	= NFS_FH(new_dir), -		.new_name	= new_name, -	}; -	struct nfs_renameres res; -	struct 
rpc_message msg = { -		.rpc_proc	= &nfs3_procedures[NFS3PROC_RENAME], -		.rpc_argp	= &arg, -		.rpc_resp	= &res, -	}; -	int status = -ENOMEM; - -	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name); - -	res.old_fattr = nfs_alloc_fattr(); -	res.new_fattr = nfs_alloc_fattr(); -	if (res.old_fattr == NULL || res.new_fattr == NULL) -		goto out; - -	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); -	nfs_post_op_update_inode(old_dir, res.old_fattr); -	nfs_post_op_update_inode(new_dir, res.new_fattr); -out: -	nfs_free_fattr(res.old_fattr); -	nfs_free_fattr(res.new_fattr); -	dprintk("NFS reply rename: %d\n", status); -	return status; -} - -static int  nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)  {  	struct nfs3_linkargs	arg = { @@ -537,7 +521,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,  	if (len > NFS3_MAXPATHLEN)  		return -ENAMETOOLONG; -	dprintk("NFS call  symlink %s\n", dentry->d_name.name); +	dprintk("NFS call  symlink %pd\n", dentry);  	data = nfs3_alloc_createdata();  	if (data == NULL) @@ -561,18 +545,20 @@ out:  static int  nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)  { +	struct posix_acl *default_acl, *acl;  	struct nfs3_createdata *data; -	int mode = sattr->ia_mode;  	int status = -ENOMEM; -	dprintk("NFS call  mkdir %s\n", dentry->d_name.name); - -	sattr->ia_mode &= ~current_umask(); +	dprintk("NFS call  mkdir %pd\n", dentry);  	data = nfs3_alloc_createdata();  	if (data == NULL)  		goto out; +	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); +	if (status) +		goto out; +  	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];  	data->arg.mkdir.fh = NFS_FH(dir);  	data->arg.mkdir.name = dentry->d_name.name; @@ -581,9 +567,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)  	status = nfs3_do_create(dir, dentry, data);  	if (status != 0) -		goto out; +		goto out_release_acls; + +	status 
= nfs3_proc_setacls(dentry->d_inode, acl, default_acl); -	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out_release_acls: +	posix_acl_release(acl); +	posix_acl_release(default_acl);  out:  	nfs3_free_createdata(data);  	dprintk("NFS reply mkdir: %d\n", status); @@ -633,7 +623,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,  		  u64 cookie, struct page **pages, unsigned int count, int plus)  {  	struct inode		*dir = dentry->d_inode; -	__be32			*verf = NFS_COOKIEVERF(dir); +	__be32			*verf = NFS_I(dir)->cookieverf;  	struct nfs3_readdirargs	arg = {  		.fh		= NFS_FH(dir),  		.cookie		= cookie, @@ -680,19 +670,21 @@ static int  nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		dev_t rdev)  { +	struct posix_acl *default_acl, *acl;  	struct nfs3_createdata *data; -	mode_t mode = sattr->ia_mode;  	int status = -ENOMEM; -	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name, +	dprintk("NFS call  mknod %pd %u:%u\n", dentry,  			MAJOR(rdev), MINOR(rdev)); -	sattr->ia_mode &= ~current_umask(); -  	data = nfs3_alloc_createdata();  	if (data == NULL)  		goto out; +	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); +	if (status) +		goto out; +  	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];  	data->arg.mknod.fh = NFS_FH(dir);  	data->arg.mknod.name = dentry->d_name.name; @@ -720,8 +712,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	status = nfs3_do_create(dir, dentry, data);  	if (status != 0) -		goto out; -	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +		goto out_release_acls; + +	status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: +	posix_acl_release(acl); +	posix_acl_release(default_acl);  out:  	nfs3_free_createdata(data);  	dprintk("NFS reply mknod: %d\n", status); @@ -798,36 +795,51 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,  	return status;  } 
-static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)  { -	if (nfs3_async_handle_jukebox(task, data->inode)) +	struct inode *inode = data->header->inode; + +	if (nfs3_async_handle_jukebox(task, inode))  		return -EAGAIN; -	nfs_invalidate_atime(data->inode); -	nfs_refresh_inode(data->inode, &data->fattr); +	nfs_invalidate_atime(inode); +	nfs_refresh_inode(inode, &data->fattr);  	return 0;  } -static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];  } -static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)  { -	if (nfs3_async_handle_jukebox(task, data->inode)) +	rpc_call_start(task); +	return 0; +} + +static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) +{ +	struct inode *inode = data->header->inode; + +	if (nfs3_async_handle_jukebox(task, inode))  		return -EAGAIN;  	if (task->tk_status >= 0) -		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); +		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);  	return 0;  } -static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];  } -static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) +static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ +	rpc_call_start(task); +} + +static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)  {  	if (nfs3_async_handle_jukebox(task, data->inode))  		return -EAGAIN; @@ -835,7 +847,7 @@ static int 
nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)  	return 0;  } -static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)  {  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT];  } @@ -843,17 +855,68 @@ static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa  static int  nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);  } +static int nfs3_have_delegation(struct inode *inode, fmode_t flags) +{ +	return 0; +} + +static int nfs3_return_delegation(struct inode *inode) +{ +	nfs_wb_all(inode); +	return 0; +} + +static const struct inode_operations nfs3_dir_inode_operations = { +	.create		= nfs_create, +	.lookup		= nfs_lookup, +	.link		= nfs_link, +	.unlink		= nfs_unlink, +	.symlink	= nfs_symlink, +	.mkdir		= nfs_mkdir, +	.rmdir		= nfs_rmdir, +	.mknod		= nfs_mknod, +	.rename		= nfs_rename, +	.permission	= nfs_permission, +	.getattr	= nfs_getattr, +	.setattr	= nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL +	.listxattr	= nfs3_listxattr, +	.getxattr	= generic_getxattr, +	.setxattr	= generic_setxattr, +	.removexattr	= generic_removexattr, +	.get_acl	= nfs3_get_acl, +	.set_acl	= nfs3_set_acl, +#endif +}; + +static const struct inode_operations nfs3_file_inode_operations = { +	.permission	= nfs_permission, +	.getattr	= nfs_getattr, +	.setattr	= nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL +	.listxattr	= nfs3_listxattr, +	.getxattr	= generic_getxattr, +	.setxattr	= generic_setxattr, +	.removexattr	= generic_removexattr, +	.get_acl	= nfs3_get_acl, +	.set_acl	= nfs3_set_acl, +#endif +}; +  const struct nfs_rpc_ops nfs_v3_clientops = {  	.version	= 3,			/* protocol version */  	.dentry_ops	= &nfs_dentry_operations,  	.dir_inode_ops	= 
&nfs3_dir_inode_operations,  	.file_inode_ops	= &nfs3_file_inode_operations, +	.file_ops	= &nfs_file_operations,  	.getroot	= nfs3_proc_get_root, +	.submount	= nfs_submount, +	.try_mount	= nfs_try_mount,  	.getattr	= nfs3_proc_getattr,  	.setattr	= nfs3_proc_setattr,  	.lookup		= nfs3_proc_lookup, @@ -862,9 +925,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = {  	.create		= nfs3_proc_create,  	.remove		= nfs3_proc_remove,  	.unlink_setup	= nfs3_proc_unlink_setup, +	.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,  	.unlink_done	= nfs3_proc_unlink_done, -	.rename		= nfs3_proc_rename,  	.rename_setup	= nfs3_proc_rename_setup, +	.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,  	.rename_done	= nfs3_proc_rename_done,  	.link		= nfs3_proc_link,  	.symlink	= nfs3_proc_symlink, @@ -876,13 +940,22 @@ const struct nfs_rpc_ops nfs_v3_clientops = {  	.fsinfo		= nfs3_proc_fsinfo,  	.pathconf	= nfs3_proc_pathconf,  	.decode_dirent	= nfs3_decode_dirent, +	.pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,  	.read_setup	= nfs3_proc_read_setup,  	.read_done	= nfs3_read_done,  	.write_setup	= nfs3_proc_write_setup,  	.write_done	= nfs3_write_done,  	.commit_setup	= nfs3_proc_commit_setup, +	.commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,  	.commit_done	= nfs3_commit_done,  	.lock		= nfs3_proc_lock, -	.clear_acl_cache = nfs3_forget_cached_acls, +	.clear_acl_cache = forget_all_cached_acls,  	.close_context	= nfs_close_context, +	.have_delegation = nfs3_have_delegation, +	.return_delegation = nfs3_return_delegation, +	.alloc_client	= nfs_alloc_client, +	.init_client	= nfs_init_client, +	.free_client	= nfs_free_client, +	.create_server	= nfs3_create_server, +	.clone_server	= nfs3_clone_server,  }; diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c new file mode 100644 index 00000000000..d6a98949af1 --- /dev/null +++ b/fs/nfs/nfs3super.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. 
+ */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include "internal.h" +#include "nfs.h" + +static struct nfs_subversion nfs_v3 = { +	.owner = THIS_MODULE, +	.nfs_fs   = &nfs_fs_type, +	.rpc_vers = &nfs_version3, +	.rpc_ops  = &nfs_v3_clientops, +	.sops     = &nfs_sops, +#ifdef CONFIG_NFS_V3_ACL +	.xattr    = nfs3_xattr_handlers, +#endif +}; + +static int __init init_nfs_v3(void) +{ +	register_nfs_version(&nfs_v3); +	return 0; +} + +static void __exit exit_nfs_v3(void) +{ +	unregister_nfs_version(&nfs_v3); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v3); +module_exit(exit_nfs_v3); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index d9a5e832c25..8f4cbe7f4aa 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -37,18 +37,16 @@  #define NFS3_filename_sz	(1+(NFS3_MAXNAMLEN>>2))  #define NFS3_path_sz		(1+(NFS3_MAXPATHLEN>>2))  #define NFS3_fattr_sz		(21) -#define NFS3_wcc_attr_sz		(6) +#define NFS3_cookieverf_sz	(NFS3_COOKIEVERFSIZE>>2) +#define NFS3_wcc_attr_sz	(6)  #define NFS3_pre_op_attr_sz	(1+NFS3_wcc_attr_sz)  #define NFS3_post_op_attr_sz	(1+NFS3_fattr_sz) -#define NFS3_wcc_data_sz		(NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) -#define NFS3_fsstat_sz		 -#define NFS3_fsinfo_sz		 -#define NFS3_pathconf_sz		 -#define NFS3_entry_sz		(NFS3_filename_sz+3) - -#define NFS3_sattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_wcc_data_sz	(NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz)  #define NFS3_diropargs_sz	(NFS3_fh_sz+NFS3_filename_sz) -#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz) + +#define NFS3_getattrargs_sz	(NFS3_fh_sz) +#define NFS3_setattrargs_sz	(NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_lookupargs_sz	(NFS3_fh_sz+NFS3_filename_sz)  #define NFS3_accessargs_sz	(NFS3_fh_sz+1)  #define NFS3_readlinkargs_sz	(NFS3_fh_sz)  #define NFS3_readargs_sz	(NFS3_fh_sz+3) @@ -57,14 +55,16 @@  #define NFS3_mkdirargs_sz	(NFS3_diropargs_sz+NFS3_sattr_sz)  #define NFS3_symlinkargs_sz	(NFS3_diropargs_sz+1+NFS3_sattr_sz)  #define NFS3_mknodargs_sz	
(NFS3_diropargs_sz+2+NFS3_sattr_sz) +#define NFS3_removeargs_sz	(NFS3_fh_sz+NFS3_filename_sz)  #define NFS3_renameargs_sz	(NFS3_diropargs_sz+NFS3_diropargs_sz)  #define NFS3_linkargs_sz		(NFS3_fh_sz+NFS3_diropargs_sz) -#define NFS3_readdirargs_sz	(NFS3_fh_sz+2) +#define NFS3_readdirargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+3) +#define NFS3_readdirplusargs_sz	(NFS3_fh_sz+NFS3_cookieverf_sz+4)  #define NFS3_commitargs_sz	(NFS3_fh_sz+3) -#define NFS3_attrstat_sz	(1+NFS3_fattr_sz) -#define NFS3_wccstat_sz		(1+NFS3_wcc_data_sz) -#define NFS3_removeres_sz	(NFS3_wccstat_sz) +#define NFS3_getattrres_sz	(1+NFS3_fattr_sz) +#define NFS3_setattrres_sz	(1+NFS3_wcc_data_sz) +#define NFS3_removeres_sz	(NFS3_setattrres_sz)  #define NFS3_lookupres_sz	(1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))  #define NFS3_accessres_sz	(1+NFS3_post_op_attr_sz+1)  #define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1) @@ -86,6 +86,8 @@  				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))  #define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz) +static int nfs3_stat_to_errno(enum nfs_stat); +  /*   * Map file type to S_IFMT bits   */ @@ -100,1080 +102,2424 @@ static const umode_t nfs_type2fmt[] = {  	[NF3FIFO] = S_IFIFO,  }; +/* + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. + */ +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, +				 unsigned int base, unsigned int len, +				 unsigned int bufsize) +{ +	struct rpc_auth	*auth = req->rq_cred->cr_auth; +	unsigned int replen; + +	replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; +	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); +} + +/* + * Handle decode buffer overflows out-of-line. + */  static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)  { -	dprintk("nfs: %s: prematurely hit end of receive buffer. " +	dprintk("NFS: %s prematurely hit the end of our receive buffer. 
"  		"Remaining buffer length is %tu words.\n",  		func, xdr->end - xdr->p);  } +  /* - * Common NFS XDR functions as inlines + * Encode/decode NFSv3 basic data types + * + * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: + * "NFS Version 3 Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions.  For run-time efficiency, some data types are encoded + * or decoded inline.   */ -static inline __be32 * -xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh) + +static void encode_uint32(struct xdr_stream *xdr, u32 value)  { -	return xdr_encode_array(p, fh->data, fh->size); +	__be32 *p = xdr_reserve_space(xdr, 4); +	*p = cpu_to_be32(value);  } -static inline __be32 * -xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh) +static int decode_uint32(struct xdr_stream *xdr, u32 *value)  { -	if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { -		memcpy(fh->data, p, fh->size); -		return p + XDR_QUADLEN(fh->size); -	} -	return NULL; +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	*value = be32_to_cpup(p); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static int decode_uint64(struct xdr_stream *xdr, u64 *value) +{ +	__be32 *p; + +	p = xdr_inline_decode(xdr, 8); +	if (unlikely(p == NULL)) +		goto out_overflow; +	xdr_decode_hyper(p, value); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +/* + * fileid3 + * + *	typedef uint64 fileid3; + */ +static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid) +{ +	return xdr_decode_hyper(p, fileid); +} + +static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid) +{ +	return decode_uint64(xdr, fileid); +} + +/* + * filename3 + * + *	typedef string filename3<>; + */ +static void encode_filename3(struct xdr_stream *xdr, +			     const char *name, u32 length) +{ +	__be32 *p; + +	WARN_ON_ONCE(length > NFS3_MAXNAMLEN); +	p = xdr_reserve_space(xdr, 4 + length); 
+	xdr_encode_opaque(p, name, length);  } -static inline __be32 * -xdr_decode_fhandle_stream(struct xdr_stream *xdr, struct nfs_fh *fh) +static int decode_inline_filename3(struct xdr_stream *xdr, +				   const char **name, u32 *length)  {  	__be32 *p; +	u32 count; +  	p = xdr_inline_decode(xdr, 4); -	if (unlikely(!p)) +	if (unlikely(p == NULL))  		goto out_overflow; -	fh->size = ntohl(*p++); +	count = be32_to_cpup(p); +	if (count > NFS3_MAXNAMLEN) +		goto out_nametoolong; +	p = xdr_inline_decode(xdr, count); +	if (unlikely(p == NULL)) +		goto out_overflow; +	*name = (const char *)p; +	*length = count; +	return 0; -	if (fh->size <= NFS3_FHSIZE) { -		p = xdr_inline_decode(xdr, fh->size); -		if (unlikely(!p)) -			goto out_overflow; -		memcpy(fh->data, p, fh->size); -		return p + XDR_QUADLEN(fh->size); -	} -	return NULL; +out_nametoolong: +	dprintk("NFS: returned filename too long: %u\n", count); +	return -ENAMETOOLONG; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +/* + * nfspath3 + * + *	typedef string nfspath3<>; + */ +static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, +			    const u32 length) +{ +	encode_uint32(xdr, length); +	xdr_write_pages(xdr, pages, 0, length); +} +static int decode_nfspath3(struct xdr_stream *xdr) +{ +	u32 recvd, count; +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	count = be32_to_cpup(p); +	if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) +		goto out_nametoolong; +	recvd = xdr_read_pages(xdr, count); +	if (unlikely(count > recvd)) +		goto out_cheating; +	xdr_terminate_string(xdr->buf, count); +	return 0; + +out_nametoolong: +	dprintk("NFS: returned pathname too long: %u\n", count); +	return -ENAMETOOLONG; +out_cheating: +	dprintk("NFS: server cheating in pathname result: " +		"count %u > recvd %u\n", count, recvd); +	return -EIO;  out_overflow:  	print_overflow_msg(__func__, xdr); -	return ERR_PTR(-EIO); +	return -EIO;  } 
 /* - * Encode/decode time. + * cookie3 + * + *	typedef uint64 cookie3   */ -static inline __be32 * -xdr_encode_time3(__be32 *p, struct timespec *timep) +static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie)  { -	*p++ = htonl(timep->tv_sec); -	*p++ = htonl(timep->tv_nsec); -	return p; +	return xdr_encode_hyper(p, cookie);  } -static inline __be32 * -xdr_decode_time3(__be32 *p, struct timespec *timep) +static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie)  { -	timep->tv_sec = ntohl(*p++); -	timep->tv_nsec = ntohl(*p++); -	return p; +	return decode_uint64(xdr, cookie); +} + +/* + * cookieverf3 + * + *	typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE]; + */ +static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) +{ +	memcpy(p, verifier, NFS3_COOKIEVERFSIZE); +	return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); +} + +static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) +{ +	__be32 *p; + +	p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); +	if (unlikely(p == NULL)) +		goto out_overflow; +	memcpy(verifier, p, NFS3_COOKIEVERFSIZE); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +/* + * createverf3 + * + *	typedef opaque createverf3[NFS3_CREATEVERFSIZE]; + */ +static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) +{ +	__be32 *p; + +	p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE); +	memcpy(p, verifier, NFS3_CREATEVERFSIZE); +} + +static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *verifier) +{ +	__be32 *p; + +	p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); +	if (unlikely(p == NULL)) +		goto out_overflow; +	memcpy(verifier->data, p, NFS3_WRITEVERFSIZE); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +/* + * size3 + * + *	typedef uint64 size3; + */ +static __be32 *xdr_decode_size3(__be32 *p, u64 *size) +{ +	return xdr_decode_hyper(p, size); +} + +/* + * nfsstat3 + * + *	enum nfsstat3 { + *		NFS3_OK = 0, + *		
... + *	} + */ +#define NFS3_OK		NFS_OK + +static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) +{ +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	*status = be32_to_cpup(p); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +/* + * ftype3 + * + *	enum ftype3 { + *		NF3REG	= 1, + *		NF3DIR	= 2, + *		NF3BLK	= 3, + *		NF3CHR	= 4, + *		NF3LNK	= 5, + *		NF3SOCK	= 6, + *		NF3FIFO	= 7 + *	}; + */ +static void encode_ftype3(struct xdr_stream *xdr, const u32 type) +{ +	encode_uint32(xdr, type);  } -static __be32 * -xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) +static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode)  { -	unsigned int	type, major, minor; -	umode_t		fmode; +	u32 type; -	type = ntohl(*p++); +	type = be32_to_cpup(p++);  	if (type > NF3FIFO)  		type = NF3NON; -	fmode = nfs_type2fmt[type]; -	fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; -	fattr->nlink = ntohl(*p++); -	fattr->uid = ntohl(*p++); -	fattr->gid = ntohl(*p++); -	p = xdr_decode_hyper(p, &fattr->size); -	p = xdr_decode_hyper(p, &fattr->du.nfs3.used); - -	/* Turn remote device info into Linux-specific dev_t */ -	major = ntohl(*p++); -	minor = ntohl(*p++); -	fattr->rdev = MKDEV(major, minor); -	if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor) -		fattr->rdev = 0; +	*mode = nfs_type2fmt[type]; +	return p; +} -	p = xdr_decode_hyper(p, &fattr->fsid.major); -	fattr->fsid.minor = 0; -	p = xdr_decode_hyper(p, &fattr->fileid); -	p = xdr_decode_time3(p, &fattr->atime); -	p = xdr_decode_time3(p, &fattr->mtime); -	p = xdr_decode_time3(p, &fattr->ctime); +/* + * specdata3 + * + *     struct specdata3 { + *             uint32  specdata1; + *             uint32  specdata2; + *     }; + */ +static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev) +{ +	__be32 *p; -	/* Update the mode bits */ -	fattr->valid |= NFS_ATTR_FATTR_V3; +	p = xdr_reserve_space(xdr, 8); +	*p++ = 
cpu_to_be32(MAJOR(rdev)); +	*p = cpu_to_be32(MINOR(rdev)); +} + +static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev) +{ +	unsigned int major, minor; + +	major = be32_to_cpup(p++); +	minor = be32_to_cpup(p++); +	*rdev = MKDEV(major, minor); +	if (MAJOR(*rdev) != major || MINOR(*rdev) != minor) +		*rdev = 0;  	return p;  } -static inline __be32 * -xdr_encode_sattr(__be32 *p, struct iattr *attr) +/* + * nfs_fh3 + * + *	struct nfs_fh3 { + *		opaque       data<NFS3_FHSIZE>; + *	}; + */ +static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)  { +	__be32 *p; + +	WARN_ON_ONCE(fh->size > NFS3_FHSIZE); +	p = xdr_reserve_space(xdr, 4 + fh->size); +	xdr_encode_opaque(p, fh->data, fh->size); +} + +static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ +	u32 length; +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	length = be32_to_cpup(p++); +	if (unlikely(length > NFS3_FHSIZE)) +		goto out_toobig; +	p = xdr_inline_decode(xdr, length); +	if (unlikely(p == NULL)) +		goto out_overflow; +	fh->size = length; +	memcpy(fh->data, p, length); +	return 0; +out_toobig: +	dprintk("NFS: file handle size (%u) too big\n", length); +	return -E2BIG; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static void zero_nfs_fh3(struct nfs_fh *fh) +{ +	memset(fh, 0, sizeof(*fh)); +} + +/* + * nfstime3 + * + *	struct nfstime3 { + *		uint32	seconds; + *		uint32	nseconds; + *	}; + */ +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +{ +	*p++ = cpu_to_be32(timep->tv_sec); +	*p++ = cpu_to_be32(timep->tv_nsec); +	return p; +} + +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) +{ +	timep->tv_sec = be32_to_cpup(p++); +	timep->tv_nsec = be32_to_cpup(p++); +	return p; +} + +/* + * sattr3 + * + *	enum time_how { + *		DONT_CHANGE		= 0, + *		SET_TO_SERVER_TIME	= 1, + *		SET_TO_CLIENT_TIME	= 2 + *	}; + * + *	union set_mode3 switch (bool set_it) { + 
*	case TRUE: + *		mode3	mode; + *	default: + *		void; + *	}; + * + *	union set_uid3 switch (bool set_it) { + *	case TRUE: + *		uid3	uid; + *	default: + *		void; + *	}; + * + *	union set_gid3 switch (bool set_it) { + *	case TRUE: + *		gid3	gid; + *	default: + *		void; + *	}; + * + *	union set_size3 switch (bool set_it) { + *	case TRUE: + *		size3	size; + *	default: + *		void; + *	}; + * + *	union set_atime switch (time_how set_it) { + *	case SET_TO_CLIENT_TIME: + *		nfstime3	atime; + *	default: + *		void; + *	}; + * + *	union set_mtime switch (time_how set_it) { + *	case SET_TO_CLIENT_TIME: + *		nfstime3  mtime; + *	default: + *		void; + *	}; + * + *	struct sattr3 { + *		set_mode3	mode; + *		set_uid3	uid; + *		set_gid3	gid; + *		set_size3	size; + *		set_atime	atime; + *		set_mtime	mtime; + *	}; + */ +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +{ +	u32 nbytes; +	__be32 *p; + +	/* +	 * In order to make only a single xdr_reserve_space() call, +	 * pre-compute the total number of bytes to be reserved. +	 * Six boolean values, one for each set_foo field, are always +	 * present in the encoded result, so start there. 
+	 */ +	nbytes = 6 * 4; +	if (attr->ia_valid & ATTR_MODE) +		nbytes += 4; +	if (attr->ia_valid & ATTR_UID) +		nbytes += 4; +	if (attr->ia_valid & ATTR_GID) +		nbytes += 4; +	if (attr->ia_valid & ATTR_SIZE) +		nbytes += 8; +	if (attr->ia_valid & ATTR_ATIME_SET) +		nbytes += 8; +	if (attr->ia_valid & ATTR_MTIME_SET) +		nbytes += 8; +	p = xdr_reserve_space(xdr, nbytes); +  	if (attr->ia_valid & ATTR_MODE) {  		*p++ = xdr_one; -		*p++ = htonl(attr->ia_mode & S_IALLUGO); -	} else { +		*p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO); +	} else  		*p++ = xdr_zero; -	} +  	if (attr->ia_valid & ATTR_UID) {  		*p++ = xdr_one; -		*p++ = htonl(attr->ia_uid); -	} else { +		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); +	} else  		*p++ = xdr_zero; -	} +  	if (attr->ia_valid & ATTR_GID) {  		*p++ = xdr_one; -		*p++ = htonl(attr->ia_gid); -	} else { +		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); +	} else  		*p++ = xdr_zero; -	} +  	if (attr->ia_valid & ATTR_SIZE) {  		*p++ = xdr_one; -		p = xdr_encode_hyper(p, (__u64) attr->ia_size); -	} else { +		p = xdr_encode_hyper(p, (u64)attr->ia_size); +	} else  		*p++ = xdr_zero; -	} +  	if (attr->ia_valid & ATTR_ATIME_SET) {  		*p++ = xdr_two; -		p = xdr_encode_time3(p, &attr->ia_atime); +		p = xdr_encode_nfstime3(p, &attr->ia_atime);  	} else if (attr->ia_valid & ATTR_ATIME) {  		*p++ = xdr_one; -	} else { +	} else  		*p++ = xdr_zero; -	} +  	if (attr->ia_valid & ATTR_MTIME_SET) {  		*p++ = xdr_two; -		p = xdr_encode_time3(p, &attr->ia_mtime); +		xdr_encode_nfstime3(p, &attr->ia_mtime);  	} else if (attr->ia_valid & ATTR_MTIME) { -		*p++ = xdr_one; -	} else { -		*p++ = xdr_zero; -	} -	return p; +		*p = xdr_one; +	} else +		*p = xdr_zero;  } -static inline __be32 * -xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) +/* + * fattr3 + * + *	struct fattr3 { + *		ftype3		type; + *		mode3		mode; + *		uint32		nlink; + *		uid3		uid; + *		gid3		gid; + *		size3		size; + *		size3		used; + *		specdata3	rdev; + *		
uint64		fsid; + *		fileid3		fileid; + *		nfstime3	atime; + *		nfstime3	mtime; + *		nfstime3	ctime; + *	}; + */ +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)  { -	p = xdr_decode_hyper(p, &fattr->pre_size); -	p = xdr_decode_time3(p, &fattr->pre_mtime); -	p = xdr_decode_time3(p, &fattr->pre_ctime); +	umode_t fmode; +	__be32 *p; + +	p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); +	if (unlikely(p == NULL)) +		goto out_overflow; + +	p = xdr_decode_ftype3(p, &fmode); + +	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; +	fattr->nlink = be32_to_cpup(p++); +	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); +	if (!uid_valid(fattr->uid)) +		goto out_uid; +	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); +	if (!gid_valid(fattr->gid)) +		goto out_gid; + +	p = xdr_decode_size3(p, &fattr->size); +	p = xdr_decode_size3(p, &fattr->du.nfs3.used); +	p = xdr_decode_specdata3(p, &fattr->rdev); + +	p = xdr_decode_hyper(p, &fattr->fsid.major); +	fattr->fsid.minor = 0; + +	p = xdr_decode_fileid3(p, &fattr->fileid); +	p = xdr_decode_nfstime3(p, &fattr->atime); +	p = xdr_decode_nfstime3(p, &fattr->mtime); +	xdr_decode_nfstime3(p, &fattr->ctime); +	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); + +	fattr->valid |= NFS_ATTR_FATTR_V3; +	return 0; +out_uid: +	dprintk("NFS: returned invalid uid\n"); +	return -EINVAL; +out_gid: +	dprintk("NFS: returned invalid gid\n"); +	return -EINVAL; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +/* + * post_op_attr + * + *	union post_op_attr switch (bool attributes_follow) { + *	case TRUE: + *		fattr3	attributes; + *	case FALSE: + *		void; + *	}; + */ +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	if (*p != xdr_zero) +		return decode_fattr3(xdr, fattr); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; 
+} + +/* + * wcc_attr + *	struct wcc_attr { + *		size3		size; + *		nfstime3	mtime; + *		nfstime3	ctime; + *	}; + */ +static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ +	__be32 *p; + +	p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); +	if (unlikely(p == NULL)) +		goto out_overflow; +  	fattr->valid |= NFS_ATTR_FATTR_PRESIZE +		| NFS_ATTR_FATTR_PRECHANGE  		| NFS_ATTR_FATTR_PREMTIME  		| NFS_ATTR_FATTR_PRECTIME; -	return p; -} -static inline __be32 * -xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) -{ -	if (*p++) -		p = xdr_decode_fattr(p, fattr); -	return p; +	p = xdr_decode_size3(p, &fattr->pre_size); +	p = xdr_decode_nfstime3(p, &fattr->pre_mtime); +	xdr_decode_nfstime3(p, &fattr->pre_ctime); +	fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime); + +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO;  } -static inline __be32 * -xdr_decode_post_op_attr_stream(struct xdr_stream *xdr, struct nfs_fattr *fattr) +/* + * pre_op_attr + *	union pre_op_attr switch (bool attributes_follow) { + *	case TRUE: + *		wcc_attr	attributes; + *	case FALSE: + *		void; + *	}; + * + * wcc_data + * + *	struct wcc_data { + *		pre_op_attr	before; + *		post_op_attr	after; + *	}; + */ +static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr)  {  	__be32 *p;  	p = xdr_inline_decode(xdr, 4); -	if (unlikely(!p)) +	if (unlikely(p == NULL))  		goto out_overflow; -	if (ntohl(*p++)) { -		p = xdr_inline_decode(xdr, 84); -		if (unlikely(!p)) -			goto out_overflow; -		p = xdr_decode_fattr(p, fattr); -	} -	return p; +	if (*p != xdr_zero) +		return decode_wcc_attr(xdr, fattr); +	return 0;  out_overflow:  	print_overflow_msg(__func__, xdr); -	return ERR_PTR(-EIO); +	return -EIO;  } -static inline __be32 * -xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr)  { -	if (*p++) -		return xdr_decode_wcc_attr(p, fattr); 
-	return p; +	int error; + +	error = decode_pre_op_attr(xdr, fattr); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, fattr); +out: +	return error;  } +/* + * post_op_fh3 + * + *	union post_op_fh3 switch (bool handle_follows) { + *	case TRUE: + *		nfs_fh3  handle; + *	case FALSE: + *		void; + *	}; + */ +static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ +	__be32 *p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	if (*p != xdr_zero) +		return decode_nfs_fh3(xdr, fh); +	zero_nfs_fh3(fh); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} -static inline __be32 * -xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr) +/* + * diropargs3 + * + *	struct diropargs3 { + *		nfs_fh3		dir; + *		filename3	name; + *	}; + */ +static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, +			      const char *name, u32 length)  { -	p = xdr_decode_pre_op_attr(p, fattr); -	return xdr_decode_post_op_attr(p, fattr); +	encode_nfs_fh3(xdr, fh); +	encode_filename3(xdr, name, length);  } +  /* - * NFS encode functions + * NFSv3 XDR encode functions + * + * NFSv3 argument types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification".   
*/  /* - * Encode file handle argument + * 3.3.1  GETATTR3args + * + *	struct GETATTR3args { + *		nfs_fh3  object; + *	};   */ -static int -nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) +static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      const struct nfs_fh *fh)  { -	p = xdr_encode_fhandle(p, fh); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	encode_nfs_fh3(xdr, fh);  }  /* - * Encode SETATTR arguments + * 3.3.2  SETATTR3args + * + *	union sattrguard3 switch (bool check) { + *	case TRUE: + *		nfstime3  obj_ctime; + *	case FALSE: + *		void; + *	}; + * + *	struct SETATTR3args { + *		nfs_fh3		object; + *		sattr3		new_attributes; + *		sattrguard3	guard; + *	};   */ -static int -nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args) -{ -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_sattr(p, args->sattr); -	*p++ = htonl(args->guard); -	if (args->guard) -		p = xdr_encode_time3(p, &args->guardtime); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +static void encode_sattrguard3(struct xdr_stream *xdr, +			       const struct nfs3_sattrargs *args) +{ +	__be32 *p; + +	if (args->guard) { +		p = xdr_reserve_space(xdr, 4 + 8); +		*p++ = xdr_one; +		xdr_encode_nfstime3(p, &args->guardtime); +	} else { +		p = xdr_reserve_space(xdr, 4); +		*p = xdr_zero; +	} +} + +static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      const struct nfs3_sattrargs *args) +{ +	encode_nfs_fh3(xdr, args->fh); +	encode_sattr3(xdr, args->sattr); +	encode_sattrguard3(xdr, args);  }  /* - * Encode directory ops argument + * 3.3.3  LOOKUP3args + * + *	struct LOOKUP3args { + *		diropargs3  what; + *	};   */ -static int -nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args) +static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const 
struct nfs3_diropargs *args)  { -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_array(p, args->name, args->len); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	encode_diropargs3(xdr, args->fh, args->name, args->len);  }  /* - * Encode REMOVE argument + * 3.3.4  ACCESS3args + * + *	struct ACCESS3args { + *		nfs_fh3		object; + *		uint32		access; + *	};   */ -static int -nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void encode_access3args(struct xdr_stream *xdr, +			       const struct nfs3_accessargs *args)  { -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_array(p, args->name.name, args->name.len); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	encode_nfs_fh3(xdr, args->fh); +	encode_uint32(xdr, args->access); +} + +static void nfs3_xdr_enc_access3args(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs3_accessargs *args) +{ +	encode_access3args(xdr, args);  }  /* - * Encode access() argument + * 3.3.5  READLINK3args + * + *	struct READLINK3args { + *		nfs_fh3	symlink; + *	};   */ -static int -nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args) +static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, +				       struct xdr_stream *xdr, +				       const struct nfs3_readlinkargs *args)  { -	p = xdr_encode_fhandle(p, args->fh); -	*p++ = htonl(args->access); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	encode_nfs_fh3(xdr, args->fh); +	prepare_reply_buffer(req, args->pages, args->pgbase, +					args->pglen, NFS3_readlinkres_sz);  }  /* - * Arguments to a READ call. Since we read data directly into the page - * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page we want to fetch. 
+ * 3.3.6  READ3args + * + *	struct READ3args { + *		nfs_fh3		file; + *		offset3		offset; + *		count3		count; + *	};   */ -static int -nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static void encode_read3args(struct xdr_stream *xdr, +			     const struct nfs_pgio_args *args)  { -	struct rpc_auth	*auth = req->rq_cred->cr_auth; -	unsigned int replen; -	u32 count = args->count; +	__be32 *p; -	p = xdr_encode_fhandle(p, args->fh); +	encode_nfs_fh3(xdr, args->fh); + +	p = xdr_reserve_space(xdr, 8 + 4);  	p = xdr_encode_hyper(p, args->offset); -	*p++ = htonl(count); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +	*p = cpu_to_be32(args->count); +} -	/* Inline the page array */ -	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; -	xdr_inline_pages(&req->rq_rcv_buf, replen, -			 args->pages, args->pgbase, count); +static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   const struct nfs_pgio_args *args) +{ +	encode_read3args(xdr, args); +	prepare_reply_buffer(req, args->pages, args->pgbase, +					args->count, NFS3_readres_sz);  	req->rq_rcv_buf.flags |= XDRBUF_READ; -	return 0;  }  /* - * Write arguments. Splice the buffer to be written into the iovec. 
+ * 3.3.7  WRITE3args + * + *	enum stable_how { + *		UNSTABLE  = 0, + *		DATA_SYNC = 1, + *		FILE_SYNC = 2 + *	}; + * + *	struct WRITE3args { + *		nfs_fh3		file; + *		offset3		offset; + *		count3		count; + *		stable_how	stable; + *		opaque		data<>; + *	};   */ -static int -nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_write3args(struct xdr_stream *xdr, +			      const struct nfs_pgio_args *args)  { -	struct xdr_buf *sndbuf = &req->rq_snd_buf; -	u32 count = args->count; +	__be32 *p; -	p = xdr_encode_fhandle(p, args->fh); +	encode_nfs_fh3(xdr, args->fh); + +	p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4);  	p = xdr_encode_hyper(p, args->offset); -	*p++ = htonl(count); -	*p++ = htonl(args->stable); -	*p++ = htonl(count); -	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); - -	/* Copy the page array */ -	xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); -	sndbuf->flags |= XDRBUF_WRITE; -	return 0; +	*p++ = cpu_to_be32(args->count); +	*p++ = cpu_to_be32(args->stable); +	*p = cpu_to_be32(args->count); +	xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    const struct nfs_pgio_args *args) +{ +	encode_write3args(xdr, args); +	xdr->buf->flags |= XDRBUF_WRITE;  }  /* - * Encode CREATE arguments + * 3.3.8  CREATE3args + * + *	enum createmode3 { + *		UNCHECKED = 0, + *		GUARDED   = 1, + *		EXCLUSIVE = 2 + *	}; + * + *	union createhow3 switch (createmode3 mode) { + *	case UNCHECKED: + *	case GUARDED: + *		sattr3       obj_attributes; + *	case EXCLUSIVE: + *		createverf3  verf; + *	}; + * + *	struct CREATE3args { + *		diropargs3	where; + *		createhow3	how; + *	};   */ -static int -nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args) +static void encode_createhow3(struct xdr_stream *xdr, +			      const struct nfs3_createargs *args)  { -	p = xdr_encode_fhandle(p, args->fh); -	p = 
xdr_encode_array(p, args->name, args->len); - -	*p++ = htonl(args->createmode); -	if (args->createmode == NFS3_CREATE_EXCLUSIVE) { -		*p++ = args->verifier[0]; -		*p++ = args->verifier[1]; -	} else -		p = xdr_encode_sattr(p, args->sattr); +	encode_uint32(xdr, args->createmode); +	switch (args->createmode) { +	case NFS3_CREATE_UNCHECKED: +	case NFS3_CREATE_GUARDED: +		encode_sattr3(xdr, args->sattr); +		break; +	case NFS3_CREATE_EXCLUSIVE: +		encode_createverf3(xdr, args->verifier); +		break; +	default: +		BUG(); +	} +} -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs3_createargs *args) +{ +	encode_diropargs3(xdr, args->fh, args->name, args->len); +	encode_createhow3(xdr, args);  }  /* - * Encode MKDIR arguments + * 3.3.9  MKDIR3args + * + *	struct MKDIR3args { + *		diropargs3	where; + *		sattr3		attributes; + *	};   */ -static int -nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args) +static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    const struct nfs3_mkdirargs *args)  { -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_array(p, args->name, args->len); -	p = xdr_encode_sattr(p, args->sattr); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	encode_diropargs3(xdr, args->fh, args->name, args->len); +	encode_sattr3(xdr, args->sattr);  }  /* - * Encode SYMLINK arguments + * 3.3.10  SYMLINK3args + * + *	struct symlinkdata3 { + *		sattr3		symlink_attributes; + *		nfspath3	symlink_data; + *	}; + * + *	struct SYMLINK3args { + *		diropargs3	where; + *		symlinkdata3	symlink; + *	};   */ -static int -nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args) +static void encode_symlinkdata3(struct xdr_stream *xdr, +				const struct nfs3_symlinkargs *args)  { -	p = xdr_encode_fhandle(p, args->fromfh); -	p = 
xdr_encode_array(p, args->fromname, args->fromlen); -	p = xdr_encode_sattr(p, args->sattr); -	*p++ = htonl(args->pathlen); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +	encode_sattr3(xdr, args->sattr); +	encode_nfspath3(xdr, args->pages, args->pathlen); +} -	/* Copy the page */ -	xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen); -	return 0; +static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      const struct nfs3_symlinkargs *args) +{ +	encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); +	encode_symlinkdata3(xdr, args);  }  /* - * Encode MKNOD arguments + * 3.3.11  MKNOD3args + * + *	struct devicedata3 { + *		sattr3		dev_attributes; + *		specdata3	spec; + *	}; + * + *	union mknoddata3 switch (ftype3 type) { + *	case NF3CHR: + *	case NF3BLK: + *		devicedata3	device; + *	case NF3SOCK: + *	case NF3FIFO: + *		sattr3		pipe_attributes; + *	default: + *		void; + *	}; + * + *	struct MKNOD3args { + *		diropargs3	where; + *		mknoddata3	what; + *	};   */ -static int -nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) -{ -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_array(p, args->name, args->len); -	*p++ = htonl(args->type); -	p = xdr_encode_sattr(p, args->sattr); -	if (args->type == NF3CHR || args->type == NF3BLK) { -		*p++ = htonl(MAJOR(args->rdev)); -		*p++ = htonl(MINOR(args->rdev)); +static void encode_devicedata3(struct xdr_stream *xdr, +			       const struct nfs3_mknodargs *args) +{ +	encode_sattr3(xdr, args->sattr); +	encode_specdata3(xdr, args->rdev); +} + +static void encode_mknoddata3(struct xdr_stream *xdr, +			      const struct nfs3_mknodargs *args) +{ +	encode_ftype3(xdr, args->type); +	switch (args->type) { +	case NF3CHR: +	case NF3BLK: +		encode_devicedata3(xdr, args); +		break; +	case NF3SOCK: +	case NF3FIFO: +		encode_sattr3(xdr, args->sattr); +		break; +	case NF3REG: +	case NF3DIR: +		break; +	default: +		BUG();  	} +} -	
req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    const struct nfs3_mknodargs *args) +{ +	encode_diropargs3(xdr, args->fh, args->name, args->len); +	encode_mknoddata3(xdr, args);  }  /* - * Encode RENAME arguments + * 3.3.12  REMOVE3args + * + *	struct REMOVE3args { + *		diropargs3  object; + *	};   */ -static int -nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) -{ -	p = xdr_encode_fhandle(p, args->old_dir); -	p = xdr_encode_array(p, args->old_name->name, args->old_name->len); -	p = xdr_encode_fhandle(p, args->new_dir); -	p = xdr_encode_array(p, args->new_name->name, args->new_name->len); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs_removeargs *args) +{ +	encode_diropargs3(xdr, args->fh, args->name.name, args->name.len);  }  /* - * Encode LINK arguments + * 3.3.14  RENAME3args + * + *	struct RENAME3args { + *		diropargs3	from; + *		diropargs3	to; + *	};   */ -static int -nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) +static void nfs3_xdr_enc_rename3args(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs_renameargs *args)  { -	p = xdr_encode_fhandle(p, args->fromfh); -	p = xdr_encode_fhandle(p, args->tofh); -	p = xdr_encode_array(p, args->toname, args->tolen); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	const struct qstr *old = args->old_name; +	const struct qstr *new = args->new_name; + +	encode_diropargs3(xdr, args->old_dir, old->name, old->len); +	encode_diropargs3(xdr, args->new_dir, new->name, new->len);  }  /* - * Encode arguments to readdir call + * 3.3.15  LINK3args + * + *	struct LINK3args { + *		nfs_fh3		file; + *		diropargs3	link; + *	};   */ -static int -nfs3_xdr_readdirargs(struct 
rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) +static void nfs3_xdr_enc_link3args(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   const struct nfs3_linkargs *args)  { -	struct rpc_auth	*auth = req->rq_cred->cr_auth; -	unsigned int replen; -	u32 count = args->count; - -	p = xdr_encode_fhandle(p, args->fh); -	p = xdr_encode_hyper(p, args->cookie); -	*p++ = args->verf[0]; -	*p++ = args->verf[1]; -	if (args->plus) { -		/* readdirplus: need dircount + buffer size. -		 * We just make sure we make dircount big enough */ -		*p++ = htonl(count >> 3); -	} -	*p++ = htonl(count); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - -	/* Inline the page array */ -	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2; -	xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); -	return 0; +	encode_nfs_fh3(xdr, args->fromfh); +	encode_diropargs3(xdr, args->tofh, args->toname, args->tolen);  }  /* - * Decode the result of a readdir call. - * We just check for syntactical correctness. 
+ * 3.3.16  READDIR3args + * + *	struct READDIR3args { + *		nfs_fh3		dir; + *		cookie3		cookie; + *		cookieverf3	cookieverf; + *		count3		count; + *	};   */ -static int -nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) +static void encode_readdir3args(struct xdr_stream *xdr, +				const struct nfs3_readdirargs *args)  { -	struct xdr_buf *rcvbuf = &req->rq_rcv_buf; -	struct kvec *iov = rcvbuf->head; -	struct page **page; -	size_t hdrlen; -	u32 recvd, pglen; -	int status, nr = 0; - -	status = ntohl(*p++); -	/* Decode post_op_attrs */ -	p = xdr_decode_post_op_attr(p, res->dir_attr); -	if (status) -		return nfs_stat_to_errno(status); -	/* Decode verifier cookie */ -	if (res->verf) { -		res->verf[0] = *p++; -		res->verf[1] = *p++; -	} else { -		p += 2; -	} +	__be32 *p; -	hdrlen = (u8 *) p - (u8 *) iov->iov_base; -	if (iov->iov_len < hdrlen) { -		dprintk("NFS: READDIR reply header overflowed:" -				"length %Zu > %Zu\n", hdrlen, iov->iov_len); -		return -errno_NFSERR_IO; -	} else if (iov->iov_len != hdrlen) { -		dprintk("NFS: READDIR header is short. 
iovec will be shifted.\n"); -		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); -	} +	encode_nfs_fh3(xdr, args->fh); -	pglen = rcvbuf->page_len; -	recvd = rcvbuf->len - hdrlen; -	if (pglen > recvd) -		pglen = recvd; -	page = rcvbuf->pages; +	p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4); +	p = xdr_encode_cookie3(p, args->cookie); +	p = xdr_encode_cookieverf3(p, args->verf); +	*p = cpu_to_be32(args->count); +} -	return nr; +static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      const struct nfs3_readdirargs *args) +{ +	encode_readdir3args(xdr, args); +	prepare_reply_buffer(req, args->pages, 0, +				args->count, NFS3_readdirres_sz);  } -__be32 * -nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_server *server, int plus) +/* + * 3.3.17  READDIRPLUS3args + * + *	struct READDIRPLUS3args { + *		nfs_fh3		dir; + *		cookie3		cookie; + *		cookieverf3	cookieverf; + *		count3		dircount; + *		count3		maxcount; + *	}; + */ +static void encode_readdirplus3args(struct xdr_stream *xdr, +				    const struct nfs3_readdirargs *args)  {  	__be32 *p; -	struct nfs_entry old = *entry; -	p = xdr_inline_decode(xdr, 4); -	if (unlikely(!p)) -		goto out_overflow; -	if (!ntohl(*p++)) { -		p = xdr_inline_decode(xdr, 4); -		if (unlikely(!p)) -			goto out_overflow; -		if (!ntohl(*p++)) -			return ERR_PTR(-EAGAIN); -		entry->eof = 1; -		return ERR_PTR(-EBADCOOKIE); -	} +	encode_nfs_fh3(xdr, args->fh); -	p = xdr_inline_decode(xdr, 12); -	if (unlikely(!p)) -		goto out_overflow; -	p = xdr_decode_hyper(p, &entry->ino); -	entry->len  = ntohl(*p++); +	p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4); +	p = xdr_encode_cookie3(p, args->cookie); +	p = xdr_encode_cookieverf3(p, args->verf); -	p = xdr_inline_decode(xdr, entry->len + 8); -	if (unlikely(!p)) -		goto out_overflow; -	entry->name = (const char *) p; -	p += XDR_QUADLEN(entry->len); -	entry->prev_cookie = entry->cookie; -	p = xdr_decode_hyper(p, 
&entry->cookie); - -	if (plus) { -		entry->fattr->valid = 0; -		p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); -		if (IS_ERR(p)) -			goto out_overflow_exit; -		/* In fact, a post_op_fh3: */ -		p = xdr_inline_decode(xdr, 4); -		if (unlikely(!p)) -			goto out_overflow; -		if (*p++) { -			p = xdr_decode_fhandle_stream(xdr, entry->fh); -			if (IS_ERR(p)) -				goto out_overflow_exit; -			/* Ugh -- server reply was truncated */ -			if (p == NULL) { -				dprintk("NFS: FH truncated\n"); -				*entry = old; -				return ERR_PTR(-EAGAIN); -			} -		} else -			memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); -	} +	/* +	 * readdirplus: need dircount + buffer size. +	 * We just make sure we make dircount big enough +	 */ +	*p++ = cpu_to_be32(args->count >> 3); -	p = xdr_inline_peek(xdr, 8); -	if (p != NULL) -		entry->eof = !p[0] && p[1]; -	else -		entry->eof = 0; - -	return p; +	*p = cpu_to_be32(args->count); +} -out_overflow: -	print_overflow_msg(__func__, xdr); -out_overflow_exit: -	return ERR_PTR(-EIO); +static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, +					  struct xdr_stream *xdr, +					  const struct nfs3_readdirargs *args) +{ +	encode_readdirplus3args(xdr, args); +	prepare_reply_buffer(req, args->pages, 0, +				args->count, NFS3_readdirres_sz);  }  /* - * Encode COMMIT arguments + * 3.3.21  COMMIT3args + * + *	struct COMMIT3args { + *		nfs_fh3		file; + *		offset3		offset; + *		count3		count; + *	};   */ -static int -nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_commit3args(struct xdr_stream *xdr, +			       const struct nfs_commitargs *args)  { -	p = xdr_encode_fhandle(p, args->fh); +	__be32 *p; + +	encode_nfs_fh3(xdr, args->fh); + +	p = xdr_reserve_space(xdr, 8 + 4);  	p = xdr_encode_hyper(p, args->offset); -	*p++ = htonl(args->count); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	return 0; +	*p = cpu_to_be32(args->count);  } -#ifdef CONFIG_NFS_V3_ACL -/* - * Encode GETACL arguments - */ 
-static int -nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, -		    struct nfs3_getaclargs *args) +static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs_commitargs *args)  { -	struct rpc_auth	*auth = req->rq_cred->cr_auth; -	unsigned int replen; +	encode_commit3args(xdr, args); +} -	p = xdr_encode_fhandle(p, args->fh); -	*p++ = htonl(args->mask); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +#ifdef CONFIG_NFS_V3_ACL -	if (args->mask & (NFS_ACL | NFS_DFACL)) { -		/* Inline the page array */ -		replen = (RPC_REPHDRSIZE + auth->au_rslack + -			  ACL3_getaclres_sz) << 2; -		xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, -				 NFSACL_MAXPAGES << PAGE_SHIFT); -	} -	return 0; +static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs3_getaclargs *args) +{ +	encode_nfs_fh3(xdr, args->fh); +	encode_uint32(xdr, args->mask); +	if (args->mask & (NFS_ACL | NFS_DFACL)) +		prepare_reply_buffer(req, args->pages, 0, +					NFSACL_MAXPAGES << PAGE_SHIFT, +					ACL3_getaclres_sz);  } -/* - * Encode SETACL arguments - */ -static int -nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, -                   struct nfs3_setaclargs *args) +static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs3_setaclargs *args)  { -	struct xdr_buf *buf = &req->rq_snd_buf;  	unsigned int base; -	int err; +	int error; -	p = xdr_encode_fhandle(p, NFS_FH(args->inode)); -	*p++ = htonl(args->mask); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); -	base = req->rq_slen; +	encode_nfs_fh3(xdr, NFS_FH(args->inode)); +	encode_uint32(xdr, args->mask); +	base = req->rq_slen;  	if (args->npages != 0) -		xdr_encode_pages(buf, args->pages, 0, args->len); +		xdr_write_pages(xdr, args->pages, 0, args->len);  	else -		req->rq_slen = xdr_adjust_iovec(req->rq_svec, -				p + XDR_QUADLEN(args->len)); +		
xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); -	err = nfsacl_encode(buf, base, args->inode, +	error = nfsacl_encode(xdr->buf, base, args->inode,  			    (args->mask & NFS_ACL) ?  			    args->acl_access : NULL, 1, 0); -	if (err > 0) -		err = nfsacl_encode(buf, base + err, args->inode, -				    (args->mask & NFS_DFACL) ? -				    args->acl_default : NULL, 1, -				    NFS_ACL_DEFAULT); -	return (err > 0) ? 0 : err; +	/* FIXME: this is just broken */ +	BUG_ON(error < 0); +	error = nfsacl_encode(xdr->buf, base + error, args->inode, +			    (args->mask & NFS_DFACL) ? +			    args->acl_default : NULL, 1, +			    NFS_ACL_DEFAULT); +	BUG_ON(error < 0);  } +  #endif  /* CONFIG_NFS_V3_ACL */  /* - * NFS XDR decode functions + * NFSv3 XDR decode functions + * + * NFSv3 result types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification".   */  /* - * Decode attrstat reply. + * 3.3.1  GETATTR3res + * + *	struct GETATTR3resok { + *		fattr3		obj_attributes; + *	}; + * + *	union GETATTR3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		GETATTR3resok  resok; + *	default: + *		void; + *	};   */ -static int -nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    struct nfs_fattr *result)  { -	int	status; - -	if ((status = ntohl(*p++))) -		return nfs_stat_to_errno(status); -	xdr_decode_fattr(p, fattr); -	return 0; +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_default; +	error = decode_fattr3(xdr, result); +out: +	return error; +out_default: +	return nfs3_stat_to_errno(status);  }  /* - * Decode status+wcc_data reply - * SATTR, REMOVE, RMDIR + * 3.3.2  SETATTR3res + * + *	struct SETATTR3resok { + *		wcc_data  obj_wcc; + *	}; + * + *	struct SETATTR3resfail { + *		wcc_data  obj_wcc; + *	}; + * + *	union SETATTR3res switch 
(nfsstat3 status) { + *	case NFS3_OK: + *		SETATTR3resok   resok; + *	default: + *		SETATTR3resfail resfail; + *	};   */ -static int -nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    struct nfs_fattr *result)  { -	int	status; - -	if ((status = ntohl(*p++))) -		status = nfs_stat_to_errno(status); -	xdr_decode_wcc_data(p, fattr); -	return status; +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_wcc_data(xdr, result); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status);  } -static int -nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) +/* + * 3.3.3  LOOKUP3res + * + *	struct LOOKUP3resok { + *		nfs_fh3		object; + *		post_op_attr	obj_attributes; + *		post_op_attr	dir_attributes; + *	}; + * + *	struct LOOKUP3resfail { + *		post_op_attr	dir_attributes; + *	}; + * + *	union LOOKUP3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		LOOKUP3resok	resok; + *	default: + *		LOOKUP3resfail	resfail; + *	}; + */ +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs3_diropres *result)  { -	return nfs3_xdr_wccstat(req, p, res->dir_attr); +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_default; +	error = decode_nfs_fh3(xdr, result->fh); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->dir_attr); +out: +	return error; +out_default: +	error = decode_post_op_attr(xdr, result->dir_attr); +	if (unlikely(error)) +		goto out; +	return nfs3_stat_to_errno(status);  }  /* - * Decode 
LOOKUP reply + * 3.3.4  ACCESS3res + * + *	struct ACCESS3resok { + *		post_op_attr	obj_attributes; + *		uint32		access; + *	}; + * + *	struct ACCESS3resfail { + *		post_op_attr	obj_attributes; + *	}; + * + *	union ACCESS3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		ACCESS3resok	resok; + *	default: + *		ACCESS3resfail	resfail; + *	};   */ -static int -nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs3_accessres *result)  { -	int	status; - -	if ((status = ntohl(*p++))) { -		status = nfs_stat_to_errno(status); -	} else { -		if (!(p = xdr_decode_fhandle(p, res->fh))) -			return -errno_NFSERR_IO; -		p = xdr_decode_post_op_attr(p, res->fattr); -	} -	xdr_decode_post_op_attr(p, res->dir_attr); -	return status; +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_default; +	error = decode_uint32(xdr, &result->access); +out: +	return error; +out_default: +	return nfs3_stat_to_errno(status);  }  /* - * Decode ACCESS reply + * 3.3.5  READLINK3res + * + *	struct READLINK3resok { + *		post_op_attr	symlink_attributes; + *		nfspath3	data; + *	}; + * + *	struct READLINK3resfail { + *		post_op_attr	symlink_attributes; + *	}; + * + *	union READLINK3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		READLINK3resok	resok; + *	default: + *		READLINK3resfail resfail; + *	};   */ -static int -nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) +static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     struct nfs_fattr *result)  { -	int	status = ntohl(*p++); - -	p = xdr_decode_post_op_attr(p, res->fattr); -	if (status) -		return nfs_stat_to_errno(status); -	res->access = ntohl(*p++); -	
return 0; +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_default; +	error = decode_nfspath3(xdr); +out: +	return error; +out_default: +	return nfs3_stat_to_errno(status);  } -static int -nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) +/* + * 3.3.6  READ3res + * + *	struct READ3resok { + *		post_op_attr	file_attributes; + *		count3		count; + *		bool		eof; + *		opaque		data<>; + *	}; + * + *	struct READ3resfail { + *		post_op_attr	file_attributes; + *	}; + * + *	union READ3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		READ3resok	resok; + *	default: + *		READ3resfail	resfail; + *	}; + */ +static int decode_read3resok(struct xdr_stream *xdr, +			     struct nfs_pgio_res *result)  { -	struct rpc_auth	*auth = req->rq_cred->cr_auth; -	unsigned int replen; +	u32 eof, count, ocount, recvd; +	__be32 *p; -	p = xdr_encode_fhandle(p, args->fh); -	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +	p = xdr_inline_decode(xdr, 4 + 4 + 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	count = be32_to_cpup(p++); +	eof = be32_to_cpup(p++); +	ocount = be32_to_cpup(p++); +	if (unlikely(ocount != count)) +		goto out_mismatch; +	recvd = xdr_read_pages(xdr, count); +	if (unlikely(count > recvd)) +		goto out_cheating; +out: +	result->eof = eof; +	result->count = count; +	return count; +out_mismatch: +	dprintk("NFS: READ count doesn't match length of opaque: " +		"count %u != ocount %u\n", count, ocount); +	return -EIO; +out_cheating: +	dprintk("NFS: server cheating in read result: " +		"count %u > recvd %u\n", count, recvd); +	count = recvd; +	eof = 0; +	goto out; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} -	/* Inline the page array */ -	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; -	
xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); -	return 0; +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, +				 struct nfs_pgio_res *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +	error = decode_read3resok(xdr, result); +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status);  }  /* - * Decode READLINK reply + * 3.3.7  WRITE3res + * + *	enum stable_how { + *		UNSTABLE  = 0, + *		DATA_SYNC = 1, + *		FILE_SYNC = 2 + *	}; + * + *	struct WRITE3resok { + *		wcc_data	file_wcc; + *		count3		count; + *		stable_how	committed; + *		writeverf3	verf; + *	}; + * + *	struct WRITE3resfail { + *		wcc_data	file_wcc; + *	}; + * + *	union WRITE3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		WRITE3resok	resok; + *	default: + *		WRITE3resfail	resfail; + *	};   */ -static int -nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int decode_write3resok(struct xdr_stream *xdr, +			      struct nfs_pgio_res *result)  { -	struct xdr_buf *rcvbuf = &req->rq_rcv_buf; -	struct kvec *iov = rcvbuf->head; -	size_t hdrlen; -	u32 len, recvd; -	int	status; - -	status = ntohl(*p++); -	p = xdr_decode_post_op_attr(p, fattr); +	__be32 *p; -	if (status != 0) -		return nfs_stat_to_errno(status); +	p = xdr_inline_decode(xdr, 4 + 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	result->count = be32_to_cpup(p++); +	result->verf->committed = be32_to_cpup(p++); +	if (unlikely(result->verf->committed > NFS_FILE_SYNC)) +		goto out_badvalue; +	if (decode_writeverf3(xdr, &result->verf->verifier)) +		goto out_eio; +	return result->count; +out_badvalue: +	dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); +	return -EIO; +out_overflow: +	
print_overflow_msg(__func__, xdr); +out_eio: +	return -EIO; +} -	/* Convert length of symlink */ -	len = ntohl(*p++); -	if (len >= rcvbuf->page_len) { -		dprintk("nfs: server returned giant symlink!\n"); -		return -ENAMETOOLONG; -	} +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, +				  struct nfs_pgio_res *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_wcc_data(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +	error = decode_write3resok(xdr, result); +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status); +} -	hdrlen = (u8 *) p - (u8 *) iov->iov_base; -	if (iov->iov_len < hdrlen) { -		dprintk("NFS: READLINK reply header overflowed:" -				"length %Zu > %Zu\n", hdrlen, iov->iov_len); -		return -errno_NFSERR_IO; -	} else if (iov->iov_len != hdrlen) { -		dprintk("NFS: READLINK header is short. " -			"iovec will be shifted.\n"); -		xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); -	} -	recvd = req->rq_rcv_buf.len - hdrlen; -	if (recvd < len) { -		dprintk("NFS: server cheating in readlink reply: " -				"count %u > recvd %u\n", len, recvd); -		return -EIO; -	} +/* + * 3.3.8  CREATE3res + * + *	struct CREATE3resok { + *		post_op_fh3	obj; + *		post_op_attr	obj_attributes; + *		wcc_data	dir_wcc; + *	}; + * + *	struct CREATE3resfail { + *		wcc_data	dir_wcc; + *	}; + * + *	union CREATE3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		CREATE3resok	resok; + *	default: + *		CREATE3resfail	resfail; + *	}; + */ +static int decode_create3resok(struct xdr_stream *xdr, +			       struct nfs3_diropres *result) +{ +	int error; + +	error = decode_post_op_fh3(xdr, result->fh); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	/* The server isn't required to return a file handle. 
+	 * If it didn't, force the client to perform a LOOKUP +	 * to determine the correct file handle and attribute +	 * values for the new object. */ +	if (result->fh->size == 0) +		result->fattr->valid = 0; +	error = decode_wcc_data(xdr, result->dir_attr); +out: +	return error; +} -	xdr_terminate_string(rcvbuf, len); -	return 0; +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs3_diropres *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_default; +	error = decode_create3resok(xdr, result); +out: +	return error; +out_default: +	error = decode_wcc_data(xdr, result->dir_attr); +	if (unlikely(error)) +		goto out; +	return nfs3_stat_to_errno(status);  }  /* - * Decode READ reply + * 3.3.12  REMOVE3res + * + *	struct REMOVE3resok { + *		wcc_data    dir_wcc; + *	}; + * + *	struct REMOVE3resfail { + *		wcc_data    dir_wcc; + *	}; + * + *	union REMOVE3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		REMOVE3resok   resok; + *	default: + *		REMOVE3resfail resfail; + *	};   */ -static int -nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs_removeres *result)  { -	struct kvec *iov = req->rq_rcv_buf.head; -	size_t hdrlen; -	u32 count, ocount, recvd; -	int status; - -	status = ntohl(*p++); -	p = xdr_decode_post_op_attr(p, res->fattr); +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_wcc_data(xdr, result->dir_attr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status); +} -	if (status != 0) -		return nfs_stat_to_errno(status); +/* + * 3.3.14  RENAME3res + * + *	struct RENAME3resok { + *		wcc_data	
fromdir_wcc; + *		wcc_data	todir_wcc; + *	}; + * + *	struct RENAME3resfail { + *		wcc_data	fromdir_wcc; + *		wcc_data	todir_wcc; + *	}; + * + *	union RENAME3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		RENAME3resok   resok; + *	default: + *		RENAME3resfail resfail; + *	}; + */ +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs_renameres *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_wcc_data(xdr, result->old_fattr); +	if (unlikely(error)) +		goto out; +	error = decode_wcc_data(xdr, result->new_fattr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status); +} -	/* Decode reply count and EOF flag. NFSv3 is somewhat redundant -	 * in that it puts the count both in the res struct and in the -	 * opaque data count. */ -	count    = ntohl(*p++); -	res->eof = ntohl(*p++); -	ocount   = ntohl(*p++); +/* + * 3.3.15  LINK3res + * + *	struct LINK3resok { + *		post_op_attr	file_attributes; + *		wcc_data	linkdir_wcc; + *	}; + * + *	struct LINK3resfail { + *		post_op_attr	file_attributes; + *		wcc_data	linkdir_wcc; + *	}; + * + *	union LINK3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		LINK3resok	resok; + *	default: + *		LINK3resfail	resfail; + *	}; + */ +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, +				 struct nfs3_linkres *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	error = decode_wcc_data(xdr, result->dir_attr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status); +} -	if (ocount != count) { -		dprintk("NFS: 
READ count doesn't match RPC opaque count.\n"); -		return -errno_NFSERR_IO; -	} +/** + * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in + *			the local page cache + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 3.3.16  entry3 + * + *	struct entry3 { + *		fileid3		fileid; + *		filename3	name; + *		cookie3		cookie; + *		fhandle3	filehandle; + *		post_op_attr3	attributes; + *		entry3		*nextentry; + *	}; + * + * 3.3.17  entryplus3 + *	struct entryplus3 { + *		fileid3		fileid; + *		filename3	name; + *		cookie3		cookie; + *		post_op_attr	name_attributes; + *		post_op_fh3	name_handle; + *		entryplus3	*nextentry; + *	}; + */ +int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, +		       int plus) +{ +	struct nfs_entry old = *entry; +	__be32 *p; +	int error; -	hdrlen = (u8 *) p - (u8 *) iov->iov_base; -	if (iov->iov_len < hdrlen) { -		dprintk("NFS: READ reply header overflowed:" -				"length %Zu > %Zu\n", hdrlen, iov->iov_len); -       		return -errno_NFSERR_IO; -	} else if (iov->iov_len != hdrlen) { -		dprintk("NFS: READ header is short. 
iovec will be shifted.\n"); -		xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	if (*p == xdr_zero) { +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(p == NULL)) +			goto out_overflow; +		if (*p == xdr_zero) +			return -EAGAIN; +		entry->eof = 1; +		return -EBADCOOKIE;  	} -	recvd = req->rq_rcv_buf.len - hdrlen; -	if (count > recvd) { -		dprintk("NFS: server cheating in read reply: " -			"count %u > recvd %u\n", count, recvd); -		count = recvd; -		res->eof = 0; -	} +	error = decode_fileid3(xdr, &entry->ino); +	if (unlikely(error)) +		return error; -	if (count < res->count) -		res->count = count; +	error = decode_inline_filename3(xdr, &entry->name, &entry->len); +	if (unlikely(error)) +		return error; -	return count; -} +	entry->prev_cookie = entry->cookie; +	error = decode_cookie3(xdr, &entry->cookie); +	if (unlikely(error)) +		return error; -/* - * Decode WRITE response - */ -static int -nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) -{ -	int	status; +	entry->d_type = DT_UNKNOWN; -	status = ntohl(*p++); -	p = xdr_decode_wcc_data(p, res->fattr); +	if (plus) { +		entry->fattr->valid = 0; +		error = decode_post_op_attr(xdr, entry->fattr); +		if (unlikely(error)) +			return error; +		if (entry->fattr->valid & NFS_ATTR_FATTR_V3) +			entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); -	if (status != 0) -		return nfs_stat_to_errno(status); +		/* In fact, a post_op_fh3: */ +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(p == NULL)) +			goto out_overflow; +		if (*p != xdr_zero) { +			error = decode_nfs_fh3(xdr, entry->fh); +			if (unlikely(error)) { +				if (error == -E2BIG) +					goto out_truncated; +				return error; +			} +		} else +			zero_nfs_fh3(entry->fh); +	} -	res->count = ntohl(*p++); -	res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); -	res->verf->verifier[0] = *p++; -	res->verf->verifier[1] = *p++; +	return 0; -	return 
res->count; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EAGAIN; +out_truncated: +	dprintk("NFS: directory entry contains invalid file handle\n"); +	*entry = old; +	return -EAGAIN;  }  /* - * Decode a CREATE response + * 3.3.16  READDIR3res + * + *	struct dirlist3 { + *		entry3		*entries; + *		bool		eof; + *	}; + * + *	struct READDIR3resok { + *		post_op_attr	dir_attributes; + *		cookieverf3	cookieverf; + *		dirlist3	reply; + *	}; + * + *	struct READDIR3resfail { + *		post_op_attr	dir_attributes; + *	}; + * + *	union READDIR3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		READDIR3resok	resok; + *	default: + *		READDIR3resfail	resfail; + *	}; + * + * Read the directory contents into the page cache, but otherwise + * don't touch them.  The actual decoding is done by nfs3_decode_entry() + * during subsequent nfs_readdir() calls.   */ -static int -nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) -{ -	int	status; - -	status = ntohl(*p++); -	if (status == 0) { -		if (*p++) { -			if (!(p = xdr_decode_fhandle(p, res->fh))) -				return -errno_NFSERR_IO; -			p = xdr_decode_post_op_attr(p, res->fattr); -		} else { -			memset(res->fh, 0, sizeof(*res->fh)); -			/* Do decode post_op_attr but set it to NULL */ -			p = xdr_decode_post_op_attr(p, res->fattr); -			res->fattr->valid = 0; -		} -	} else { -		status = nfs_stat_to_errno(status); -	} -	p = xdr_decode_wcc_data(p, res->dir_attr); -	return status; +static int decode_dirlist3(struct xdr_stream *xdr) +{ +	return xdr_read_pages(xdr, xdr->buf->page_len);  } -/* - * Decode RENAME reply - */ -static int -nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs_renameres *res) +static int decode_readdir3resok(struct xdr_stream *xdr, +				struct nfs3_readdirres *result)  { -	int	status; - -	if ((status = ntohl(*p++)) != 0) -		status = nfs_stat_to_errno(status); -	p = xdr_decode_wcc_data(p, res->old_fattr); -	p = xdr_decode_wcc_data(p, res->new_fattr); -	return status; +	int 
error; + +	error = decode_post_op_attr(xdr, result->dir_attr); +	if (unlikely(error)) +		goto out; +	/* XXX: do we need to check if result->verf != NULL ? */ +	error = decode_cookieverf3(xdr, result->verf); +	if (unlikely(error)) +		goto out; +	error = decode_dirlist3(xdr); +out: +	return error;  } -/* - * Decode LINK reply - */ -static int -nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    struct nfs3_readdirres *result)  { -	int	status; - -	if ((status = ntohl(*p++)) != 0) -		status = nfs_stat_to_errno(status); -	p = xdr_decode_post_op_attr(p, res->fattr); -	p = xdr_decode_wcc_data(p, res->dir_attr); -	return status; +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_default; +	error = decode_readdir3resok(xdr, result); +out: +	return error; +out_default: +	error = decode_post_op_attr(xdr, result->dir_attr); +	if (unlikely(error)) +		goto out; +	return nfs3_stat_to_errno(status);  }  /* - * Decode FSSTAT reply + * 3.3.18  FSSTAT3res + * + *	struct FSSTAT3resok { + *		post_op_attr	obj_attributes; + *		size3		tbytes; + *		size3		fbytes; + *		size3		abytes; + *		size3		tfiles; + *		size3		ffiles; + *		size3		afiles; + *		uint32		invarsec; + *	}; + * + *	struct FSSTAT3resfail { + *		post_op_attr	obj_attributes; + *	}; + * + *	union FSSTAT3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		FSSTAT3resok	resok; + *	default: + *		FSSTAT3resfail	resfail; + *	};   */ -static int -nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) +static int decode_fsstat3resok(struct xdr_stream *xdr, +			       struct nfs_fsstat *result)  { -	int		status; - -	status = ntohl(*p++); - -	p = xdr_decode_post_op_attr(p, res->fattr); -	if (status != 0) -		return nfs_stat_to_errno(status); - -	p = xdr_decode_hyper(p, &res->tbytes); -	p = 
xdr_decode_hyper(p, &res->fbytes); -	p = xdr_decode_hyper(p, &res->abytes); -	p = xdr_decode_hyper(p, &res->tfiles); -	p = xdr_decode_hyper(p, &res->ffiles); -	p = xdr_decode_hyper(p, &res->afiles); +	__be32 *p; +	p = xdr_inline_decode(xdr, 8 * 6 + 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	p = xdr_decode_size3(p, &result->tbytes); +	p = xdr_decode_size3(p, &result->fbytes); +	p = xdr_decode_size3(p, &result->abytes); +	p = xdr_decode_size3(p, &result->tfiles); +	p = xdr_decode_size3(p, &result->ffiles); +	xdr_decode_size3(p, &result->afiles);  	/* ignore invarsec */  	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs_fsstat *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +	error = decode_fsstat3resok(xdr, result); +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status);  }  /* - * Decode FSINFO reply + * 3.3.19  FSINFO3res + * + *	struct FSINFO3resok { + *		post_op_attr	obj_attributes; + *		uint32		rtmax; + *		uint32		rtpref; + *		uint32		rtmult; + *		uint32		wtmax; + *		uint32		wtpref; + *		uint32		wtmult; + *		uint32		dtpref; + *		size3		maxfilesize; + *		nfstime3	time_delta; + *		uint32		properties; + *	}; + * + *	struct FSINFO3resfail { + *		post_op_attr	obj_attributes; + *	}; + * + *	union FSINFO3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		FSINFO3resok	resok; + *	default: + *		FSINFO3resfail	resfail; + *	};   */ -static int -nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) +static int decode_fsinfo3resok(struct xdr_stream *xdr, +			       struct nfs_fsinfo *result)  { -	int		status; - -	status = ntohl(*p++); - -	p = xdr_decode_post_op_attr(p, 
res->fattr); -	if (status != 0) -		return nfs_stat_to_errno(status); +	__be32 *p; -	res->rtmax  = ntohl(*p++); -	res->rtpref = ntohl(*p++); -	res->rtmult = ntohl(*p++); -	res->wtmax  = ntohl(*p++); -	res->wtpref = ntohl(*p++); -	res->wtmult = ntohl(*p++); -	res->dtpref = ntohl(*p++); -	p = xdr_decode_hyper(p, &res->maxfilesize); -	p = xdr_decode_time3(p, &res->time_delta); +	p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); +	if (unlikely(p == NULL)) +		goto out_overflow; +	result->rtmax  = be32_to_cpup(p++); +	result->rtpref = be32_to_cpup(p++); +	result->rtmult = be32_to_cpup(p++); +	result->wtmax  = be32_to_cpup(p++); +	result->wtpref = be32_to_cpup(p++); +	result->wtmult = be32_to_cpup(p++); +	result->dtpref = be32_to_cpup(p++); +	p = xdr_decode_size3(p, &result->maxfilesize); +	xdr_decode_nfstime3(p, &result->time_delta);  	/* ignore properties */ -	res->lease_time = 0; +	result->lease_time = 0;  	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs_fsinfo *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +	error = decode_fsinfo3resok(xdr, result); +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status);  }  /* - * Decode PATHCONF reply + * 3.3.20  PATHCONF3res + * + *	struct PATHCONF3resok { + *		post_op_attr	obj_attributes; + *		uint32		linkmax; + *		uint32		name_max; + *		bool		no_trunc; + *		bool		chown_restricted; + *		bool		case_insensitive; + *		bool		case_preserving; + *	}; + * + *	struct PATHCONF3resfail { + *		post_op_attr	obj_attributes; + *	}; + * + *	union PATHCONF3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		PATHCONF3resok	resok; + *	default: + *		PATHCONF3resfail resfail; + *	};   */ 
-static int -nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) +static int decode_pathconf3resok(struct xdr_stream *xdr, +				 struct nfs_pathconf *result)  { -	int		status; - -	status = ntohl(*p++); - -	p = xdr_decode_post_op_attr(p, res->fattr); -	if (status != 0) -		return nfs_stat_to_errno(status); -	res->max_link = ntohl(*p++); -	res->max_namelen = ntohl(*p++); +	__be32 *p; +	p = xdr_inline_decode(xdr, 4 * 6); +	if (unlikely(p == NULL)) +		goto out_overflow; +	result->max_link = be32_to_cpup(p++); +	result->max_namelen = be32_to_cpup(p);  	/* ignore remaining fields */  	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     struct nfs_pathconf *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +	error = decode_pathconf3resok(xdr, result); +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status);  }  /* - * Decode COMMIT reply + * 3.3.21  COMMIT3res + * + *	struct COMMIT3resok { + *		wcc_data	file_wcc; + *		writeverf3	verf; + *	}; + * + *	struct COMMIT3resfail { + *		wcc_data	file_wcc; + *	}; + * + *	union COMMIT3res switch (nfsstat3 status) { + *	case NFS3_OK: + *		COMMIT3resok	resok; + *	default: + *		COMMIT3resfail	resfail; + *	};   */ -static int -nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs_commitres *result)  { -	int		status; - -	status = ntohl(*p++); -	p = xdr_decode_wcc_data(p, res->fattr); -	if (status != 0) -		return nfs_stat_to_errno(status); - -	res->verf->verifier[0] = *p++; -	res->verf->verifier[1] = *p++; -	return 0; +	enum 
nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	error = decode_wcc_data(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_status; +	error = decode_writeverf3(xdr, &result->verf->verifier); +out: +	return error; +out_status: +	return nfs3_stat_to_errno(status);  }  #ifdef CONFIG_NFS_V3_ACL -/* - * Decode GETACL reply - */ -static int -nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, -		   struct nfs3_getaclres *res) + +static inline int decode_getacl3resok(struct xdr_stream *xdr, +				      struct nfs3_getaclres *result)  { -	struct xdr_buf *buf = &req->rq_rcv_buf; -	int status = ntohl(*p++);  	struct posix_acl **acl;  	unsigned int *aclcnt; -	int err, base; - -	if (status != 0) -		return nfs_stat_to_errno(status); -	p = xdr_decode_post_op_attr(p, res->fattr); -	res->mask = ntohl(*p++); -	if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) -		return -EINVAL; -	base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; +	size_t hdrlen; +	int error; + +	error = decode_post_op_attr(xdr, result->fattr); +	if (unlikely(error)) +		goto out; +	error = decode_uint32(xdr, &result->mask); +	if (unlikely(error)) +		goto out; +	error = -EINVAL; +	if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) +		goto out; + +	hdrlen = xdr_stream_pos(xdr); + +	acl = NULL; +	if (result->mask & NFS_ACL) +		acl = &result->acl_access; +	aclcnt = NULL; +	if (result->mask & NFS_ACLCNT) +		aclcnt = &result->acl_access_count; +	error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl); +	if (unlikely(error <= 0)) +		goto out; + +	acl = NULL; +	if (result->mask & NFS_DFACL) +		acl = &result->acl_default; +	aclcnt = NULL; +	if (result->mask & NFS_DFACLCNT) +		aclcnt = &result->acl_default_count; +	error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl); +	if (unlikely(error <= 0)) +		return error; +	error = 0; +out: +	return error; +} -	acl = (res->mask & NFS_ACL) ? 
&res->acl_access : NULL; -	aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; -	err = nfsacl_decode(buf, base, aclcnt, acl); +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs3_getaclres *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_default; +	error = decode_getacl3resok(xdr, result); +out: +	return error; +out_default: +	return nfs3_stat_to_errno(status); +} -	acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; -	aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; -	if (err > 0) -		err = nfsacl_decode(buf, base + err, aclcnt, acl); -	return (err > 0) ? 0 : err; +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs_fattr *result) +{ +	enum nfs_stat status; +	int error; + +	error = decode_nfsstat3(xdr, &status); +	if (unlikely(error)) +		goto out; +	if (status != NFS3_OK) +		goto out_default; +	error = decode_post_op_attr(xdr, result); +out: +	return error; +out_default: +	return nfs3_stat_to_errno(status);  } +#endif  /* CONFIG_NFS_V3_ACL */ + +  /* - * Decode setacl reply. + * We need to translate between nfs status return values and + * the local errno values which may not be the same. 
+ */ +static const struct { +	int stat; +	int errno; +} nfs_errtbl[] = { +	{ NFS_OK,		0		}, +	{ NFSERR_PERM,		-EPERM		}, +	{ NFSERR_NOENT,		-ENOENT		}, +	{ NFSERR_IO,		-errno_NFSERR_IO}, +	{ NFSERR_NXIO,		-ENXIO		}, +/*	{ NFSERR_EAGAIN,	-EAGAIN		}, */ +	{ NFSERR_ACCES,		-EACCES		}, +	{ NFSERR_EXIST,		-EEXIST		}, +	{ NFSERR_XDEV,		-EXDEV		}, +	{ NFSERR_NODEV,		-ENODEV		}, +	{ NFSERR_NOTDIR,	-ENOTDIR	}, +	{ NFSERR_ISDIR,		-EISDIR		}, +	{ NFSERR_INVAL,		-EINVAL		}, +	{ NFSERR_FBIG,		-EFBIG		}, +	{ NFSERR_NOSPC,		-ENOSPC		}, +	{ NFSERR_ROFS,		-EROFS		}, +	{ NFSERR_MLINK,		-EMLINK		}, +	{ NFSERR_NAMETOOLONG,	-ENAMETOOLONG	}, +	{ NFSERR_NOTEMPTY,	-ENOTEMPTY	}, +	{ NFSERR_DQUOT,		-EDQUOT		}, +	{ NFSERR_STALE,		-ESTALE		}, +	{ NFSERR_REMOTE,	-EREMOTE	}, +#ifdef EWFLUSH +	{ NFSERR_WFLUSH,	-EWFLUSH	}, +#endif +	{ NFSERR_BADHANDLE,	-EBADHANDLE	}, +	{ NFSERR_NOT_SYNC,	-ENOTSYNC	}, +	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	}, +	{ NFSERR_NOTSUPP,	-ENOTSUPP	}, +	{ NFSERR_TOOSMALL,	-ETOOSMALL	}, +	{ NFSERR_SERVERFAULT,	-EREMOTEIO	}, +	{ NFSERR_BADTYPE,	-EBADTYPE	}, +	{ NFSERR_JUKEBOX,	-EJUKEBOX	}, +	{ -1,			-EIO		} +}; + +/** + * nfs3_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized.  This function is used jointly by NFSv2 and NFSv3.   
*/ -static int -nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static int nfs3_stat_to_errno(enum nfs_stat status)  { -	int status = ntohl(*p++); +	int i; -	if (status) -		return nfs_stat_to_errno(status); -	xdr_decode_post_op_attr(p, fattr); -	return 0; +	for (i = 0; nfs_errtbl[i].stat != -1; i++) { +		if (nfs_errtbl[i].stat == (int)status) +			return nfs_errtbl[i].errno; +	} +	dprintk("NFS: Unrecognized nfs status value: %u\n", status); +	return nfs_errtbl[i].errno;  } -#endif  /* CONFIG_NFS_V3_ACL */ +  #define PROC(proc, argtype, restype, timer)				\  [NFS3PROC_##proc] = {							\  	.p_proc      = NFS3PROC_##proc,					\ -	.p_encode    = (kxdrproc_t) nfs3_xdr_##argtype,			\ -	.p_decode    = (kxdrproc_t) nfs3_xdr_##restype,			\ -	.p_arglen    = NFS3_##argtype##_sz,				\ -	.p_replen    = NFS3_##restype##_sz,				\ +	.p_encode    = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args,	\ +	.p_decode    = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res,	\ +	.p_arglen    = NFS3_##argtype##args_sz,				\ +	.p_replen    = NFS3_##restype##res_sz,				\  	.p_timer     = timer,						\  	.p_statidx   = NFS3PROC_##proc,					\  	.p_name      = #proc,						\  	}  struct rpc_procinfo	nfs3_procedures[] = { -  PROC(GETATTR,		fhandle,	attrstat, 1), -  PROC(SETATTR, 	sattrargs,	wccstat, 0), -  PROC(LOOKUP,		diropargs,	lookupres, 2), -  PROC(ACCESS,		accessargs,	accessres, 1), -  PROC(READLINK,	readlinkargs,	readlinkres, 3), -  PROC(READ,		readargs,	readres, 3), -  PROC(WRITE,		writeargs,	writeres, 4), -  PROC(CREATE,		createargs,	createres, 0), -  PROC(MKDIR,		mkdirargs,	createres, 0), -  PROC(SYMLINK,		symlinkargs,	createres, 0), -  PROC(MKNOD,		mknodargs,	createres, 0), -  PROC(REMOVE,		removeargs,	removeres, 0), -  PROC(RMDIR,		diropargs,	wccstat, 0), -  PROC(RENAME,		renameargs,	renameres, 0), -  PROC(LINK,		linkargs,	linkres, 0), -  PROC(READDIR,		readdirargs,	readdirres, 3), -  PROC(READDIRPLUS,	readdirargs,	readdirres, 3), -  PROC(FSSTAT,		fhandle,	fsstatres, 0), 
-  PROC(FSINFO,  	fhandle,	fsinfores, 0), -  PROC(PATHCONF,	fhandle,	pathconfres, 0), -  PROC(COMMIT,		commitargs,	commitres, 5), +	PROC(GETATTR,		getattr,	getattr,	1), +	PROC(SETATTR,		setattr,	setattr,	0), +	PROC(LOOKUP,		lookup,		lookup,		2), +	PROC(ACCESS,		access,		access,		1), +	PROC(READLINK,		readlink,	readlink,	3), +	PROC(READ,		read,		read,		3), +	PROC(WRITE,		write,		write,		4), +	PROC(CREATE,		create,		create,		0), +	PROC(MKDIR,		mkdir,		create,		0), +	PROC(SYMLINK,		symlink,	create,		0), +	PROC(MKNOD,		mknod,		create,		0), +	PROC(REMOVE,		remove,		remove,		0), +	PROC(RMDIR,		lookup,		setattr,	0), +	PROC(RENAME,		rename,		rename,		0), +	PROC(LINK,		link,		link,		0), +	PROC(READDIR,		readdir,	readdir,	3), +	PROC(READDIRPLUS,	readdirplus,	readdir,	3), +	PROC(FSSTAT,		getattr,	fsstat,		0), +	PROC(FSINFO,		getattr,	fsinfo,		0), +	PROC(PATHCONF,		getattr,	pathconf,	0), +	PROC(COMMIT,		commit,		commit,		5),  }; -struct rpc_version		nfs_version3 = { +const struct rpc_version nfs_version3 = {  	.number			= 3,  	.nrprocs		= ARRAY_SIZE(nfs3_procedures),  	.procs			= nfs3_procedures @@ -1183,8 +2529,8 @@ struct rpc_version		nfs_version3 = {  static struct rpc_procinfo	nfs3_acl_procedures[] = {  	[ACLPROC3_GETACL] = {  		.p_proc = ACLPROC3_GETACL, -		.p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, -		.p_decode = (kxdrproc_t) nfs3_xdr_getaclres, +		.p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args, +		.p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res,  		.p_arglen = ACL3_getaclargs_sz,  		.p_replen = ACL3_getaclres_sz,  		.p_timer = 1, @@ -1192,8 +2538,8 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {  	},  	[ACLPROC3_SETACL] = {  		.p_proc = ACLPROC3_SETACL, -		.p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, -		.p_decode = (kxdrproc_t) nfs3_xdr_setaclres, +		.p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args, +		.p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res,  		.p_arglen = ACL3_setaclargs_sz,  		.p_replen = ACL3_setaclres_sz,  		.p_timer = 0, @@ -1201,7 
+2547,7 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {  	},  }; -struct rpc_version		nfsacl_version3 = { +const struct rpc_version nfsacl_version3 = {  	.number			= 3,  	.nrprocs		= sizeof(nfs3_acl_procedures)/  				  sizeof(nfs3_acl_procedures[0]), diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9fa496387fd..ba2affa5194 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -9,33 +9,21 @@  #ifndef __LINUX_FS_NFS_NFS4_FS_H  #define __LINUX_FS_NFS_NFS4_FS_H -#ifdef CONFIG_NFS_V4 +#if defined(CONFIG_NFS_V4_2) +#define NFS4_MAX_MINOR_VERSION 2 +#elif defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MINOR_VERSION 1 +#else +#define NFS4_MAX_MINOR_VERSION 0 +#endif -struct idmap; +#if IS_ENABLED(CONFIG_NFS_V4) -/* - * In a seqid-mutating op, this macro controls which error return - * values trigger incrementation of the seqid. - * - * from rfc 3010: - * The client MUST monotonically increment the sequence number for the - * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE - * operations.  This is true even in the event that the previous - * operation that used the sequence number received an error.  The only - * exception to this rule is if the previous operation received one of - * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, - * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, - * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. 
- * - */ -#define seqid_mutating_err(err)       \ -(((err) != NFSERR_STALE_CLIENTID) &&  \ - ((err) != NFSERR_STALE_STATEID)  &&  \ - ((err) != NFSERR_BAD_STATEID)    &&  \ - ((err) != NFSERR_BAD_SEQID)      &&  \ - ((err) != NFSERR_BAD_XDR)        &&  \ - ((err) != NFSERR_RESOURCE)       &&  \ - ((err) != NFSERR_NOFILEHANDLE)) +#define NFS4_MAX_LOOP_ON_RECOVER (10) + +#include <linux/seqlock.h> + +struct idmap;  enum nfs4_client_state {  	NFS4CLNT_MANAGER_RUNNING  = 0, @@ -45,49 +33,51 @@ enum nfs4_client_state {  	NFS4CLNT_RECLAIM_NOGRACE,  	NFS4CLNT_DELEGRETURN,  	NFS4CLNT_SESSION_RESET, -	NFS4CLNT_RECALL_SLOT, +	NFS4CLNT_LEASE_CONFIRM, +	NFS4CLNT_SERVER_SCOPE_MISMATCH, +	NFS4CLNT_PURGE_STATE, +	NFS4CLNT_BIND_CONN_TO_SESSION, +	NFS4CLNT_MOVED, +	NFS4CLNT_LEASE_MOVED,  }; -enum nfs4_session_state { -	NFS4_SESSION_INITING, -	NFS4_SESSION_DRAINING, -}; +#define NFS4_RENEW_TIMEOUT		0x01 +#define NFS4_RENEW_DELEGATION_CB	0x02  struct nfs4_minor_version_ops {  	u32	minor_version; +	unsigned init_caps; -	int	(*call_sync)(struct nfs_server *server, -			struct rpc_message *msg, -			struct nfs4_sequence_args *args, -			struct nfs4_sequence_res *res, -			int cache_reply); -	int	(*validate_stateid)(struct nfs_delegation *, +	int	(*init_client)(struct nfs_client *); +	void	(*shutdown_client)(struct nfs_client *); +	bool	(*match_stateid)(const nfs4_stateid *,  			const nfs4_stateid *); +	int	(*find_root_sec)(struct nfs_server *, struct nfs_fh *, +			struct nfs_fsinfo *); +	int	(*free_lock_state)(struct nfs_server *, +			struct nfs4_lock_state *); +	const struct rpc_call_ops *call_sync_ops;  	const struct nfs4_state_recovery_ops *reboot_recovery_ops;  	const struct nfs4_state_recovery_ops *nograce_recovery_ops;  	const struct nfs4_state_maintenance_ops *state_renewal_ops; -}; - -/* - * struct rpc_sequence ensures that RPC calls are sent in the exact - * order that they appear on the list. 
- */ -struct rpc_sequence { -	struct rpc_wait_queue	wait;	/* RPC call delay queue */ -	spinlock_t lock;		/* Protects the list */ -	struct list_head list;		/* Defines sequence of RPC calls */ +	const struct nfs4_mig_recovery_ops *mig_recovery_ops;  };  #define NFS_SEQID_CONFIRMED 1  struct nfs_seqid_counter { -	struct rpc_sequence *sequence; +	ktime_t create_time; +	int owner_id;  	int flags;  	u32 counter; +	spinlock_t lock;		/* Protects the list */ +	struct list_head list;		/* Defines sequence of RPC calls */ +	struct rpc_wait_queue	wait;	/* RPC call delay queue */  };  struct nfs_seqid {  	struct nfs_seqid_counter *sequence;  	struct list_head list; +	struct rpc_task *task;  };  static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) @@ -96,20 +86,16 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status  		seqid->flags |= NFS_SEQID_CONFIRMED;  } -struct nfs_unique_id { -	struct rb_node rb_node; -	__u64 id; -}; -  /*   * NFS4 state_owners and lock_owners are simply labels for ordered   * sequences of RPC calls. Their sole purpose is to provide once-only   * semantics by allowing the server to identify replayed requests.   
*/  struct nfs4_state_owner { -	struct nfs_unique_id so_owner_id;  	struct nfs_server    *so_server; -	struct rb_node	     so_client_node; +	struct list_head     so_lru; +	unsigned long        so_expires; +	struct rb_node	     so_server_node;  	struct rpc_cred	     *so_cred;	 /* Associated cred */ @@ -118,7 +104,8 @@ struct nfs4_state_owner {  	unsigned long	     so_flags;  	struct list_head     so_states;  	struct nfs_seqid_counter so_seqid; -	struct rpc_sequence  so_sequence; +	seqcount_t	     so_reclaim_seqcount; +	struct mutex	     so_delegreturn_mutex;  };  enum { @@ -156,11 +143,10 @@ struct nfs4_lock_owner {  struct nfs4_lock_state {  	struct list_head	ls_locks;	/* Other lock stateids */  	struct nfs4_state *	ls_state;	/* Pointer to open state */ -#define NFS_LOCK_INITIALIZED 1 -	int			ls_flags; +#define NFS_LOCK_INITIALIZED 0 +#define NFS_LOCK_LOST        1 +	unsigned long		ls_flags;  	struct nfs_seqid_counter	ls_seqid; -	struct rpc_sequence	ls_sequence; -	struct nfs_unique_id	ls_id;  	nfs4_stateid		ls_stateid;  	atomic_t		ls_count;  	struct nfs4_lock_owner	ls_owner; @@ -170,12 +156,14 @@ struct nfs4_lock_state {  enum {  	LK_STATE_IN_USE,  	NFS_DELEGATED_STATE,		/* Current stateid is delegation */ +	NFS_OPEN_STATE,			/* OPEN stateid is set */  	NFS_O_RDONLY_STATE,		/* OPEN stateid has read-only state */  	NFS_O_WRONLY_STATE,		/* OPEN stateid has write-only state */  	NFS_O_RDWR_STATE,		/* OPEN stateid has read/write state */  	NFS_STATE_RECLAIM_REBOOT,	/* OPEN stateid server rebooted */  	NFS_STATE_RECLAIM_NOGRACE,	/* OPEN stateid needs to recover state */  	NFS_STATE_POSIX_LOCKS,		/* Posix locks are supported */ +	NFS_STATE_RECOVERY_FAILED,	/* OPEN stateid state recovery failed */  };  struct nfs4_state { @@ -206,6 +194,7 @@ struct nfs4_exception {  	long timeout;  	int retry;  	struct nfs4_state *state; +	struct inode *inode;  };  struct nfs4_state_recovery_ops { @@ -214,38 +203,63 @@ struct nfs4_state_recovery_ops {  	int (*recover_open)(struct 
nfs4_state_owner *, struct nfs4_state *);  	int (*recover_lock)(struct nfs4_state *, struct file_lock *);  	int (*establish_clid)(struct nfs_client *, struct rpc_cred *); -	struct rpc_cred * (*get_clid_cred)(struct nfs_client *); -	int (*reclaim_complete)(struct nfs_client *); +	int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *); +	int (*detect_trunking)(struct nfs_client *, struct nfs_client **, +		struct rpc_cred *);  };  struct nfs4_state_maintenance_ops { -	int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *); +	int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);  	struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);  	int (*renew_lease)(struct nfs_client *, struct rpc_cred *);  }; +struct nfs4_mig_recovery_ops { +	int (*get_locations)(struct inode *, struct nfs4_fs_locations *, +		struct page *, struct rpc_cred *); +	int (*fsid_present)(struct inode *, struct rpc_cred *); +}; +  extern const struct dentry_operations nfs4_dentry_operations; -extern const struct inode_operations nfs4_dir_inode_operations; -/* inode.c */ -extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t); -extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int); -extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); +/* dir.c */ +int nfs_atomic_open(struct inode *, struct dentry *, struct file *, +		    unsigned, umode_t, int *); + +/* super.c */ +extern struct file_system_type nfs4_fs_type; +/* nfs4namespace.c */ +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); +struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, +			       struct nfs_fh *, struct nfs_fattr *); +int nfs4_replace_transport(struct nfs_server *server, +				const struct nfs4_fs_locations *locations);  /* nfs4proc.c */  extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res 
*);  extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); +extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); +extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred);  extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); -extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); -extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); +extern int nfs4_destroy_clientid(struct nfs_client *clp);  extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);  extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); +extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);  extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); -extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, -		struct nfs4_fs_locations *fs_locations, struct page *page); -extern void nfs4_release_lockowner(const struct nfs4_lock_state *); +extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, +				  struct nfs4_fs_locations *, struct page *); +extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, +		struct page *page, struct rpc_cred *); +extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); +extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, +			    struct nfs_fh *, struct nfs_fattr *); +extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); +extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, +		const struct nfs_open_context *ctx, +		const struct nfs_lock_context *l_ctx, +		fmode_t fmode);  #if 
defined(CONFIG_NFS_V4_1)  static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) @@ -253,42 +267,128 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser  	return server->nfs_client->cl_session;  } -extern int nfs4_setup_sequence(const struct nfs_server *server, +extern int nfs41_setup_sequence(struct nfs4_session *session,  		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, -		int cache_reply, struct rpc_task *task); -extern void nfs4_destroy_session(struct nfs4_session *session); -extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); -extern int nfs4_proc_create_session(struct nfs_client *); -extern int nfs4_proc_destroy_session(struct nfs4_session *); -extern int nfs4_init_session(struct nfs_server *server); +		struct rpc_task *task); +extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); +extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);  extern int nfs4_proc_get_lease_time(struct nfs_client *clp,  		struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, +				  bool sync); + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ +	return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == +		EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ +	return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, +		    struct rpc_clnt **clntp, struct rpc_message *msg) +{ +	struct rpc_cred *newcred = NULL; +	rpc_authflavor_t flavor; + +	if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { +		spin_lock(&clp->cl_lock); +		if (clp->cl_machine_cred != NULL) +			/* don't call get_rpccred on the machine cred - +			 * a reference will be held for life of clp */ +			newcred 
= clp->cl_machine_cred; +		spin_unlock(&clp->cl_lock); +		msg->rpc_cred = newcred; + +		flavor = clp->cl_rpcclient->cl_auth->au_flavor; +		WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I && +			     flavor != RPC_AUTH_GSS_KRB5P); +		*clntp = clp->cl_rpcclient; + +		return true; +	} +	return false; +} + +/* + * Function responsible for determining if an rpc_message should use the + * machine cred under SP4_MACH_CRED and if so switching the credential and + * authflavor (using the nfs_client's rpc_clnt which will be krb5i/p). + * Should be called before rpc_call_sync/rpc_call_async. + */ +static inline void +nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, +		   struct rpc_clnt **clntp, struct rpc_message *msg) +{ +	_nfs4_state_protect(clp, sp4_mode, clntp, msg); +} + +/* + * Special wrapper to nfs4_state_protect for write. + * If WRITE can use machine cred but COMMIT cannot, make sure all writes + * that use machine cred use NFS_FILE_SYNC. + */ +static inline void +nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, +			 struct rpc_message *msg, struct nfs_pgio_data *wdata) +{ +	if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && +	    !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) +		wdata->args.stable = NFS_FILE_SYNC; +}  #else /* CONFIG_NFS_v4_1 */  static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)  {  	return NULL;  } -static inline int nfs4_setup_sequence(const struct nfs_server *server, -		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, -		int cache_reply, struct rpc_task *task) +static inline bool +is_ds_only_client(struct nfs_client *clp)  { -	return 0; +	return false;  } -static inline int nfs4_init_session(struct nfs_server *server) +static inline bool +is_ds_client(struct nfs_client *clp) +{ +	return false; +} + +static inline void +nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, +		   struct rpc_clnt **clntp, struct 
rpc_message *msg) +{ +} + +static inline void +nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, +			 struct rpc_message *msg, struct nfs_pgio_data *wdata)  { -	return 0;  }  #endif /* CONFIG_NFS_V4_1 */  extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; -extern const u32 nfs4_fattr_bitmap[2]; -extern const u32 nfs4_statfs_bitmap[2]; -extern const u32 nfs4_pathconf_bitmap[2]; -extern const u32 nfs4_fsinfo_bitmap[2]; -extern const u32 nfs4_fs_locations_bitmap[2]; +extern const u32 nfs4_fattr_bitmap[3]; +extern const u32 nfs4_statfs_bitmap[3]; +extern const u32 nfs4_pathconf_bitmap[3]; +extern const u32 nfs4_fsinfo_bitmap[3]; +extern const u32 nfs4_fs_locations_bitmap[3]; + +void nfs40_shutdown_client(struct nfs_client *); +void nfs41_shutdown_client(struct nfs_client *); +int nfs40_init_client(struct nfs_client *); +int nfs41_init_client(struct nfs_client *); +void nfs4_free_client(struct nfs_client *); + +struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *);  /* nfs4renewd.c */  extern void nfs4_schedule_state_renewal(struct nfs_client *); @@ -297,29 +397,52 @@ extern void nfs4_kill_renewd(struct nfs_client *);  extern void nfs4_renew_state(struct work_struct *);  /* nfs4state.c */ -struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);  struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); +int nfs4_discover_server_trunking(struct nfs_client *clp, +			struct nfs_client **); +int nfs40_discover_server_trunking(struct nfs_client *clp, +			struct nfs_client **, struct rpc_cred *);  #if defined(CONFIG_NFS_V4_1) -struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); -struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +int nfs41_discover_server_trunking(struct nfs_client *clp, +			struct nfs_client **, struct rpc_cred 
*); +extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); +extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); +extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp); + +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) +{ +}  #endif /* CONFIG_NFS_V4_1 */ -extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); +extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);  extern void nfs4_put_state_owner(struct nfs4_state_owner *); +extern void nfs4_purge_state_owners(struct nfs_server *);  extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);  extern void nfs4_put_open_state(struct nfs4_state *); -extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); -extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); +extern void nfs4_close_state(struct nfs4_state *, fmode_t); +extern void nfs4_close_sync(struct nfs4_state *, fmode_t);  extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); -extern void nfs4_schedule_state_recovery(struct nfs_client *); +extern void nfs_inode_find_state_and_recover(struct inode *inode, +		const nfs4_stateid *stateid); +extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); +extern int nfs4_wait_clnt_recover(struct nfs_client *clp); +extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);  extern void nfs4_schedule_state_manager(struct nfs_client *); -extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); -extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); +extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); +extern int nfs4_schedule_stateid_recovery(const 
struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_migration_recovery(const struct nfs_server *); +extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);  extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); -extern void nfs41_handle_recall_slot(struct nfs_client *clp); +extern void nfs41_handle_server_scope(struct nfs_client *, +				      struct nfs41_server_scope **);  extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);  extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); +extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, +		fmode_t, const struct nfs_lockowner *);  extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);  extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -328,10 +451,38 @@ extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);  extern void nfs_release_seqid(struct nfs_seqid *seqid);  extern void nfs_free_seqid(struct nfs_seqid *seqid); +extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); +  extern const nfs4_stateid zero_stateid; +/* nfs4super.c */ +struct nfs_mount_info; +extern struct nfs_subversion nfs_v4; +struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); +extern bool nfs4_disable_idmapping; +extern unsigned short max_session_slots; +extern unsigned short send_implementation_id; +extern bool recover_lost_locks; + +#define NFS4_CLIENT_ID_UNIQ_LEN		(64) +extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; + +/* nfs4sysctl.c */ +#ifdef CONFIG_SYSCTL +int nfs4_register_sysctl(void); +void nfs4_unregister_sysctl(void); +#else +static inline int nfs4_register_sysctl(void) +{ +	return 0; +} + +static inline void nfs4_unregister_sysctl(void) +{ +} +#endif +  
/* nfs4xdr.c */ -extern __be32 *nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int);  extern struct rpc_procinfo nfs4_procedures[];  struct nfs4_mount_data; @@ -340,10 +491,37 @@ struct nfs4_mount_data;  extern struct svc_version nfs4_callback_version1;  extern struct svc_version nfs4_callback_version4; +static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) +{ +	memcpy(dst, src, sizeof(*dst)); +} + +static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) +{ +	return memcmp(dst, src, sizeof(*dst)) == 0; +} + +static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) +{ +	return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0; +} + +static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ +	return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; +} + +static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) +{ +	return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; +} +  #else -#define nfs4_close_state(a, b, c) do { } while (0) -#define nfs4_close_sync(a, b, c) do { } while (0) +#define nfs4_close_state(a, b) do { } while (0) +#define nfs4_close_sync(a, b) do { } while (0) +#define nfs4_state_protect(a, b, c, d) do { } while (0) +#define nfs4_state_protect_write(a, b, c, d) do { } while (0)  #endif /* CONFIG_NFS_V4 */  #endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c new file mode 100644 index 00000000000..aa9ef487604 --- /dev/null +++ b/fs/nfs/nfs4client.c @@ -0,0 +1,1221 @@ +/* + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs_mount.h> +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/auth.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/bc_xprt.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include "internal.h" +#include "callback.h" +#include "delegation.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY		NFSDBG_CLIENT + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ +	int ret = 0; +	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + +	if (clp->rpc_ops->version != 4 || minorversion != 0) +		return ret; +	idr_preload(GFP_KERNEL); +	spin_lock(&nn->nfs_client_lock); +	ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); +	if (ret >= 0) +		clp->cl_cb_ident = ret; +	spin_unlock(&nn->nfs_client_lock); +	idr_preload_end(); +	return ret < 0 ? 
ret : 0; +} + +#ifdef CONFIG_NFS_V4_1 +/** + * Per auth flavor data server rpc clients + */ +struct nfs4_ds_server { +	struct list_head	list;   /* ds_clp->cl_ds_clients */ +	struct rpc_clnt		*rpc_clnt; +}; + +/** + * Common lookup case for DS I/O + */ +static struct nfs4_ds_server * +nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor) +{ +	struct nfs4_ds_server *dss; + +	rcu_read_lock(); +	list_for_each_entry_rcu(dss, &ds_clp->cl_ds_clients, list) { +		if (dss->rpc_clnt->cl_auth->au_flavor != flavor) +			continue; +		goto out; +	} +	dss = NULL; +out: +	rcu_read_unlock(); +	return dss; +} + +static struct nfs4_ds_server * +nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor, +			   struct nfs4_ds_server *new) +{ +	struct nfs4_ds_server *dss; + +	spin_lock(&ds_clp->cl_lock); +	list_for_each_entry(dss, &ds_clp->cl_ds_clients, list) { +		if (dss->rpc_clnt->cl_auth->au_flavor != flavor) +			continue; +		goto out; +	} +	if (new) +		list_add_rcu(&new->list, &ds_clp->cl_ds_clients); +	dss = new; +out: +	spin_unlock(&ds_clp->cl_lock); /* need some lock to protect list */ +	return dss; +} + +static struct nfs4_ds_server * +nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor) +{ +	struct nfs4_ds_server *dss; + +	dss = kmalloc(sizeof(*dss), GFP_NOFS); +	if (dss == NULL) +		return ERR_PTR(-ENOMEM); + +	dss->rpc_clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor); +	if (IS_ERR(dss->rpc_clnt)) { +		int err = PTR_ERR(dss->rpc_clnt); +		kfree (dss); +		return ERR_PTR(err); +	} +	INIT_LIST_HEAD(&dss->list); + +	return dss; +} + +static void +nfs4_free_ds_server(struct nfs4_ds_server *dss) +{ +	rpc_release_client(dss->rpc_clnt); +	kfree(dss); +} + +/** +* Find or create a DS rpc client with th MDS server rpc client auth flavor +* in the nfs_client cl_ds_clients list. 
+*/ +struct rpc_clnt * +nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode) +{ +	struct nfs4_ds_server *dss, *new; +	rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor; + +	dss = nfs4_find_ds_client(ds_clp, flavor); +	if (dss != NULL) +		goto out; +	new = nfs4_alloc_ds_server(ds_clp, flavor); +	if (IS_ERR(new)) +		return ERR_CAST(new); +	dss = nfs4_add_ds_client(ds_clp, flavor, new); +	if (dss != new) +		nfs4_free_ds_server(new); +out: +	return dss->rpc_clnt; +} +EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client); + +static void +nfs4_shutdown_ds_clients(struct nfs_client *clp) +{ +	struct nfs4_ds_server *dss; +	LIST_HEAD(shutdown_list); + +	while (!list_empty(&clp->cl_ds_clients)) { +		dss = list_entry(clp->cl_ds_clients.next, +					struct nfs4_ds_server, list); +		list_del(&dss->list); +		rpc_shutdown_client(dss->rpc_clnt); +		kfree (dss); +	} +} + +void nfs41_shutdown_client(struct nfs_client *clp) +{ +	if (nfs4_has_session(clp)) { +		nfs4_shutdown_ds_clients(clp); +		nfs4_destroy_session(clp->cl_session); +		nfs4_destroy_clientid(clp); +	} + +} +#endif	/* CONFIG_NFS_V4_1 */ + +void nfs40_shutdown_client(struct nfs_client *clp) +{ +	if (clp->cl_slot_tbl) { +		nfs4_shutdown_slot_table(clp->cl_slot_tbl); +		kfree(clp->cl_slot_tbl); +	} +} + +struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) +{ +	int err; +	struct nfs_client *clp = nfs_alloc_client(cl_init); +	if (IS_ERR(clp)) +		return clp; + +	err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); +	if (err) +		goto error; + +	if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) { +		err = -EINVAL; +		goto error; +	} + +	spin_lock_init(&clp->cl_lock); +	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); +	INIT_LIST_HEAD(&clp->cl_ds_clients); +	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); +	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; +	clp->cl_minorversion = cl_init->minorversion; +	clp->cl_mvops = 
nfs_v4_minor_ops[cl_init->minorversion]; +	clp->cl_mig_gen = 1; +	return clp; + +error: +	nfs_free_client(clp); +	return ERR_PTR(err); +} + +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ +	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) +		nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ +	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) +		nfs4_kill_renewd(clp); +	clp->cl_mvops->shutdown_client(clp); +	nfs4_destroy_callback(clp); +	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) +		nfs_idmap_delete(clp); + +	rpc_destroy_wait_queue(&clp->cl_rpcwaitq); +	kfree(clp->cl_serverowner); +	kfree(clp->cl_serverscope); +	kfree(clp->cl_implid); +} + +void nfs4_free_client(struct nfs_client *clp) +{ +	nfs4_shutdown_client(clp); +	nfs_free_client(clp); +} + +/* + * Initialize the NFS4 callback service + */ +static int nfs4_init_callback(struct nfs_client *clp) +{ +	int error; + +	if (clp->rpc_ops->version == 4) { +		struct rpc_xprt *xprt; + +		xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); + +		if (nfs4_has_session(clp)) { +			error = xprt_setup_backchannel(xprt, +						NFS41_BC_MIN_CALLBACKS); +			if (error < 0) +				return error; +		} + +		error = nfs_callback_up(clp->cl_mvops->minor_version, xprt); +		if (error < 0) { +			dprintk("%s: failed to start callback. Error = %d\n", +				__func__, error); +			return error; +		} +		__set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); +	} +	return 0; +} + +/** + * nfs40_init_client - nfs_client initialization tasks for NFSv4.0 + * @clp - nfs_client to initialize + * + * Returns zero on success, or a negative errno if some error occurred. 
+ */ +int nfs40_init_client(struct nfs_client *clp) +{ +	struct nfs4_slot_table *tbl; +	int ret; + +	tbl = kzalloc(sizeof(*tbl), GFP_NOFS); +	if (tbl == NULL) +		return -ENOMEM; + +	ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE, +					"NFSv4.0 transport Slot table"); +	if (ret) { +		kfree(tbl); +		return ret; +	} + +	clp->cl_slot_tbl = tbl; +	return 0; +} + +#if defined(CONFIG_NFS_V4_1) + +/** + * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+ + * @clp - nfs_client to initialize + * + * Returns zero on success, or a negative errno if some error occurred. + */ +int nfs41_init_client(struct nfs_client *clp) +{ +	struct nfs4_session *session = NULL; + +	/* +	 * Create the session and mark it expired. +	 * When a SEQUENCE operation encounters the expired session +	 * it will do session recovery to initialize it. +	 */ +	session = nfs4_alloc_session(clp); +	if (!session) +		return -ENOMEM; + +	clp->cl_session = session; + +	/* +	 * The create session reply races with the server back +	 * channel probe. Mark the client NFS_CS_SESSION_INITING +	 * so that the client back channel can find the +	 * nfs_client struct +	 */ +	nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING); +	return 0; +} + +#endif	/* CONFIG_NFS_V4_1 */ + +/* + * Initialize the minor version specific parts of an NFS4 client record + */ +static int nfs4_init_client_minor_version(struct nfs_client *clp) +{ +	int ret; + +	ret = clp->cl_mvops->init_client(clp); +	if (ret) +		return ret; +	return nfs4_init_callback(clp); +} + +/** + * nfs4_init_client - Initialise an NFS4 client record + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: callback IP address in presentation format + * @authflavor: authentication flavor for underlying RPC transport + * + * Returns pointer to an NFS client, or an ERR_PTR value. 
+ */ +struct nfs_client *nfs4_init_client(struct nfs_client *clp, +				    const struct rpc_timeout *timeparms, +				    const char *ip_addr) +{ +	char buf[INET6_ADDRSTRLEN + 1]; +	struct nfs_client *old; +	int error; + +	if (clp->cl_cons_state == NFS_CS_READY) { +		/* the client is initialised already */ +		dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); +		return clp; +	} + +	/* Check NFS protocol revision and initialize RPC op vector */ +	clp->rpc_ops = &nfs_v4_clientops; + +	if (clp->cl_minorversion != 0) +		__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); +	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); +	__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + +	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); +	if (error == -EINVAL) +		error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); +	if (error < 0) +		goto error; + +	/* If no clientaddr= option was specified, find a usable cb address */ +	if (ip_addr == NULL) { +		struct sockaddr_storage cb_addr; +		struct sockaddr *sap = (struct sockaddr *)&cb_addr; + +		error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); +		if (error < 0) +			goto error; +		error = rpc_ntop(sap, buf, sizeof(buf)); +		if (error < 0) +			goto error; +		ip_addr = (const char *)buf; +	} +	strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + +	error = nfs_idmap_new(clp); +	if (error < 0) { +		dprintk("%s: failed to create idmapper. 
Error = %d\n", +			__func__, error); +		goto error; +	} +	__set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + +	error = nfs4_init_client_minor_version(clp); +	if (error < 0) +		goto error; + +	if (!nfs4_has_session(clp)) +		nfs_mark_client_ready(clp, NFS_CS_READY); + +	error = nfs4_discover_server_trunking(clp, &old); +	if (error < 0) +		goto error; + +	if (clp != old) +		clp->cl_preserve_clid = true; +	nfs_put_client(clp); +	return old; + +error: +	nfs_mark_client_ready(clp, error); +	nfs_put_client(clp); +	dprintk("<-- nfs4_init_client() = xerror %d\n", error); +	return ERR_PTR(error); +} + +/* + * SETCLIENTID just did a callback update with the callback ident in + * "drop," but server trunking discovery claims "drop" and "keep" are + * actually the same server.  Swap the callback IDs so that "keep" + * will continue to use the callback ident the server now knows about, + * and so that "keep"'s original callback ident is destroyed when + * "drop" is freed. + */ +static void nfs4_swap_callback_idents(struct nfs_client *keep, +				      struct nfs_client *drop) +{ +	struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id); +	unsigned int save = keep->cl_cb_ident; + +	if (keep->cl_cb_ident == drop->cl_cb_ident) +		return; + +	dprintk("%s: keeping callback ident %u and dropping ident %u\n", +		__func__, keep->cl_cb_ident, drop->cl_cb_ident); + +	spin_lock(&nn->nfs_client_lock); + +	idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident); +	keep->cl_cb_ident = drop->cl_cb_ident; + +	idr_replace(&nn->cb_ident_idr, drop, save); +	drop->cl_cb_ident = save; + +	spin_unlock(&nn->nfs_client_lock); +} + +/** + * nfs40_walk_client_list - Find server that recognizes a client ID + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." 
+ * + * NB: nfs40_walk_client_list() relies on the new nfs_client being + *     the last nfs_client on the list. + */ +int nfs40_walk_client_list(struct nfs_client *new, +			   struct nfs_client **result, +			   struct rpc_cred *cred) +{ +	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); +	struct nfs_client *pos, *prev = NULL; +	struct nfs4_setclientid_res clid = { +		.clientid	= new->cl_clientid, +		.confirm	= new->cl_confirm, +	}; +	int status = -NFS4ERR_STALE_CLIENTID; + +	spin_lock(&nn->nfs_client_lock); +	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { +		/* If "pos" isn't marked ready, we can't trust the +		 * remaining fields in "pos" */ +		if (pos->cl_cons_state > NFS_CS_READY) { +			atomic_inc(&pos->cl_count); +			spin_unlock(&nn->nfs_client_lock); + +			if (prev) +				nfs_put_client(prev); +			prev = pos; + +			status = nfs_wait_client_init_complete(pos); +			if (status < 0) +				goto out; +			status = -NFS4ERR_STALE_CLIENTID; +			spin_lock(&nn->nfs_client_lock); +		} +		if (pos->cl_cons_state != NFS_CS_READY) +			continue; + +		if (pos->rpc_ops != new->rpc_ops) +			continue; + +		if (pos->cl_proto != new->cl_proto) +			continue; + +		if (pos->cl_minorversion != new->cl_minorversion) +			continue; + +		if (pos->cl_clientid != new->cl_clientid) +			continue; + +		atomic_inc(&pos->cl_count); +		spin_unlock(&nn->nfs_client_lock); + +		if (prev) +			nfs_put_client(prev); +		prev = pos; + +		status = nfs4_proc_setclientid_confirm(pos, &clid, cred); +		switch (status) { +		case -NFS4ERR_STALE_CLIENTID: +			break; +		case 0: +			nfs4_swap_callback_idents(pos, new); + +			prev = NULL; +			*result = pos; +			dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", +				__func__, pos, atomic_read(&pos->cl_count)); +			goto out; +		case -ERESTARTSYS: +		case -ETIMEDOUT: +			/* The callback path may have been inadvertently +			 * changed. Schedule recovery! 
+			 */ +			nfs4_schedule_path_down_recovery(pos); +		default: +			goto out; +		} + +		spin_lock(&nn->nfs_client_lock); +	} +	spin_unlock(&nn->nfs_client_lock); + +	/* No match found. The server lost our clientid */ +out: +	if (prev) +		nfs_put_client(prev); +	dprintk("NFS: <-- %s status = %d\n", __func__, status); +	return status; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Returns true if the client IDs match + */ +static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +{ +	if (a->cl_clientid != b->cl_clientid) { +		dprintk("NFS: --> %s client ID %llx does not match %llx\n", +			__func__, a->cl_clientid, b->cl_clientid); +		return false; +	} +	dprintk("NFS: --> %s client ID %llx matches %llx\n", +		__func__, a->cl_clientid, b->cl_clientid); +	return true; +} + +/* + * Returns true if the server owners match + */ +static bool +nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b) +{ +	struct nfs41_server_owner *o1 = a->cl_serverowner; +	struct nfs41_server_owner *o2 = b->cl_serverowner; + +	if (o1->minor_id != o2->minor_id) { +		dprintk("NFS: --> %s server owner minor IDs do not match\n", +			__func__); +		return false; +	} + +	if (o1->major_id_sz != o2->major_id_sz) +		goto out_major_mismatch; +	if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) +		goto out_major_mismatch; + +	dprintk("NFS: --> %s server owners match\n", __func__); +	return true; + +out_major_mismatch: +	dprintk("NFS: --> %s server owner major IDs do not match\n", +		__func__); +	return false; +} + +/** + * nfs41_walk_client_list - Find nfs_client that matches a client/server owner + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." 
+ * + * NB: nfs41_walk_client_list() relies on the new nfs_client being + *     the last nfs_client on the list. + */ +int nfs41_walk_client_list(struct nfs_client *new, +			   struct nfs_client **result, +			   struct rpc_cred *cred) +{ +	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); +	struct nfs_client *pos, *prev = NULL; +	int status = -NFS4ERR_STALE_CLIENTID; + +	spin_lock(&nn->nfs_client_lock); +	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { +		/* If "pos" isn't marked ready, we can't trust the +		 * remaining fields in "pos", especially the client +		 * ID and serverowner fields.  Wait for CREATE_SESSION +		 * to finish. */ +		if (pos->cl_cons_state > NFS_CS_READY) { +			atomic_inc(&pos->cl_count); +			spin_unlock(&nn->nfs_client_lock); + +			if (prev) +				nfs_put_client(prev); +			prev = pos; + +			status = nfs_wait_client_init_complete(pos); +			if (status == 0) { +				nfs4_schedule_lease_recovery(pos); +				status = nfs4_wait_clnt_recover(pos); +			} +			spin_lock(&nn->nfs_client_lock); +			if (status < 0) +				break; +			status = -NFS4ERR_STALE_CLIENTID; +		} +		if (pos->cl_cons_state != NFS_CS_READY) +			continue; + +		if (pos->rpc_ops != new->rpc_ops) +			continue; + +		if (pos->cl_proto != new->cl_proto) +			continue; + +		if (pos->cl_minorversion != new->cl_minorversion) +			continue; + +		if (!nfs4_match_clientids(pos, new)) +			continue; + +		if (!nfs4_match_serverowners(pos, new)) +			continue; + +		atomic_inc(&pos->cl_count); +		*result = pos; +		status = 0; +		dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", +			__func__, pos, atomic_read(&pos->cl_count)); +		break; +	} + +	/* No matching nfs_client found. 
*/ +	spin_unlock(&nn->nfs_client_lock); +	dprintk("NFS: <-- %s status = %d\n", __func__, status); +	if (prev) +		nfs_put_client(prev); +	return status; +} +#endif	/* CONFIG_NFS_V4_1 */ + +static void nfs4_destroy_server(struct nfs_server *server) +{ +	nfs_server_return_all_delegations(server); +	unset_pnfs_layoutdriver(server); +	nfs4_purge_state_owners(server); +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(struct net *net, int cb_ident) +{ +	struct nfs_client *clp; +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	spin_lock(&nn->nfs_client_lock); +	clp = idr_find(&nn->cb_ident_idr, cb_ident); +	if (clp) +		atomic_inc(&clp->cl_count); +	spin_unlock(&nn->nfs_client_lock); +	return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* Common match routine for v4.0 and v4.1 callback services */ +static bool nfs4_cb_match_client(const struct sockaddr *addr, +		struct nfs_client *clp, u32 minorversion) +{ +	struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + +	/* Don't match clients that failed to initialise */ +	if (!(clp->cl_cons_state == NFS_CS_READY || +	    clp->cl_cons_state == NFS_CS_SESSION_INITING)) +		return false; + +	smp_rmb(); + +	/* Match the version and minorversion */ +	if (clp->rpc_ops->version != 4 || +	    clp->cl_minorversion != minorversion) +		return false; + +	/* Match only the IP address, not the port number */ +	if (!nfs_sockaddr_match_ipaddr(addr, clap)) +		return false; + +	return true; +} + +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, +			   struct nfs4_sessionid *sid, u32 minorversion) +{ +	struct nfs_client *clp; +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	spin_lock(&nn->nfs_client_lock); +	
list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { +		if (nfs4_cb_match_client(addr, clp, minorversion) == false) +			continue; + +		if (!nfs4_has_session(clp)) +			continue; + +		/* Match sessionid*/ +		if (memcmp(clp->cl_session->sess_id.data, +		    sid->data, NFS4_MAX_SESSIONID_LEN) != 0) +			continue; + +		atomic_inc(&clp->cl_count); +		spin_unlock(&nn->nfs_client_lock); +		return clp; +	} +	spin_unlock(&nn->nfs_client_lock); +	return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, +			   struct nfs4_sessionid *sid, u32 minorversion) +{ +	return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Set up an NFS4 client + */ +static int nfs4_set_client(struct nfs_server *server, +		const char *hostname, +		const struct sockaddr *addr, +		const size_t addrlen, +		const char *ip_addr, +		rpc_authflavor_t authflavour, +		int proto, const struct rpc_timeout *timeparms, +		u32 minorversion, struct net *net) +{ +	struct nfs_client_initdata cl_init = { +		.hostname = hostname, +		.addr = addr, +		.addrlen = addrlen, +		.nfs_mod = &nfs_v4, +		.proto = proto, +		.minorversion = minorversion, +		.net = net, +	}; +	struct nfs_client *clp; +	int error; + +	dprintk("--> nfs4_set_client()\n"); + +	if (server->flags & NFS_MOUNT_NORESVPORT) +		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); +	if (server->options & NFS_OPTION_MIGRATION) +		set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); + +	/* Allocate or find a client reference we can use */ +	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); +	if (IS_ERR(clp)) { +		error = PTR_ERR(clp); +		goto error; +	} + +	/* +	 * Query for the lease time on clientid setup or renewal +	 * +	 * Note that this will be set on nfs_clients that were created +	 * only for the DS role and did not set this bit, but now will +	 * serve a dual role. 
+	 */ +	set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); + +	server->nfs_client = clp; +	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); +	return 0; +error: +	dprintk("<-- nfs4_set_client() = xerror %d\n", error); +	return error; +} + +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +		const struct sockaddr *ds_addr, int ds_addrlen, +		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) +{ +	struct nfs_client_initdata cl_init = { +		.addr = ds_addr, +		.addrlen = ds_addrlen, +		.nfs_mod = &nfs_v4, +		.proto = ds_proto, +		.minorversion = mds_clp->cl_minorversion, +		.net = mds_clp->cl_net, +	}; +	struct rpc_timeout ds_timeout; +	struct nfs_client *clp; + +	/* +	 * Set an authflavor equual to the MDS value. Use the MDS nfs_client +	 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS +	 * (section 13.1 RFC 5661). +	 */ +	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); +	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, +			     mds_clp->cl_rpcclient->cl_auth->au_flavor); + +	dprintk("<-- %s %p\n", __func__, clp); +	return clp; +} +EXPORT_SYMBOL_GPL(nfs4_set_ds_client); + +/* + * Session has been established, and the client marked ready. + * Set the mount rsize and wsize with negotiated fore channel + * attributes which will be bound checked in nfs_server_set_fsinfo. 
+ */ +static void nfs4_session_set_rwsize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_1 +	struct nfs4_session *sess; +	u32 server_resp_sz; +	u32 server_rqst_sz; + +	if (!nfs4_has_session(server->nfs_client)) +		return; +	sess = server->nfs_client->cl_session; +	server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; +	server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + +	if (server->rsize > server_resp_sz) +		server->rsize = server_resp_sz; +	if (server->wsize > server_rqst_sz) +		server->wsize = server_rqst_sz; +#endif /* CONFIG_NFS_V4_1 */ +} + +static int nfs4_server_common_setup(struct nfs_server *server, +		struct nfs_fh *mntfh, bool auth_probe) +{ +	struct nfs_fattr *fattr; +	int error; + +	/* data servers support only a subset of NFSv4.1 */ +	if (is_ds_only_client(server->nfs_client)) +		return -EPROTONOSUPPORT; + +	fattr = nfs_alloc_fattr(); +	if (fattr == NULL) +		return -ENOMEM; + +	/* We must ensure the session is initialised first */ +	error = nfs4_init_session(server->nfs_client); +	if (error < 0) +		goto out; + +	/* Set the basic capabilities */ +	server->caps |= server->nfs_client->cl_mvops->init_caps; +	if (server->flags & NFS_MOUNT_NORDIRPLUS) +			server->caps &= ~NFS_CAP_READDIRPLUS; +	/* +	 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower +	 * authentication. 
+	 */ +	if (nfs4_disable_idmapping && +			server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) +		server->caps |= NFS_CAP_UIDGID_NOMAP; + + +	/* Probe the root fh to retrieve its FSID and filehandle */ +	error = nfs4_get_rootfh(server, mntfh, auth_probe); +	if (error < 0) +		goto out; + +	dprintk("Server FSID: %llx:%llx\n", +			(unsigned long long) server->fsid.major, +			(unsigned long long) server->fsid.minor); +	nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); + +	nfs4_session_set_rwsize(server); + +	error = nfs_probe_fsinfo(server, mntfh, fattr); +	if (error < 0) +		goto out; + +	if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) +		server->namelen = NFS4_MAXNAMLEN; + +	nfs_server_insert_lists(server); +	server->mount_time = jiffies; +	server->destroy = nfs4_destroy_server; +out: +	nfs_free_fattr(fattr); +	return error; +} + +/* + * Create a version 4 volume record + */ +static int nfs4_init_server(struct nfs_server *server, +		struct nfs_parsed_mount_data *data) +{ +	struct rpc_timeout timeparms; +	int error; + +	dprintk("--> nfs4_init_server()\n"); + +	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, +			data->timeo, data->retrans); + +	/* Initialise the client representation from the mount data */ +	server->flags = data->flags; +	server->options = data->options; +	server->auth_info = data->auth_info; + +	/* Use the first specified auth flavor. 
If this flavor isn't +	 * allowed by the server, use the SECINFO path to try the +	 * other specified flavors */ +	if (data->auth_info.flavor_len >= 1) +		data->selected_flavor = data->auth_info.flavors[0]; +	else +		data->selected_flavor = RPC_AUTH_UNIX; + +	/* Get a client record */ +	error = nfs4_set_client(server, +			data->nfs_server.hostname, +			(const struct sockaddr *)&data->nfs_server.address, +			data->nfs_server.addrlen, +			data->client_address, +			data->selected_flavor, +			data->nfs_server.protocol, +			&timeparms, +			data->minorversion, +			data->net); +	if (error < 0) +		goto error; + +	if (data->rsize) +		server->rsize = nfs_block_size(data->rsize, NULL); +	if (data->wsize) +		server->wsize = nfs_block_size(data->wsize, NULL); + +	server->acregmin = data->acregmin * HZ; +	server->acregmax = data->acregmax * HZ; +	server->acdirmin = data->acdirmin * HZ; +	server->acdirmax = data->acdirmax * HZ; + +	server->port = data->nfs_server.port; + +	error = nfs_init_server_rpcclient(server, &timeparms, +					  data->selected_flavor); + +error: +	/* Done */ +	dprintk("<-- nfs4_init_server() = %d\n", error); +	return error; +} + +/* + * Create a version 4 volume record + * - keyed on server and FSID + */ +/*struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, +				      struct nfs_fh *mntfh)*/ +struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, +				      struct nfs_subversion *nfs_mod) +{ +	struct nfs_server *server; +	bool auth_probe; +	int error; + +	dprintk("--> nfs4_create_server()\n"); + +	server = nfs_alloc_server(); +	if (!server) +		return ERR_PTR(-ENOMEM); + +	auth_probe = mount_info->parsed->auth_info.flavor_len < 1; + +	/* set up the general RPC client */ +	error = nfs4_init_server(server, mount_info->parsed); +	if (error < 0) +		goto error; + +	error = nfs4_server_common_setup(server, mount_info->mntfh, auth_probe); +	if (error < 0) +		goto error; + +	dprintk("<-- nfs4_create_server() = %p\n", 
server); +	return server; + +error: +	nfs_free_server(server); +	dprintk("<-- nfs4_create_server() = error %d\n", error); +	return ERR_PTR(error); +} + +/* + * Create an NFS4 referral server record + */ +struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, +					       struct nfs_fh *mntfh) +{ +	struct nfs_client *parent_client; +	struct nfs_server *server, *parent_server; +	bool auth_probe; +	int error; + +	dprintk("--> nfs4_create_referral_server()\n"); + +	server = nfs_alloc_server(); +	if (!server) +		return ERR_PTR(-ENOMEM); + +	parent_server = NFS_SB(data->sb); +	parent_client = parent_server->nfs_client; + +	/* Initialise the client representation from the parent server */ +	nfs_server_copy_userdata(server, parent_server); + +	/* Get a client representation. +	 * Note: NFSv4 always uses TCP, */ +	error = nfs4_set_client(server, data->hostname, +				data->addr, +				data->addrlen, +				parent_client->cl_ipaddr, +				data->authflavor, +				rpc_protocol(parent_server->client), +				parent_server->client->cl_timeout, +				parent_client->cl_mvops->minor_version, +				parent_client->cl_net); +	if (error < 0) +		goto error; + +	error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); +	if (error < 0) +		goto error; + +	auth_probe = parent_server->auth_info.flavor_len < 1; + +	error = nfs4_server_common_setup(server, mntfh, auth_probe); +	if (error < 0) +		goto error; + +	dprintk("<-- nfs_create_referral_server() = %p\n", server); +	return server; + +error: +	nfs_free_server(server); +	dprintk("<-- nfs4_create_referral_server() = error %d\n", error); +	return ERR_PTR(error); +} + +/* + * Grab the destination's particulars, including lease expiry time. + * + * Returns zero if probe succeeded and retrieved FSID matches the FSID + * we have cached. 
+ */ +static int nfs_probe_destination(struct nfs_server *server) +{ +	struct inode *inode = server->super->s_root->d_inode; +	struct nfs_fattr *fattr; +	int error; + +	fattr = nfs_alloc_fattr(); +	if (fattr == NULL) +		return -ENOMEM; + +	/* Sanity: the probe won't work if the destination server +	 * does not recognize the migrated FH. */ +	error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr); + +	nfs_free_fattr(fattr); +	return error; +} + +/** + * nfs4_update_server - Move an nfs_server to a different nfs_client + * + * @server: represents FSID to be moved + * @hostname: new end-point's hostname + * @sap: new end-point's socket address + * @salen: size of "sap" + * @net: net namespace + * + * The nfs_server must be quiescent before this function is invoked. + * Either its session is drained (NFSv4.1+), or its transport is + * plugged and drained (NFSv4.0). + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_update_server(struct nfs_server *server, const char *hostname, +		       struct sockaddr *sap, size_t salen, struct net *net) +{ +	struct nfs_client *clp = server->nfs_client; +	struct rpc_clnt *clnt = server->client; +	struct xprt_create xargs = { +		.ident		= clp->cl_proto, +		.net		= net, +		.dstaddr	= sap, +		.addrlen	= salen, +		.servername	= hostname, +	}; +	char buf[INET6_ADDRSTRLEN + 1]; +	struct sockaddr_storage address; +	struct sockaddr *localaddr = (struct sockaddr *)&address; +	int error; + +	dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, +			(unsigned long long)server->fsid.major, +			(unsigned long long)server->fsid.minor, +			hostname); + +	error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); +	if (error != 0) { +		dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", +			__func__, error); +		goto out; +	} + +	error = rpc_localaddr(clnt, localaddr, sizeof(address)); +	if (error != 0) { +		dprintk("<-- %s(): rpc_localaddr returned %d\n", +			__func__, error); +		goto out; +	} + +	
error = -EAFNOSUPPORT; +	if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { +		dprintk("<-- %s(): rpc_ntop returned %d\n", +			__func__, error); +		goto out; +	} + +	nfs_server_remove_lists(server); +	error = nfs4_set_client(server, hostname, sap, salen, buf, +				clp->cl_rpcclient->cl_auth->au_flavor, +				clp->cl_proto, clnt->cl_timeout, +				clp->cl_minorversion, net); +	nfs_put_client(clp); +	if (error != 0) { +		nfs_server_insert_lists(server); +		dprintk("<-- %s(): nfs4_set_client returned %d\n", +			__func__, error); +		goto out; +	} + +	if (server->nfs_client->cl_hostname == NULL) +		server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); +	nfs_server_insert_lists(server); + +	error = nfs_probe_destination(server); +	if (error < 0) +		goto out; + +	dprintk("<-- %s() succeeded\n", __func__); + +out: +	return error; +} diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c new file mode 100644 index 00000000000..a816f0627a6 --- /dev/null +++ b/fs/nfs/nfs4file.c @@ -0,0 +1,135 @@ +/* + *  linux/fs/nfs/file.c + * + *  Copyright (C) 1992  Rick Sladkey + */ +#include <linux/nfs_fs.h> +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY		NFSDBG_FILE + +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ +	struct nfs_open_context *ctx; +	struct dentry *dentry = filp->f_path.dentry; +	struct dentry *parent = NULL; +	struct inode *dir; +	unsigned openflags = filp->f_flags; +	struct iattr attr; +	int opened = 0; +	int err; + +	/* +	 * If no cached dentry exists or if it's negative, NFSv4 handled the +	 * opens in ->lookup() or ->create(). +	 * +	 * We only get this far for a cached positive dentry.  We skipped +	 * revalidation, so handle it here by dropping the dentry and returning +	 * -EOPENSTALE.  The VFS will retry the lookup/create/open. 
+	 */ + +	dprintk("NFS: open file(%pd2)\n", dentry); + +	if ((openflags & O_ACCMODE) == 3) +		openflags--; + +	/* We can't create new files here */ +	openflags &= ~(O_CREAT|O_EXCL); + +	parent = dget_parent(dentry); +	dir = parent->d_inode; + +	ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); +	err = PTR_ERR(ctx); +	if (IS_ERR(ctx)) +		goto out; + +	attr.ia_valid = ATTR_OPEN; +	if (openflags & O_TRUNC) { +		attr.ia_valid |= ATTR_SIZE; +		attr.ia_size = 0; +		nfs_wb_all(inode); +	} + +	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); +	if (IS_ERR(inode)) { +		err = PTR_ERR(inode); +		switch (err) { +		case -EPERM: +		case -EACCES: +		case -EDQUOT: +		case -ENOSPC: +		case -EROFS: +			goto out_put_ctx; +		default: +			goto out_drop; +		} +	} +	if (inode != dentry->d_inode) +		goto out_drop; + +	nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +	nfs_file_set_open_context(filp, ctx); +	nfs_fscache_open_file(inode, filp); +	err = 0; + +out_put_ctx: +	put_nfs_open_context(ctx); +out: +	dput(parent); +	return err; + +out_drop: +	d_drop(dentry); +	err = -EOPENSTALE; +	goto out_put_ctx; +} + +static int +nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ +	int ret; +	struct inode *inode = file_inode(file); + +	do { +		ret = filemap_write_and_wait_range(inode->i_mapping, start, end); +		if (ret != 0) +			break; +		mutex_lock(&inode->i_mutex); +		ret = nfs_file_fsync_commit(file, start, end, datasync); +		if (!ret) +			ret = pnfs_layoutcommit_inode(inode, true); +		mutex_unlock(&inode->i_mutex); +		/* +		 * If nfs_file_fsync_commit detected a server reboot, then +		 * resend all dirty pages that might have been covered by +		 * the NFS_CONTEXT_RESEND_WRITES flag +		 */ +		start = 0; +		end = LLONG_MAX; +	} while (ret == -EAGAIN); + +	return ret; +} + +const struct file_operations nfs4_file_operations = { +	.llseek		= nfs_file_llseek, +	.read		= new_sync_read, +	.write		= new_sync_write, +	.read_iter	
= nfs_file_read, +	.write_iter	= nfs_file_write, +	.mmap		= nfs_file_mmap, +	.open		= nfs4_file_open, +	.flush		= nfs_file_flush, +	.release	= nfs_file_release, +	.fsync		= nfs4_file_fsync, +	.lock		= nfs_lock, +	.flock		= nfs_flock, +	.splice_read	= nfs_file_splice_read, +	.splice_write	= iter_file_splice_write, +	.check_flags	= nfs_check_flags, +	.setlease	= nfs_setlease, +}; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c deleted file mode 100644 index 2e92f0d8d65..00000000000 --- a/fs/nfs/nfs4filelayout.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - *  Module for the pnfs nfs4 file layout driver. - *  Defines all I/O and Policy interface operations, plus code - *  to register itself with the pNFS client. - * - *  Copyright (c) 2002 - *  The Regents of the University of Michigan - *  All Rights Reserved - * - *  Dean Hildebrand <dhildebz@umich.edu> - * - *  Permission is granted to use, copy, create derivative works, and - *  redistribute this software and such derivative works for any purpose, - *  so long as the name of the University of Michigan is not used in - *  any advertising or publicity pertaining to the use or distribution - *  of this software without specific, written prior authorization. If - *  the above copyright notice or any other identification of the - *  University of Michigan is included in any copy of any portion of - *  this software, then the disclaimer below must also be included. - * - *  This software is provided as is, without representation or warranty - *  of any kind either express or implied, including without limitation - *  the implied warranties of merchantability, fitness for a particular - *  purpose, or noninfringement.  
The Regents of the University of - *  Michigan shall not be liable for any damages, including special, - *  indirect, incidental, or consequential damages, with respect to any - *  claim arising out of or in connection with the use of the software, - *  even if it has been or is hereafter advised of the possibility of - *  such damages. - */ - -#include <linux/nfs_fs.h> - -#include "internal.h" -#include "nfs4filelayout.h" - -#define NFSDBG_FACILITY         NFSDBG_PNFS_LD - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); -MODULE_DESCRIPTION("The NFSv4 file layout driver"); - -static int -filelayout_set_layoutdriver(struct nfs_server *nfss) -{ -	int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, -						nfs4_fl_free_deviceid_callback); -	if (status) { -		printk(KERN_WARNING "%s: deviceid cache could not be " -			"initialized\n", __func__); -		return status; -	} -	dprintk("%s: deviceid cache has been initialized successfully\n", -		__func__); -	return 0; -} - -/* Clear out the layout by destroying its device list */ -static int -filelayout_clear_layoutdriver(struct nfs_server *nfss) -{ -	dprintk("--> %s\n", __func__); - -	if (nfss->nfs_client->cl_devid_cache) -		pnfs_put_deviceid_cache(nfss->nfs_client); -	return 0; -} - -/* - * filelayout_check_layout() - * - * Make sure layout segment parameters are sane WRT the device. - * At this point no generic layer initialization of the lseg has occurred, - * and nothing has been added to the layout_hdr cache. 
- * - */ -static int -filelayout_check_layout(struct pnfs_layout_hdr *lo, -			struct nfs4_filelayout_segment *fl, -			struct nfs4_layoutget_res *lgr, -			struct nfs4_deviceid *id) -{ -	struct nfs4_file_layout_dsaddr *dsaddr; -	int status = -EINVAL; -	struct nfs_server *nfss = NFS_SERVER(lo->inode); - -	dprintk("--> %s\n", __func__); - -	if (fl->pattern_offset > lgr->range.offset) { -		dprintk("%s pattern_offset %lld to large\n", -				__func__, fl->pattern_offset); -		goto out; -	} - -	if (fl->stripe_unit % PAGE_SIZE) { -		dprintk("%s Stripe unit (%u) not page aligned\n", -			__func__, fl->stripe_unit); -		goto out; -	} - -	/* find and reference the deviceid */ -	dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); -	if (dsaddr == NULL) { -		dsaddr = get_device_info(lo->inode, id); -		if (dsaddr == NULL) -			goto out; -	} -	fl->dsaddr = dsaddr; - -	if (fl->first_stripe_index < 0 || -	    fl->first_stripe_index >= dsaddr->stripe_count) { -		dprintk("%s Bad first_stripe_index %d\n", -				__func__, fl->first_stripe_index); -		goto out_put; -	} - -	if ((fl->stripe_type == STRIPE_SPARSE && -	    fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || -	    (fl->stripe_type == STRIPE_DENSE && -	    fl->num_fh != dsaddr->stripe_count)) { -		dprintk("%s num_fh %u not valid for given packing\n", -			__func__, fl->num_fh); -		goto out_put; -	} - -	if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { -		dprintk("%s Stripe unit (%u) not aligned with rsize %u " -			"wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, -			nfss->wsize); -	} - -	status = 0; -out: -	dprintk("--> %s returns %d\n", __func__, status); -	return status; -out_put: -	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); -	goto out; -} - -static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) -{ -	int i; - -	for (i = 0; i < fl->num_fh; i++) { -		if (!fl->fh_array[i]) -			break; -		kfree(fl->fh_array[i]); -	} -	kfree(fl->fh_array); -	fl->fh_array = NULL; 
-} - -static void -_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) -{ -	filelayout_free_fh_array(fl); -	kfree(fl); -} - -static int -filelayout_decode_layout(struct pnfs_layout_hdr *flo, -			 struct nfs4_filelayout_segment *fl, -			 struct nfs4_layoutget_res *lgr, -			 struct nfs4_deviceid *id) -{ -	uint32_t *p = (uint32_t *)lgr->layout.buf; -	uint32_t nfl_util; -	int i; - -	dprintk("%s: set_layout_map Begin\n", __func__); - -	memcpy(id, p, sizeof(*id)); -	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); -	print_deviceid(id); - -	nfl_util = be32_to_cpup(p++); -	if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) -		fl->commit_through_mds = 1; -	if (nfl_util & NFL4_UFLG_DENSE) -		fl->stripe_type = STRIPE_DENSE; -	else -		fl->stripe_type = STRIPE_SPARSE; -	fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; - -	fl->first_stripe_index = be32_to_cpup(p++); -	p = xdr_decode_hyper(p, &fl->pattern_offset); -	fl->num_fh = be32_to_cpup(p++); - -	dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", -		__func__, nfl_util, fl->num_fh, fl->first_stripe_index, -		fl->pattern_offset); - -	fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), -			       GFP_KERNEL); -	if (!fl->fh_array) -		return -ENOMEM; - -	for (i = 0; i < fl->num_fh; i++) { -		/* Do we want to use a mempool here? 
*/ -		fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); -		if (!fl->fh_array[i]) { -			filelayout_free_fh_array(fl); -			return -ENOMEM; -		} -		fl->fh_array[i]->size = be32_to_cpup(p++); -		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { -			printk(KERN_ERR "Too big fh %d received %d\n", -			       i, fl->fh_array[i]->size); -			filelayout_free_fh_array(fl); -			return -EIO; -		} -		memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); -		p += XDR_QUADLEN(fl->fh_array[i]->size); -		dprintk("DEBUG: %s: fh len %d\n", __func__, -			fl->fh_array[i]->size); -	} - -	return 0; -} - -static struct pnfs_layout_segment * -filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, -		      struct nfs4_layoutget_res *lgr) -{ -	struct nfs4_filelayout_segment *fl; -	int rc; -	struct nfs4_deviceid id; - -	dprintk("--> %s\n", __func__); -	fl = kzalloc(sizeof(*fl), GFP_KERNEL); -	if (!fl) -		return NULL; - -	rc = filelayout_decode_layout(layoutid, fl, lgr, &id); -	if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id)) { -		_filelayout_free_lseg(fl); -		return NULL; -	} -	return &fl->generic_hdr; -} - -static void -filelayout_free_lseg(struct pnfs_layout_segment *lseg) -{ -	struct nfs_server *nfss = NFS_SERVER(lseg->layout->inode); -	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); - -	dprintk("--> %s\n", __func__); -	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, -			  &fl->dsaddr->deviceid); -	_filelayout_free_lseg(fl); -} - -static struct pnfs_layoutdriver_type filelayout_type = { -	.id = LAYOUT_NFSV4_1_FILES, -	.name = "LAYOUT_NFSV4_1_FILES", -	.owner = THIS_MODULE, -	.set_layoutdriver = filelayout_set_layoutdriver, -	.clear_layoutdriver = filelayout_clear_layoutdriver, -	.alloc_lseg              = filelayout_alloc_lseg, -	.free_lseg               = filelayout_free_lseg, -}; - -static int __init nfs4filelayout_init(void) -{ -	printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", -	       __func__); -	return 
pnfs_register_layoutdriver(&filelayout_type); -} - -static void __exit nfs4filelayout_exit(void) -{ -	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", -	       __func__); -	pnfs_unregister_layoutdriver(&filelayout_type); -} - -module_init(nfs4filelayout_init); -module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c deleted file mode 100644 index 51fe64ace55..00000000000 --- a/fs/nfs/nfs4filelayoutdev.c +++ /dev/null @@ -1,448 +0,0 @@ -/* - *  Device operations for the pnfs nfs4 file layout driver. - * - *  Copyright (c) 2002 - *  The Regents of the University of Michigan - *  All Rights Reserved - * - *  Dean Hildebrand <dhildebz@umich.edu> - *  Garth Goodson   <Garth.Goodson@netapp.com> - * - *  Permission is granted to use, copy, create derivative works, and - *  redistribute this software and such derivative works for any purpose, - *  so long as the name of the University of Michigan is not used in - *  any advertising or publicity pertaining to the use or distribution - *  of this software without specific, written prior authorization. If - *  the above copyright notice or any other identification of the - *  University of Michigan is included in any copy of any portion of - *  this software, then the disclaimer below must also be included. - * - *  This software is provided as is, without representation or warranty - *  of any kind either express or implied, including without limitation - *  the implied warranties of merchantability, fitness for a particular - *  purpose, or noninfringement.  The Regents of the University of - *  Michigan shall not be liable for any damages, including special, - *  indirect, incidental, or consequential damages, with respect to any - *  claim arising out of or in connection with the use of the software, - *  even if it has been or is hereafter advised of the possibility of - *  such damages. 
- */ - -#include <linux/nfs_fs.h> -#include <linux/vmalloc.h> - -#include "internal.h" -#include "nfs4filelayout.h" - -#define NFSDBG_FACILITY		NFSDBG_PNFS_LD - -/* - * Data server cache - * - * Data servers can be mapped to different device ids. - * nfs4_pnfs_ds reference counting - *   - set to 1 on allocation - *   - incremented when a device id maps a data server already in the cache. - *   - decremented when deviceid is removed from the cache. - */ -DEFINE_SPINLOCK(nfs4_ds_cache_lock); -static LIST_HEAD(nfs4_data_server_cache); - -/* Debug routines */ -void -print_ds(struct nfs4_pnfs_ds *ds) -{ -	if (ds == NULL) { -		printk("%s NULL device\n", __func__); -		return; -	} -	printk("        ip_addr %x port %hu\n" -		"        ref count %d\n" -		"        client %p\n" -		"        cl_exchange_flags %x\n", -		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), -		atomic_read(&ds->ds_count), ds->ds_clp, -		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); -} - -void -print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) -{ -	int i; - -	ifdebug(FACILITY) { -		printk("%s dsaddr->ds_num %d\n", __func__, -		       dsaddr->ds_num); -		for (i = 0; i < dsaddr->ds_num; i++) -			print_ds(dsaddr->ds_list[i]); -	} -} - -void print_deviceid(struct nfs4_deviceid *id) -{ -	u32 *p = (u32 *)id; - -	dprintk("%s: device id= [%x%x%x%x]\n", __func__, -		p[0], p[1], p[2], p[3]); -} - -/* nfs4_ds_cache_lock is held */ -static struct nfs4_pnfs_ds * -_data_server_lookup_locked(u32 ip_addr, u32 port) -{ -	struct nfs4_pnfs_ds *ds; - -	dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", -			ntohl(ip_addr), ntohs(port)); - -	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { -		if (ds->ds_ip_addr == ip_addr && -		    ds->ds_port == port) { -			return ds; -		} -	} -	return NULL; -} - -static void -destroy_ds(struct nfs4_pnfs_ds *ds) -{ -	dprintk("--> %s\n", __func__); -	ifdebug(FACILITY) -		print_ds(ds); - -	if (ds->ds_clp) -		nfs_put_client(ds->ds_clp); -	kfree(ds); -} - -static void 
-nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) -{ -	struct nfs4_pnfs_ds *ds; -	int i; - -	print_deviceid(&dsaddr->deviceid.de_id); - -	for (i = 0; i < dsaddr->ds_num; i++) { -		ds = dsaddr->ds_list[i]; -		if (ds != NULL) { -			if (atomic_dec_and_lock(&ds->ds_count, -						&nfs4_ds_cache_lock)) { -				list_del_init(&ds->ds_node); -				spin_unlock(&nfs4_ds_cache_lock); -				destroy_ds(ds); -			} -		} -	} -	kfree(dsaddr->stripe_indices); -	kfree(dsaddr); -} - -void -nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device) -{ -	struct nfs4_file_layout_dsaddr *dsaddr = -		container_of(device, struct nfs4_file_layout_dsaddr, deviceid); - -	nfs4_fl_free_deviceid(dsaddr); -} - -static struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) -{ -	struct nfs4_pnfs_ds *tmp_ds, *ds; - -	ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); -	if (!ds) -		goto out; - -	spin_lock(&nfs4_ds_cache_lock); -	tmp_ds = _data_server_lookup_locked(ip_addr, port); -	if (tmp_ds == NULL) { -		ds->ds_ip_addr = ip_addr; -		ds->ds_port = port; -		atomic_set(&ds->ds_count, 1); -		INIT_LIST_HEAD(&ds->ds_node); -		ds->ds_clp = NULL; -		list_add(&ds->ds_node, &nfs4_data_server_cache); -		dprintk("%s add new data server ip 0x%x\n", __func__, -			ds->ds_ip_addr); -	} else { -		kfree(ds); -		atomic_inc(&tmp_ds->ds_count); -		dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", -			__func__, tmp_ds->ds_ip_addr, -			atomic_read(&tmp_ds->ds_count)); -		ds = tmp_ds; -	} -	spin_unlock(&nfs4_ds_cache_lock); -out: -	return ds; -} - -/* - * Currently only support ipv4, and one multi-path address. 
- */ -static struct nfs4_pnfs_ds * -decode_and_add_ds(__be32 **pp, struct inode *inode) -{ -	struct nfs4_pnfs_ds *ds = NULL; -	char *buf; -	const char *ipend, *pstr; -	u32 ip_addr, port; -	int nlen, rlen, i; -	int tmp[2]; -	__be32 *r_netid, *r_addr, *p = *pp; - -	/* r_netid */ -	nlen = be32_to_cpup(p++); -	r_netid = p; -	p += XDR_QUADLEN(nlen); - -	/* r_addr */ -	rlen = be32_to_cpup(p++); -	r_addr = p; -	p += XDR_QUADLEN(rlen); -	*pp = p; - -	/* Check that netid is "tcp" */ -	if (nlen != 3 ||  memcmp((char *)r_netid, "tcp", 3)) { -		dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); -		goto out_err; -	} - -	/* ipv6 length plus port is legal */ -	if (rlen > INET6_ADDRSTRLEN + 8) { -		dprintk("%s Invalid address, length %d\n", __func__, -			rlen); -		goto out_err; -	} -	buf = kmalloc(rlen + 1, GFP_KERNEL); -	buf[rlen] = '\0'; -	memcpy(buf, r_addr, rlen); - -	/* replace the port dots with dashes for the in4_pton() delimiter*/ -	for (i = 0; i < 2; i++) { -		char *res = strrchr(buf, '.'); -		*res = '-'; -	} - -	/* Currently only support ipv4 address */ -	if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { -		dprintk("%s: Only ipv4 addresses supported\n", __func__); -		goto out_free; -	} - -	/* port */ -	pstr = ipend; -	sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); -	port = htons((tmp[0] << 8) | (tmp[1])); - -	ds = nfs4_pnfs_ds_add(inode, ip_addr, port); -	dprintk("%s Decoded address and port %s\n", __func__, buf); -out_free: -	kfree(buf); -out_err: -	return ds; -} - -/* Decode opaque device data and return the result */ -static struct nfs4_file_layout_dsaddr* -decode_device(struct inode *ino, struct pnfs_device *pdev) -{ -	int i, dummy; -	u32 cnt, num; -	u8 *indexp; -	__be32 *p = (__be32 *)pdev->area, *indicesp; -	struct nfs4_file_layout_dsaddr *dsaddr; - -	/* Get the stripe count (number of stripe index) */ -	cnt = be32_to_cpup(p++); -	dprintk("%s stripe count  %d\n", __func__, cnt); -	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { -		printk(KERN_WARNING "%s: stripe 
count %d greater than " -		       "supported maximum %d\n", __func__, -			cnt, NFS4_PNFS_MAX_STRIPE_CNT); -		goto out_err; -	} - -	/* Check the multipath list count */ -	indicesp = p; -	p += XDR_QUADLEN(cnt << 2); -	num = be32_to_cpup(p++); -	dprintk("%s ds_num %u\n", __func__, num); -	if (num > NFS4_PNFS_MAX_MULTI_CNT) { -		printk(KERN_WARNING "%s: multipath count %d greater than " -			"supported maximum %d\n", __func__, -			num, NFS4_PNFS_MAX_MULTI_CNT); -		goto out_err; -	} -	dsaddr = kzalloc(sizeof(*dsaddr) + -			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)), -			GFP_KERNEL); -	if (!dsaddr) -		goto out_err; - -	dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); -	if (!dsaddr->stripe_indices) -		goto out_err_free; - -	dsaddr->stripe_count = cnt; -	dsaddr->ds_num = num; - -	memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); - -	/* Go back an read stripe indices */ -	p = indicesp; -	indexp = &dsaddr->stripe_indices[0]; -	for (i = 0; i < dsaddr->stripe_count; i++) { -		*indexp = be32_to_cpup(p++); -		if (*indexp >= num) -			goto out_err_free; -		indexp++; -	} -	/* Skip already read multipath list count */ -	p++; - -	for (i = 0; i < dsaddr->ds_num; i++) { -		int j; - -		dummy = be32_to_cpup(p++); /* multipath count */ -		if (dummy > 1) { -			printk(KERN_WARNING -			       "%s: Multipath count %d not supported, " -			       "skipping all greater than 1\n", __func__, -				dummy); -		} -		for (j = 0; j < dummy; j++) { -			if (j == 0) { -				dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); -				if (dsaddr->ds_list[i] == NULL) -					goto out_err_free; -			} else { -				u32 len; -				/* skip extra multipath */ -				len = be32_to_cpup(p++); -				p += XDR_QUADLEN(len); -				len = be32_to_cpup(p++); -				p += XDR_QUADLEN(len); -				continue; -			} -		} -	} -	return dsaddr; - -out_err_free: -	nfs4_fl_free_deviceid(dsaddr); -out_err: -	dprintk("%s ERROR: returning NULL\n", __func__); -	return NULL; -} - -/* - * Decode the opaque device specified 
in 'dev' - * and add it to the list of available devices. - * If the deviceid is already cached, nfs4_add_deviceid will return - * a pointer to the cached struct and throw away the new. - */ -static struct nfs4_file_layout_dsaddr* -decode_and_add_device(struct inode *inode, struct pnfs_device *dev) -{ -	struct nfs4_file_layout_dsaddr *dsaddr; -	struct pnfs_deviceid_node *d; - -	dsaddr = decode_device(inode, dev); -	if (!dsaddr) { -		printk(KERN_WARNING "%s: Could not decode or add device\n", -			__func__); -		return NULL; -	} - -	d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, -			      &dsaddr->deviceid); - -	return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); -} - -/* - * Retrieve the information for dev_id, add it to the list - * of available devices, and return it. - */ -struct nfs4_file_layout_dsaddr * -get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) -{ -	struct pnfs_device *pdev = NULL; -	u32 max_resp_sz; -	int max_pages; -	struct page **pages = NULL; -	struct nfs4_file_layout_dsaddr *dsaddr = NULL; -	int rc, i; -	struct nfs_server *server = NFS_SERVER(inode); - -	/* -	 * Use the session max response size as the basis for setting -	 * GETDEVICEINFO's maxcount -	 */ -	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; -	max_pages = max_resp_sz >> PAGE_SHIFT; -	dprintk("%s inode %p max_resp_sz %u max_pages %d\n", -		__func__, inode, max_resp_sz, max_pages); - -	pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); -	if (pdev == NULL) -		return NULL; - -	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); -	if (pages == NULL) { -		kfree(pdev); -		return NULL; -	} -	for (i = 0; i < max_pages; i++) { -		pages[i] = alloc_page(GFP_KERNEL); -		if (!pages[i]) -			goto out_free; -	} - -	/* set pdev->area */ -	pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); -	if (!pdev->area) -		goto out_free; - -	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); -	pdev->layout_type = 
LAYOUT_NFSV4_1_FILES; -	pdev->pages = pages; -	pdev->pgbase = 0; -	pdev->pglen = PAGE_SIZE * max_pages; -	pdev->mincount = 0; - -	rc = nfs4_proc_getdeviceinfo(server, pdev); -	dprintk("%s getdevice info returns %d\n", __func__, rc); -	if (rc) -		goto out_free; - -	/* -	 * Found new device, need to decode it and then add it to the -	 * list of known devices for this mountpoint. -	 */ -	dsaddr = decode_and_add_device(inode, pdev); -out_free: -	if (pdev->area != NULL) -		vunmap(pdev->area); -	for (i = 0; i < max_pages; i++) -		__free_page(pages[i]); -	kfree(pages); -	kfree(pdev); -	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); -	return dsaddr; -} - -struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) -{ -	struct pnfs_deviceid_node *d; - -	d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); -	return (d == NULL) ? NULL : -		container_of(d, struct nfs4_file_layout_dsaddr, deviceid); -} diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c new file mode 100644 index 00000000000..c0b3a16b4a0 --- /dev/null +++ b/fs/nfs/nfs4getroot.c @@ -0,0 +1,50 @@ +/* +* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. 
+* Written by David Howells (dhowells@redhat.com) +*/ + +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" +#include "internal.h" + +#define NFSDBG_FACILITY		NFSDBG_CLIENT + +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) +{ +	struct nfs_fsinfo fsinfo; +	int ret = -ENOMEM; + +	dprintk("--> nfs4_get_rootfh()\n"); + +	fsinfo.fattr = nfs_alloc_fattr(); +	if (fsinfo.fattr == NULL) +		goto out; + +	/* Start by getting the root filehandle from the server */ +	ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe); +	if (ret < 0) { +		dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); +		goto out; +	} + +	if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) +			|| !S_ISDIR(fsinfo.fattr->mode)) { +		printk(KERN_ERR "nfs4_get_rootfh:" +		       " getroot encountered non-directory\n"); +		ret = -ENOTDIR; +		goto out; +	} + +	if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { +		printk(KERN_ERR "nfs4_get_rootfh:" +		       " getroot obtained referral\n"); +		ret = -EREMOTE; +		goto out; +	} + +	memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +out: +	nfs_free_fattr(fsinfo.fattr); +	dprintk("<-- nfs4_get_rootfh() = %d\n", ret); +	return ret; +} diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3c2a1724fbd..3d83cb1fdc7 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -11,9 +11,11 @@  #include <linux/mount.h>  #include <linux/namei.h>  #include <linux/nfs_fs.h> +#include <linux/nfs_mount.h>  #include <linux/slab.h>  #include <linux/string.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/vfs.h>  #include <linux/inet.h>  #include "internal.h" @@ -52,35 +54,56 @@ Elong:  }  /* - * Determine the mount path as a string + * return the path component of "<server>:<path>" + *  nfspath - the "<server>:<path>" string + *  end - one past the last char that could contain "<server>:" + * returns NULL on failure   */ -static char *nfs4_path(const struct 
vfsmount *mnt_parent, -		       const struct dentry *dentry, -		       char *buffer, ssize_t buflen) +static char *nfs_path_component(const char *nfspath, const char *end)  { -	const char *srvpath; - -	srvpath = strchr(mnt_parent->mnt_devname, ':'); -	if (srvpath) -		srvpath++; -	else -		srvpath = mnt_parent->mnt_devname; +	char *p; + +	if (*nfspath == '[') { +		/* parse [] escaped IPv6 addrs */ +		p = strchr(nfspath, ']'); +		if (p != NULL && ++p < end && *p == ':') +			return p + 1; +	} else { +		/* otherwise split on first colon */ +		p = strchr(nfspath, ':'); +		if (p != NULL && p < end) +			return p + 1; +	} +	return NULL; +} -	return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen); +/* + * Determine the mount path as a string + */ +static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) +{ +	char *limit; +	char *path = nfs_path(&limit, dentry, buffer, buflen, +			      NFS_PATH_CANONICAL); +	if (!IS_ERR(path)) { +		char *path_component = nfs_path_component(path, limit); +		if (path_component) +			return path_component; +	} +	return path;  }  /*   * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we   * believe to be the server path to this dentry   */ -static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, -				const struct dentry *dentry, +static int nfs4_validate_fspath(struct dentry *dentry,  				const struct nfs4_fs_locations *locations,  				char *page, char *page2)  {  	const char *path, *fs_path; -	path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); +	path = nfs4_path(dentry, page, PAGE_SIZE);  	if (IS_ERR(path))  		return PTR_ERR(path); @@ -98,24 +121,125 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,  }  static size_t nfs_parse_server_name(char *string, size_t len, -		struct sockaddr *sa, size_t salen) +		struct sockaddr *sa, size_t salen, struct net *net)  {  	ssize_t ret; -	ret = rpc_pton(string, len, sa, salen); +	ret = rpc_pton(net, string, len, sa, 
salen);  	if (ret == 0) { -		ret = nfs_dns_resolve_name(string, len, sa, salen); +		ret = nfs_dns_resolve_name(net, string, len, sa, salen);  		if (ret < 0)  			ret = 0;  	}  	return ret;  } +/** + * nfs_find_best_sec - Find a security mechanism supported locally + * @server: NFS server struct + * @flavors: List of security tuples returned by SECINFO procedure + * + * Return an rpc client that uses the first security mechanism in + * "flavors" that is locally supported.  The "flavors" array + * is searched in the order returned from the server, per RFC 3530 + * recommendation and each flavor is checked for membership in the + * sec= mount option list if it exists. + * + * Return -EPERM if no matching flavor is found in the array. + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + * + */ +static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, +					  struct nfs_server *server, +					  struct nfs4_secinfo_flavors *flavors) +{ +	rpc_authflavor_t pflavor; +	struct nfs4_secinfo4 *secinfo; +	unsigned int i; + +	for (i = 0; i < flavors->num_flavors; i++) { +		secinfo = &flavors->flavors[i]; + +		switch (secinfo->flavor) { +		case RPC_AUTH_NULL: +		case RPC_AUTH_UNIX: +		case RPC_AUTH_GSS: +			pflavor = rpcauth_get_pseudoflavor(secinfo->flavor, +							&secinfo->flavor_info); +			/* does the pseudoflavor match a sec= mount opt? */ +			if (pflavor != RPC_AUTH_MAXFLAVOR && +			    nfs_auth_info_match(&server->auth_info, pflavor)) { +				struct rpc_clnt *new; +				struct rpc_cred *cred; + +				/* Cloning creates an rpc_auth for the flavor */ +				new = rpc_clone_client_set_auth(clnt, pflavor); +				if (IS_ERR(new)) +					continue; +				/** +				* Check that the user actually can use the +				* flavor. 
This is mostly for RPC_AUTH_GSS +				* where cr_init obtains a gss context +				*/ +				cred = rpcauth_lookupcred(new->cl_auth, 0); +				if (IS_ERR(cred)) { +					rpc_shutdown_client(new); +					continue; +				} +				put_rpccred(cred); +				return new; +			} +		} +	} +	return ERR_PTR(-EPERM); +} + +/** + * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup, + * return an rpc_clnt that uses the best available security flavor with + * respect to the secinfo flavor list and the sec= mount options. + * + * @clnt: RPC client to clone + * @inode: directory inode + * @name: lookup name + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + */ +struct rpc_clnt * +nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, +					struct qstr *name) +{ +	struct page *page; +	struct nfs4_secinfo_flavors *flavors; +	struct rpc_clnt *new; +	int err; + +	page = alloc_page(GFP_KERNEL); +	if (!page) +		return ERR_PTR(-ENOMEM); + +	flavors = page_address(page); + +	err = nfs4_proc_secinfo(inode, name, flavors); +	if (err < 0) { +		new = ERR_PTR(err); +		goto out; +	} + +	new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors); + +out: +	put_page(page); +	return new; +} +  static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,  				     char *page, char *page2,  				     const struct nfs4_fs_location *location)  {  	const size_t addr_bufsize = sizeof(struct sockaddr_storage); +	struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client);  	struct vfsmount *mnt = ERR_PTR(-ENOENT);  	char *mnt_path;  	unsigned int maxbuflen; @@ -141,7 +265,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,  			continue;  		mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, -				mountdata->addr, addr_bufsize); +				mountdata->addr, addr_bufsize, net);  		if (mountdata->addrlen == 0)  			continue; @@ -165,20 +289,18 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,  
/**   * nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @mnt_parent - mountpoint of parent directory   * @dentry - parent directory   * @locations - array of NFSv4 server location information   *   */ -static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, -					    const struct dentry *dentry, +static struct vfsmount *nfs_follow_referral(struct dentry *dentry,  					    const struct nfs4_fs_locations *locations)  {  	struct vfsmount *mnt = ERR_PTR(-ENOENT);  	struct nfs_clone_mount mountdata = { -		.sb = mnt_parent->mnt_sb, +		.sb = dentry->d_sb,  		.dentry = dentry, -		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, +		.authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor,  	};  	char *page = NULL, *page2 = NULL;  	int loc, error; @@ -186,8 +308,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,  	if (locations == NULL || locations->nlocations <= 0)  		goto out; -	dprintk("%s: referral at %s/%s\n", __func__, -		dentry->d_parent->d_name.name, dentry->d_name.name); +	dprintk("%s: referral at %pd2\n", __func__, dentry);  	page = (char *) __get_free_page(GFP_USER);  	if (!page) @@ -198,7 +319,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,  		goto out;  	/* Ensure fs path is a prefix of current dentry path */ -	error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); +	error = nfs4_validate_fspath(dentry, locations, page, page2);  	if (error < 0) {  		mnt = ERR_PTR(error);  		goto out; @@ -225,11 +346,10 @@ out:  /*   * nfs_do_refmount - handle crossing a referral on server - * @mnt_parent - mountpoint of referral   * @dentry - dentry of referral   *   */ -struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry)  {  	struct vfsmount *mnt = ERR_PTR(-ENOMEM);  	struct dentry 
*parent; @@ -252,17 +372,17 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr  	mnt = ERR_PTR(-ENOENT);  	parent = dget_parent(dentry); -	dprintk("%s: getting locations for %s/%s\n", -		__func__, parent->d_name.name, dentry->d_name.name); +	dprintk("%s: getting locations for %pd2\n", +		__func__, dentry); -	err = nfs4_proc_fs_locations(parent->d_inode, &dentry->d_name, fs_locations, page); +	err = nfs4_proc_fs_locations(client, parent->d_inode, &dentry->d_name, fs_locations, page);  	dput(parent);  	if (err != 0 ||  	    fs_locations->nlocations <= 0 ||  	    fs_locations->fs_path.ncomponents <= 0)  		goto out_free; -	mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); +	mnt = nfs_follow_referral(dentry, fs_locations);  out_free:  	__free_page(page);  	kfree(fs_locations); @@ -270,3 +390,134 @@ out:  	dprintk("%s: done\n", __func__);  	return mnt;  } + +struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, +			       struct nfs_fh *fh, struct nfs_fattr *fattr) +{ +	rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; +	struct dentry *parent = dget_parent(dentry); +	struct inode *dir = parent->d_inode; +	struct qstr *name = &dentry->d_name; +	struct rpc_clnt *client; +	struct vfsmount *mnt; + +	/* Look it up again to get its attributes and sec flavor */ +	client = nfs4_proc_lookup_mountpoint(dir, name, fh, fattr); +	dput(parent); +	if (IS_ERR(client)) +		return ERR_CAST(client); + +	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { +		mnt = nfs_do_refmount(client, dentry); +		goto out; +	} + +	if (client->cl_auth->au_flavor != flavor) +		flavor = client->cl_auth->au_flavor; +	mnt = nfs_do_submount(dentry, fh, fattr, flavor); +out: +	rpc_shutdown_client(client); +	return mnt; +} + +/* + * Try one location from the fs_locations array. + * + * Returns zero on success, or a negative errno value. 
+ */ +static int nfs4_try_replacing_one_location(struct nfs_server *server, +		char *page, char *page2, +		const struct nfs4_fs_location *location) +{ +	const size_t addr_bufsize = sizeof(struct sockaddr_storage); +	struct net *net = rpc_net_ns(server->client); +	struct sockaddr *sap; +	unsigned int s; +	size_t salen; +	int error; + +	sap = kmalloc(addr_bufsize, GFP_KERNEL); +	if (sap == NULL) +		return -ENOMEM; + +	error = -ENOENT; +	for (s = 0; s < location->nservers; s++) { +		const struct nfs4_string *buf = &location->servers[s]; +		char *hostname; + +		if (buf->len <= 0 || buf->len > PAGE_SIZE) +			continue; + +		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL) +			continue; + +		salen = nfs_parse_server_name(buf->data, buf->len, +						sap, addr_bufsize, net); +		if (salen == 0) +			continue; +		rpc_set_port(sap, NFS_PORT); + +		error = -ENOMEM; +		hostname = kstrndup(buf->data, buf->len, GFP_KERNEL); +		if (hostname == NULL) +			break; + +		error = nfs4_update_server(server, hostname, sap, salen, net); +		kfree(hostname); +		if (error == 0) +			break; +	} + +	kfree(sap); +	return error; +} + +/** + * nfs4_replace_transport - set up transport to destination server + * + * @server: export being migrated + * @locations: fs_locations array + * + * Returns zero on success, or a negative errno value. + * + * The client tries all the entries in the "locations" array, in the + * order returned by the server, until one works or the end of the + * array is reached. 
+ */ +int nfs4_replace_transport(struct nfs_server *server, +			   const struct nfs4_fs_locations *locations) +{ +	char *page = NULL, *page2 = NULL; +	int loc, error; + +	error = -ENOENT; +	if (locations == NULL || locations->nlocations <= 0) +		goto out; + +	error = -ENOMEM; +	page = (char *) __get_free_page(GFP_USER); +	if (!page) +		goto out; +	page2 = (char *) __get_free_page(GFP_USER); +	if (!page2) +		goto out; + +	for (loc = 0; loc < locations->nlocations; loc++) { +		const struct nfs4_fs_location *location = +						&locations->locations[loc]; + +		if (location == NULL || location->nservers <= 0 || +		    location->rootpath.ncomponents == 0) +			continue; + +		error = nfs4_try_replacing_one_location(server, page, +							page2, location); +		if (error == 0) +			break; +	} + +out: +	free_page((unsigned long)page); +	free_page((unsigned long)page2); +	return error; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0f24cdf2cb1..4bf3d97cc5a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -39,16 +39,22 @@  #include <linux/delay.h>  #include <linux/errno.h>  #include <linux/string.h> +#include <linux/ratelimit.h> +#include <linux/printk.h>  #include <linux/slab.h>  #include <linux/sunrpc/clnt.h>  #include <linux/nfs.h>  #include <linux/nfs4.h>  #include <linux/nfs_fs.h>  #include <linux/nfs_page.h> +#include <linux/nfs_mount.h>  #include <linux/namei.h>  #include <linux/mount.h>  #include <linux/module.h> -#include <linux/sunrpc/bc_xprt.h> +#include <linux/nfs_idmap.h> +#include <linux/xattr.h> +#include <linux/utsname.h> +#include <linux/freezer.h>  #include "nfs4_fs.h"  #include "delegation.h" @@ -56,24 +62,81 @@  #include "iostat.h"  #include "callback.h"  #include "pnfs.h" +#include "netns.h" +#include "nfs4session.h" +#include "fscache.h" + +#include "nfs4trace.h"  #define NFSDBG_FACILITY		NFSDBG_PROC  #define NFS4_POLL_RETRY_MIN	(HZ/10)  #define NFS4_POLL_RETRY_MAX	(15*HZ) -#define NFS4_MAX_LOOP_ON_RECOVER (10) -  struct nfs4_opendata;  
static int _nfs4_proc_open(struct nfs4_opendata *data);  static int _nfs4_recover_proc_open(struct nfs4_opendata *data);  static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);  static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); +static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);  static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  			    struct nfs_fattr *fattr, struct iattr *sattr, -			    struct nfs4_state *state); +			    struct nfs4_state *state, struct nfs4_label *ilabel, +			    struct nfs4_label *olabel); +#ifdef CONFIG_NFS_V4_1 +static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, +		struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, +		struct rpc_cred *); +#endif + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, +	struct iattr *sattr, struct nfs4_label *label) +{ +	int err; + +	if (label == NULL) +		return NULL; + +	if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) +		return NULL; + +	err = security_dentry_init_security(dentry, sattr->ia_mode, +				&dentry->d_name, (void **)&label->label, &label->len); +	if (err == 0) +		return label; + +	return NULL; +} +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ +	if (label) +		security_release_secctx(label->label, label->len); +} +static inline u32 
*nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ +	if (label) +		return server->attr_bitmask; + +	return server->attr_bitmask_nl; +} +#else +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, +	struct iattr *sattr, struct nfs4_label *l) +{ return NULL; } +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ return; } +static inline u32 * +nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ return server->attr_bitmask; } +#endif  /* Prevent leaks of NFSv4 errors into userland */  static int nfs4_map_errors(int err) @@ -82,7 +145,23 @@ static int nfs4_map_errors(int err)  		return err;  	switch (err) {  	case -NFS4ERR_RESOURCE: +	case -NFS4ERR_LAYOUTTRYLATER: +	case -NFS4ERR_RECALLCONFLICT:  		return -EREMOTEIO; +	case -NFS4ERR_WRONGSEC: +	case -NFS4ERR_WRONG_CRED: +		return -EPERM; +	case -NFS4ERR_BADOWNER: +	case -NFS4ERR_BADNAME: +		return -EINVAL; +	case -NFS4ERR_SHARE_DENIED: +		return -EACCES; +	case -NFS4ERR_MINOR_VERS_MISMATCH: +		return -EPROTONOSUPPORT; +	case -NFS4ERR_ACCESS: +		return -EACCES; +	case -NFS4ERR_FILE_OPEN: +		return -EBUSY;  	default:  		dprintk("%s could not handle NFSv4 error %d\n",  				__func__, -err); @@ -94,7 +173,7 @@ static int nfs4_map_errors(int err)  /*   * This is our standard bitmap for GETATTR requests.   
*/ -const u32 nfs4_fattr_bitmap[2] = { +const u32 nfs4_fattr_bitmap[3] = {  	FATTR4_WORD0_TYPE  	| FATTR4_WORD0_CHANGE  	| FATTR4_WORD0_SIZE @@ -108,10 +187,37 @@ const u32 nfs4_fattr_bitmap[2] = {  	| FATTR4_WORD1_SPACE_USED  	| FATTR4_WORD1_TIME_ACCESS  	| FATTR4_WORD1_TIME_METADATA -	| FATTR4_WORD1_TIME_MODIFY +	| FATTR4_WORD1_TIME_MODIFY, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +	FATTR4_WORD2_SECURITY_LABEL +#endif  }; -const u32 nfs4_statfs_bitmap[2] = { +static const u32 nfs4_pnfs_open_bitmap[3] = { +	FATTR4_WORD0_TYPE +	| FATTR4_WORD0_CHANGE +	| FATTR4_WORD0_SIZE +	| FATTR4_WORD0_FSID +	| FATTR4_WORD0_FILEID, +	FATTR4_WORD1_MODE +	| FATTR4_WORD1_NUMLINKS +	| FATTR4_WORD1_OWNER +	| FATTR4_WORD1_OWNER_GROUP +	| FATTR4_WORD1_RAWDEV +	| FATTR4_WORD1_SPACE_USED +	| FATTR4_WORD1_TIME_ACCESS +	| FATTR4_WORD1_TIME_METADATA +	| FATTR4_WORD1_TIME_MODIFY, +	FATTR4_WORD2_MDSTHRESHOLD +}; + +static const u32 nfs4_open_noattr_bitmap[3] = { +	FATTR4_WORD0_TYPE +	| FATTR4_WORD0_CHANGE +	| FATTR4_WORD0_FILEID, +}; + +const u32 nfs4_statfs_bitmap[3] = {  	FATTR4_WORD0_FILES_AVAIL  	| FATTR4_WORD0_FILES_FREE  	| FATTR4_WORD0_FILES_TOTAL, @@ -120,21 +226,22 @@ const u32 nfs4_statfs_bitmap[2] = {  	| FATTR4_WORD1_SPACE_TOTAL  }; -const u32 nfs4_pathconf_bitmap[2] = { +const u32 nfs4_pathconf_bitmap[3] = {  	FATTR4_WORD0_MAXLINK  	| FATTR4_WORD0_MAXNAME,  	0  }; -const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE +const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE  			| FATTR4_WORD0_MAXREAD  			| FATTR4_WORD0_MAXWRITE  			| FATTR4_WORD0_LEASE_TIME,  			FATTR4_WORD1_TIME_DELTA -			| FATTR4_WORD1_FS_LAYOUT_TYPES +			| FATTR4_WORD1_FS_LAYOUT_TYPES, +			FATTR4_WORD2_LAYOUT_BLKSIZE  }; -const u32 nfs4_fs_locations_bitmap[2] = { +const u32 nfs4_fs_locations_bitmap[3] = {  	FATTR4_WORD0_TYPE  	| FATTR4_WORD0_CHANGE  	| FATTR4_WORD0_SIZE @@ -150,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {  	| FATTR4_WORD1_TIME_ACCESS  	| FATTR4_WORD1_TIME_METADATA  	| 
FATTR4_WORD1_TIME_MODIFY -	| FATTR4_WORD1_MOUNTED_ON_FILEID +	| FATTR4_WORD1_MOUNTED_ON_FILEID,  };  static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, @@ -158,7 +265,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent  {  	__be32 *start, *p; -	BUG_ON(readdir->count < 80);  	if (cookie > 2) {  		readdir->cookie = cookie;  		memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); @@ -177,7 +283,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent  	 * when talking to the server, we always send cookie 0  	 * instead of 1 or 2.  	 */ -	start = p = kmap_atomic(*readdir->pages, KM_USER0); +	start = p = kmap_atomic(*readdir->pages);  	if (cookie == 0) {  		*p++ = xdr_one;                                  /* next */ @@ -205,18 +311,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent  	readdir->pgbase = (char *)p - (char *)start;  	readdir->count -= readdir->pgbase; -	kunmap_atomic(start, KM_USER0); -} - -static int nfs4_wait_clnt_recover(struct nfs_client *clp) -{ -	int res; - -	might_sleep(); - -	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, -			nfs_wait_bit_killable, TASK_KILLABLE); -	return res; +	kunmap_atomic(start);  }  static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) @@ -229,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)  		*timeout = NFS4_POLL_RETRY_MIN;  	if (*timeout > NFS4_POLL_RETRY_MAX)  		*timeout = NFS4_POLL_RETRY_MAX; -	schedule_timeout_killable(*timeout); +	freezable_schedule_timeout_killable_unsafe(*timeout);  	if (fatal_signal_pending(current))  		res = -ERESTARTSYS;  	*timeout <<= 1; @@ -239,27 +334,61 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)  /* This is the error handling routine for processes that are allowed   * to sleep.   
*/ -static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)  {  	struct nfs_client *clp = server->nfs_client;  	struct nfs4_state *state = exception->state; +	struct inode *inode = exception->inode;  	int ret = errorcode;  	exception->retry = 0;  	switch(errorcode) {  		case 0:  			return 0; +		case -NFS4ERR_OPENMODE: +			if (inode && nfs4_have_delegation(inode, FMODE_READ)) { +				nfs4_inode_return_delegation(inode); +				exception->retry = 1; +				return 0; +			} +			if (state == NULL) +				break; +			ret = nfs4_schedule_stateid_recovery(server, state); +			if (ret < 0) +				break; +			goto wait_on_recovery; +		case -NFS4ERR_DELEG_REVOKED:  		case -NFS4ERR_ADMIN_REVOKED:  		case -NFS4ERR_BAD_STATEID: -		case -NFS4ERR_OPENMODE: +			if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) { +				nfs_remove_bad_delegation(inode); +				exception->retry = 1; +				break; +			}  			if (state == NULL)  				break; -			nfs4_state_mark_reclaim_nograce(clp, state); -			goto do_state_recovery; +			ret = nfs4_schedule_stateid_recovery(server, state); +			if (ret < 0) +				break; +			goto wait_on_recovery; +		case -NFS4ERR_EXPIRED: +			if (state != NULL) { +				ret = nfs4_schedule_stateid_recovery(server, state); +				if (ret < 0) +					break; +			}  		case -NFS4ERR_STALE_STATEID:  		case -NFS4ERR_STALE_CLIENTID: -		case -NFS4ERR_EXPIRED: -			goto do_state_recovery; +			nfs4_schedule_lease_recovery(clp); +			goto wait_on_recovery; +		case -NFS4ERR_MOVED: +			ret = nfs4_schedule_migration_recovery(server); +			if (ret < 0) +				break; +			goto wait_on_recovery; +		case -NFS4ERR_LEASE_MOVED: +			nfs4_schedule_lease_moved_recovery(clp); +			goto wait_on_recovery;  #if defined(CONFIG_NFS_V4_1)  		case -NFS4ERR_BADSESSION:  		case -NFS4ERR_BADSLOT: @@ -270,9 +399,8 @@ static int nfs4_handle_exception(const 
struct nfs_server *server, int errorcode,  		case -NFS4ERR_SEQ_MISORDERED:  			dprintk("%s ERROR: %d Reset session\n", __func__,  				errorcode); -			nfs4_schedule_state_recovery(clp); -			exception->retry = 1; -			break; +			nfs4_schedule_session_recovery(clp->cl_session, errorcode); +			goto wait_on_recovery;  #endif /* defined(CONFIG_NFS_V4_1) */  		case -NFS4ERR_FILE_OPEN:  			if (exception->timeout > HZ) { @@ -284,23 +412,51 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,  			}  		case -NFS4ERR_GRACE:  		case -NFS4ERR_DELAY: -		case -EKEYEXPIRED:  			ret = nfs4_delay(server->client, &exception->timeout);  			if (ret != 0)  				break; +		case -NFS4ERR_RETRY_UNCACHED_REP:  		case -NFS4ERR_OLD_STATEID:  			exception->retry = 1; +			break; +		case -NFS4ERR_BADOWNER: +			/* The following works around a Linux server bug! */ +		case -NFS4ERR_BADNAME: +			if (server->caps & NFS_CAP_UIDGID_NOMAP) { +				server->caps &= ~NFS_CAP_UIDGID_NOMAP; +				exception->retry = 1; +				printk(KERN_WARNING "NFS: v4 server %s " +						"does not accept raw " +						"uid/gids. " +						"Reenabling the idmapper.\n", +						server->nfs_client->cl_hostname); +			}  	}  	/* We failed to handle the error */  	return nfs4_map_errors(ret); -do_state_recovery: -	nfs4_schedule_state_recovery(clp); +wait_on_recovery:  	ret = nfs4_wait_clnt_recover(clp); +	if (test_bit(NFS_MIG_FAILED, &server->mig_status)) +		return -EIO;  	if (ret == 0)  		exception->retry = 1;  	return ret;  } +/* + * Return 'true' if 'clp' is using an rpc_client that is integrity protected + * or 'false' otherwise. 
+ */ +static bool _nfs4_is_integrity_protected(struct nfs_client *clp) +{ +	rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor; + +	if (flavor == RPC_AUTH_GSS_KRB5I || +	    flavor == RPC_AUTH_GSS_KRB5P) +		return true; + +	return false; +}  static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)  { @@ -315,342 +471,400 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp  	do_renew_lease(server->nfs_client, timestamp);  } -#if defined(CONFIG_NFS_V4_1) +struct nfs4_call_sync_data { +	const struct nfs_server *seq_server; +	struct nfs4_sequence_args *seq_args; +	struct nfs4_sequence_res *seq_res; +}; -/* - * nfs4_free_slot - free a slot and efficiently update slot table. - * - * freeing a slot is trivially done by clearing its respective bit - * in the bitmap. - * If the freed slotid equals highest_used_slotid we want to update it - * so that the server would be able to size down the slot table if needed, - * otherwise we know that the highest_used_slotid is still in use. - * When updating highest_used_slotid there may be "holes" in the bitmap - * so we need to scan down from highest_used_slotid to 0 looking for the now - * highest slotid in use. - * If none found, highest_used_slotid is set to -1. 
- * - * Must be called while holding tbl->slot_tbl_lock - */ -static void -nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) +static void nfs4_init_sequence(struct nfs4_sequence_args *args, +			       struct nfs4_sequence_res *res, int cache_reply)  { -	int free_slotid = free_slot - tbl->slots; -	int slotid = free_slotid; +	args->sa_slot = NULL; +	args->sa_cache_this = cache_reply; +	args->sa_privileged = 0; -	BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); -	/* clear used bit in bitmap */ -	__clear_bit(slotid, tbl->used_slots); +	res->sr_slot = NULL; +} -	/* update highest_used_slotid when it is freed */ -	if (slotid == tbl->highest_used_slotid) { -		slotid = find_last_bit(tbl->used_slots, tbl->max_slots); -		if (slotid < tbl->max_slots) -			tbl->highest_used_slotid = slotid; -		else -			tbl->highest_used_slotid = -1; -	} -	dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__, -		free_slotid, tbl->highest_used_slotid); +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ +	args->sa_privileged = 1;  } -/* - * Signal state manager thread if session is drained - */ -static void nfs41_check_drain_session_complete(struct nfs4_session *ses) +static int nfs40_setup_sequence(const struct nfs_server *server, +				struct nfs4_sequence_args *args, +				struct nfs4_sequence_res *res, +				struct rpc_task *task)  { -	struct rpc_task *task; +	struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl; +	struct nfs4_slot *slot; -	if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { -		task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); -		if (task) -			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); -		return; +	/* slot already allocated? 
*/ +	if (res->sr_slot != NULL) +		goto out_start; + +	spin_lock(&tbl->slot_tbl_lock); +	if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) +		goto out_sleep; + +	slot = nfs4_alloc_slot(tbl); +	if (IS_ERR(slot)) { +		if (slot == ERR_PTR(-ENOMEM)) +			task->tk_timeout = HZ >> 2; +		goto out_sleep;  	} +	spin_unlock(&tbl->slot_tbl_lock); -	if (ses->fc_slot_table.highest_used_slotid != -1) -		return; +	args->sa_slot = slot; +	res->sr_slot = slot; -	dprintk("%s COMPLETE: Session Drained\n", __func__); -	complete(&ses->complete); +out_start: +	rpc_call_start(task); +	return 0; + +out_sleep: +	if (args->sa_privileged) +		rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, +				NULL, RPC_PRIORITY_PRIVILEGED); +	else +		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); +	spin_unlock(&tbl->slot_tbl_lock); +	return -EAGAIN;  } -static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +static int nfs40_sequence_done(struct rpc_task *task, +			       struct nfs4_sequence_res *res)  { +	struct nfs4_slot *slot = res->sr_slot;  	struct nfs4_slot_table *tbl; -	tbl = &res->sr_session->fc_slot_table; -	if (!res->sr_slot) { -		/* just wake up the next guy waiting since -		 * we may have not consumed a slot after all */ -		dprintk("%s: No slot\n", __func__); -		return; -	} +	if (slot == NULL) +		goto out; +	tbl = slot->table;  	spin_lock(&tbl->slot_tbl_lock); -	nfs4_free_slot(tbl, res->sr_slot); -	nfs41_check_drain_session_complete(res->sr_session); +	if (!nfs41_wake_and_assign_slot(tbl, slot)) +		nfs4_free_slot(tbl, slot);  	spin_unlock(&tbl->slot_tbl_lock); +  	res->sr_slot = NULL; +out: +	return 1;  } -static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +#if defined(CONFIG_NFS_V4_1) + +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)  { -	unsigned long timestamp; -	struct nfs_client *clp; +	struct nfs4_session *session; +	struct nfs4_slot_table *tbl; +	struct nfs4_slot *slot = res->sr_slot; +	bool 
send_new_highest_used_slotid = false; -	/* -	 * sr_status remains 1 if an RPC level error occurred. The server -	 * may or may not have processed the sequence operation.. -	 * Proceed as if the server received and processed the sequence -	 * operation. +	tbl = slot->table; +	session = tbl->session; + +	spin_lock(&tbl->slot_tbl_lock); +	/* Be nice to the server: try to ensure that the last transmitted +	 * value for highest_user_slotid <= target_highest_slotid  	 */ -	if (res->sr_status == 1) -		res->sr_status = NFS_OK; +	if (tbl->highest_used_slotid > tbl->target_highest_slotid) +		send_new_highest_used_slotid = true; + +	if (nfs41_wake_and_assign_slot(tbl, slot)) { +		send_new_highest_used_slotid = false; +		goto out_unlock; +	} +	nfs4_free_slot(tbl, slot); + +	if (tbl->highest_used_slotid != NFS4_NO_SLOT) +		send_new_highest_used_slotid = false; +out_unlock: +	spin_unlock(&tbl->slot_tbl_lock); +	res->sr_slot = NULL; +	if (send_new_highest_used_slotid) +		nfs41_server_notify_highest_slotid_update(session->clp); +} -	/* -ERESTARTSYS can result in skipping nfs41_sequence_setup */ -	if (!res->sr_slot) +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ +	struct nfs4_session *session; +	struct nfs4_slot *slot = res->sr_slot; +	struct nfs_client *clp; +	bool interrupted = false; +	int ret = 1; + +	if (slot == NULL) +		goto out_noaction; +	/* don't increment the sequence number if the task wasn't sent */ +	if (!RPC_WAS_SENT(task))  		goto out; +	session = slot->table->session; + +	if (slot->interrupted) { +		slot->interrupted = 0; +		interrupted = true; +	} + +	trace_nfs4_sequence_done(session, res);  	/* Check the SEQUENCE operation status */  	switch (res->sr_status) {  	case 0:  		/* Update the slot's sequence and clientid lease timer */ -		++res->sr_slot->seq_nr; -		timestamp = res->sr_renewal_time; -		clp = res->sr_session->clp; -		do_renew_lease(clp, timestamp); +		++slot->seq_nr; +		clp = session->clp; +		do_renew_lease(clp, 
res->sr_timestamp);  		/* Check sequence flags */ -		if (atomic_read(&clp->cl_count) > 1) -			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); +		if (res->sr_status_flags != 0) +			nfs4_schedule_lease_recovery(clp); +		nfs41_update_target_slotid(slot->table, slot, res);  		break; +	case 1: +		/* +		 * sr_status remains 1 if an RPC level error occurred. +		 * The server may or may not have processed the sequence +		 * operation.. +		 * Mark the slot as having hosted an interrupted RPC call. +		 */ +		slot->interrupted = 1; +		goto out;  	case -NFS4ERR_DELAY:  		/* The server detected a resend of the RPC call and  		 * returned NFS4ERR_DELAY as per Section 2.10.6.2  		 * of RFC5661.  		 */ -		dprintk("%s: slot=%td seq=%d: Operation in progress\n", +		dprintk("%s: slot=%u seq=%u: Operation in progress\n",  			__func__, -			res->sr_slot - res->sr_session->fc_slot_table.slots, -			res->sr_slot->seq_nr); +			slot->slot_nr, +			slot->seq_nr);  		goto out_retry; +	case -NFS4ERR_BADSLOT: +		/* +		 * The slot id we used was probably retired. Try again +		 * using a different slot id. +		 */ +		goto retry_nowait; +	case -NFS4ERR_SEQ_MISORDERED: +		/* +		 * Was the last operation on this sequence interrupted? +		 * If so, retry after bumping the sequence number. +		 */ +		if (interrupted) { +			++slot->seq_nr; +			goto retry_nowait; +		} +		/* +		 * Could this slot have been previously retired? +		 * If so, then the server may be expecting seq_nr = 1! +		 */ +		if (slot->seq_nr != 1) { +			slot->seq_nr = 1; +			goto retry_nowait; +		} +		break; +	case -NFS4ERR_SEQ_FALSE_RETRY: +		++slot->seq_nr; +		goto retry_nowait;  	default:  		/* Just update the slot sequence no. */ -		++res->sr_slot->seq_nr; +		++slot->seq_nr;  	}  out:  	/* The session may be reset by one of the error handlers. 
*/  	dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);  	nfs41_sequence_free_slot(res); -	return 1; +out_noaction: +	return ret; +retry_nowait: +	if (rpc_restart_call_prepare(task)) { +		task->tk_status = 0; +		ret = 0; +	} +	goto out;  out_retry:  	if (!rpc_restart_call(task))  		goto out;  	rpc_delay(task, NFS4_POLL_RETRY_MAX);  	return 0;  } +EXPORT_SYMBOL_GPL(nfs41_sequence_done);  static int nfs4_sequence_done(struct rpc_task *task,  			       struct nfs4_sequence_res *res)  { -	if (res->sr_session == NULL) +	if (res->sr_slot == NULL)  		return 1; +	if (!res->sr_slot->table->session) +		return nfs40_sequence_done(task, res);  	return nfs41_sequence_done(task, res);  } -/* - * nfs4_find_slot - efficiently look for a free slot - * - * nfs4_find_slot looks for an unset bit in the used_slots bitmap. - * If found, we mark the slot as used, update the highest_used_slotid, - * and respectively set up the sequence operation args. - * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise. - * - * Note: must be called with under the slot_tbl_lock. 
- */ -static u8 -nfs4_find_slot(struct nfs4_slot_table *tbl) -{ -	int slotid; -	u8 ret_id = NFS4_MAX_SLOT_TABLE; -	BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE); - -	dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n", -		__func__, tbl->used_slots[0], tbl->highest_used_slotid, -		tbl->max_slots); -	slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots); -	if (slotid >= tbl->max_slots) -		goto out; -	__set_bit(slotid, tbl->used_slots); -	if (slotid > tbl->highest_used_slotid) -		tbl->highest_used_slotid = slotid; -	ret_id = slotid; -out: -	dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n", -		__func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id); -	return ret_id; -} - -static int nfs41_setup_sequence(struct nfs4_session *session, +int nfs41_setup_sequence(struct nfs4_session *session,  				struct nfs4_sequence_args *args,  				struct nfs4_sequence_res *res, -				int cache_reply,  				struct rpc_task *task)  {  	struct nfs4_slot *slot;  	struct nfs4_slot_table *tbl; -	u8 slotid;  	dprintk("--> %s\n", __func__);  	/* slot already allocated? */  	if (res->sr_slot != NULL) -		return 0; +		goto out_success;  	tbl = &session->fc_slot_table; -	spin_lock(&tbl->slot_tbl_lock); -	if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && -	    !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { -		/* -		 * The state manager will wait until the slot table is empty. 
-		 * Schedule the reset thread -		 */ -		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); -		spin_unlock(&tbl->slot_tbl_lock); -		dprintk("%s Schedule Session Reset\n", __func__); -		return -EAGAIN; -	} +	task->tk_timeout = 0; -	if (!rpc_queue_empty(&tbl->slot_tbl_waitq) && -	    !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { -		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); -		spin_unlock(&tbl->slot_tbl_lock); -		dprintk("%s enforce FIFO order\n", __func__); -		return -EAGAIN; +	spin_lock(&tbl->slot_tbl_lock); +	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && +	    !args->sa_privileged) { +		/* The state manager will wait until the slot table is empty */ +		dprintk("%s session is draining\n", __func__); +		goto out_sleep;  	} -	slotid = nfs4_find_slot(tbl); -	if (slotid == NFS4_MAX_SLOT_TABLE) { -		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); -		spin_unlock(&tbl->slot_tbl_lock); +	slot = nfs4_alloc_slot(tbl); +	if (IS_ERR(slot)) { +		/* If out of memory, try again in 1/4 second */ +		if (slot == ERR_PTR(-ENOMEM)) +			task->tk_timeout = HZ >> 2;  		dprintk("<-- %s: no free slots\n", __func__); -		return -EAGAIN; +		goto out_sleep;  	}  	spin_unlock(&tbl->slot_tbl_lock); -	rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); -	slot = tbl->slots + slotid; -	args->sa_session = session; -	args->sa_slotid = slotid; -	args->sa_cache_this = cache_reply; +	args->sa_slot = slot; -	dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); +	dprintk("<-- %s slotid=%u seqid=%u\n", __func__, +			slot->slot_nr, slot->seq_nr); -	res->sr_session = session;  	res->sr_slot = slot; -	res->sr_renewal_time = jiffies; +	res->sr_timestamp = jiffies;  	res->sr_status_flags = 0;  	/*  	 * sr_status is only set in decode_sequence, and so will remain  	 * set to 1 if an rpc level failure occurs.  	 
*/  	res->sr_status = 1; +	trace_nfs4_setup_sequence(session, args); +out_success: +	rpc_call_start(task);  	return 0; +out_sleep: +	/* Privileged tasks are queued with top priority */ +	if (args->sa_privileged) +		rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, +				NULL, RPC_PRIORITY_PRIVILEGED); +	else +		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); +	spin_unlock(&tbl->slot_tbl_lock); +	return -EAGAIN;  } +EXPORT_SYMBOL_GPL(nfs41_setup_sequence); -int nfs4_setup_sequence(const struct nfs_server *server, -			struct nfs4_sequence_args *args, -			struct nfs4_sequence_res *res, -			int cache_reply, -			struct rpc_task *task) +static int nfs4_setup_sequence(const struct nfs_server *server, +			       struct nfs4_sequence_args *args, +			       struct nfs4_sequence_res *res, +			       struct rpc_task *task)  {  	struct nfs4_session *session = nfs4_get_session(server);  	int ret = 0; -	if (session == NULL) { -		args->sa_session = NULL; -		res->sr_session = NULL; -		goto out; -	} +	if (!session) +		return nfs40_setup_sequence(server, args, res, task); -	dprintk("--> %s clp %p session %p sr_slot %td\n", +	dprintk("--> %s clp %p session %p sr_slot %u\n",  		__func__, session->clp, session, res->sr_slot ? 
-			res->sr_slot - session->fc_slot_table.slots : -1); +			res->sr_slot->slot_nr : NFS4_NO_SLOT); + +	ret = nfs41_setup_sequence(session, args, res, task); -	ret = nfs41_setup_sequence(session, args, res, cache_reply, -				   task); -out:  	dprintk("<-- %s status=%d\n", __func__, ret);  	return ret;  } -struct nfs41_call_sync_data { -	const struct nfs_server *seq_server; -	struct nfs4_sequence_args *seq_args; -	struct nfs4_sequence_res *seq_res; -	int cache_reply; -}; -  static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)  { -	struct nfs41_call_sync_data *data = calldata; +	struct nfs4_call_sync_data *data = calldata; +	struct nfs4_session *session = nfs4_get_session(data->seq_server);  	dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); -	if (nfs4_setup_sequence(data->seq_server, data->seq_args, -				data->seq_res, data->cache_reply, task)) -		return; -	rpc_call_start(task); -} - -static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata) -{ -	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); -	nfs41_call_sync_prepare(task, calldata); +	nfs41_setup_sequence(session, data->seq_args, data->seq_res, task);  }  static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)  { -	struct nfs41_call_sync_data *data = calldata; +	struct nfs4_call_sync_data *data = calldata;  	nfs41_sequence_done(task, data->seq_res);  } -struct rpc_call_ops nfs41_call_sync_ops = { +static const struct rpc_call_ops nfs41_call_sync_ops = {  	.rpc_call_prepare = nfs41_call_sync_prepare,  	.rpc_call_done = nfs41_call_sync_done,  }; -struct rpc_call_ops nfs41_call_priv_sync_ops = { -	.rpc_call_prepare = nfs41_call_priv_sync_prepare, -	.rpc_call_done = nfs41_call_sync_done, +#else	/* !CONFIG_NFS_V4_1 */ + +static int nfs4_setup_sequence(const struct nfs_server *server, +			       struct nfs4_sequence_args *args, +			       struct nfs4_sequence_res *res, +			       struct rpc_task *task) +{ +	return 
nfs40_setup_sequence(server, args, res, task); +} + +static int nfs4_sequence_done(struct rpc_task *task, +			       struct nfs4_sequence_res *res) +{ +	return nfs40_sequence_done(task, res); +} + +#endif	/* !CONFIG_NFS_V4_1 */ + +static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata) +{ +	struct nfs4_call_sync_data *data = calldata; +	nfs4_setup_sequence(data->seq_server, +				data->seq_args, data->seq_res, task); +} + +static void nfs40_call_sync_done(struct rpc_task *task, void *calldata) +{ +	struct nfs4_call_sync_data *data = calldata; +	nfs4_sequence_done(task, data->seq_res); +} + +static const struct rpc_call_ops nfs40_call_sync_ops = { +	.rpc_call_prepare = nfs40_call_sync_prepare, +	.rpc_call_done = nfs40_call_sync_done,  }; -static int nfs4_call_sync_sequence(struct nfs_server *server, +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, +				   struct nfs_server *server,  				   struct rpc_message *msg,  				   struct nfs4_sequence_args *args, -				   struct nfs4_sequence_res *res, -				   int cache_reply, -				   int privileged) +				   struct nfs4_sequence_res *res)  {  	int ret;  	struct rpc_task *task; -	struct nfs41_call_sync_data data = { +	struct nfs_client *clp = server->nfs_client; +	struct nfs4_call_sync_data data = {  		.seq_server = server,  		.seq_args = args,  		.seq_res = res, -		.cache_reply = cache_reply,  	};  	struct rpc_task_setup task_setup = { -		.rpc_client = server->client, +		.rpc_client = clnt,  		.rpc_message = msg, -		.callback_ops = &nfs41_call_sync_ops, +		.callback_ops = clp->cl_mvops->call_sync_ops,  		.callback_data = &data  	}; -	res->sr_slot = NULL; -	if (privileged) -		task_setup.callback_ops = &nfs41_call_priv_sync_ops;  	task = rpc_run_task(&task_setup);  	if (IS_ERR(task))  		ret = PTR_ERR(task); @@ -661,46 +875,28 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,  	return ret;  } -int _nfs4_call_sync_session(struct nfs_server *server, -			    struct rpc_message *msg, -			   
 struct nfs4_sequence_args *args, -			    struct nfs4_sequence_res *res, -			    int cache_reply) +static +int nfs4_call_sync(struct rpc_clnt *clnt, +		   struct nfs_server *server, +		   struct rpc_message *msg, +		   struct nfs4_sequence_args *args, +		   struct nfs4_sequence_res *res, +		   int cache_reply)  { -	return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); +	nfs4_init_sequence(args, res, cache_reply); +	return nfs4_call_sync_sequence(clnt, server, msg, args, res);  } -#else -static int nfs4_sequence_done(struct rpc_task *task, -			       struct nfs4_sequence_res *res) -{ -	return 1; -} -#endif /* CONFIG_NFS_V4_1 */ - -int _nfs4_call_sync(struct nfs_server *server, -		    struct rpc_message *msg, -		    struct nfs4_sequence_args *args, -		    struct nfs4_sequence_res *res, -		    int cache_reply) -{ -	args->sa_session = res->sr_session = NULL; -	return rpc_call_sync(server->client, msg, 0); -} - -#define nfs4_call_sync(server, msg, args, res, cache_reply) \ -	(server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ -			&(res)->seq_res, (cache_reply)) -  static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)  {  	struct nfs_inode *nfsi = NFS_I(dir);  	spin_lock(&dir->i_lock); -	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; -	if (!cinfo->atomic || cinfo->before != nfsi->change_attr) +	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; +	if (!cinfo->atomic || cinfo->before != dir->i_version)  		nfs_force_lookup_revalidate(dir); -	nfsi->change_attr = cinfo->after; +	dir->i_version = cinfo->after; +	nfs_fscache_invalidate(dir);  	spin_unlock(&dir->i_lock);  } @@ -710,37 +906,73 @@ struct nfs4_opendata {  	struct nfs_openres o_res;  	struct nfs_open_confirmargs c_arg;  	struct nfs_open_confirmres c_res; +	struct nfs4_string owner_name; +	struct nfs4_string group_name;  	struct nfs_fattr f_attr; -	struct nfs_fattr dir_attr; -	struct 
path path; +	struct nfs4_label *f_label;  	struct dentry *dir; +	struct dentry *dentry;  	struct nfs4_state_owner *owner;  	struct nfs4_state *state;  	struct iattr attrs;  	unsigned long timestamp;  	unsigned int rpc_done : 1; +	unsigned int file_created : 1; +	unsigned int is_recover : 1;  	int rpc_status;  	int cancelled;  }; +static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, +		int err, struct nfs4_exception *exception) +{ +	if (err != -EINVAL) +		return false; +	if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) +		return false; +	server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1; +	exception->retry = 1; +	return true; +} + +static enum open_claim_type4 +nfs4_map_atomic_open_claim(struct nfs_server *server, +		enum open_claim_type4 claim) +{ +	if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) +		return claim; +	switch (claim) { +	default: +		return claim; +	case NFS4_OPEN_CLAIM_FH: +		return NFS4_OPEN_CLAIM_NULL; +	case NFS4_OPEN_CLAIM_DELEG_CUR_FH: +		return NFS4_OPEN_CLAIM_DELEGATE_CUR; +	case NFS4_OPEN_CLAIM_DELEG_PREV_FH: +		return NFS4_OPEN_CLAIM_DELEGATE_PREV; +	} +}  static void nfs4_init_opendata_res(struct nfs4_opendata *p)  {  	p->o_res.f_attr = &p->f_attr; -	p->o_res.dir_attr = &p->dir_attr; +	p->o_res.f_label = p->f_label;  	p->o_res.seqid = p->o_arg.seqid;  	p->c_res.seqid = p->c_arg.seqid;  	p->o_res.server = p->o_arg.server; +	p->o_res.access_request = p->o_arg.access;  	nfs_fattr_init(&p->f_attr); -	nfs_fattr_init(&p->dir_attr); +	nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);  } -static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, +static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,  		struct nfs4_state_owner *sp, fmode_t fmode, int flags,  		const struct iattr *attrs, +		struct nfs4_label *label, +		enum open_claim_type4 claim,  		gfp_t gfp_mask)  { -	struct dentry *parent = dget_parent(path->dentry); +	struct dentry *parent = dget_parent(dentry);  	struct inode *dir = parent->d_inode;  	
struct nfs_server *server = NFS_SERVER(dir);  	struct nfs4_opendata *p; @@ -748,31 +980,60 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,  	p = kzalloc(sizeof(*p), gfp_mask);  	if (p == NULL)  		goto err; + +	p->f_label = nfs4_label_alloc(server, gfp_mask); +	if (IS_ERR(p->f_label)) +		goto err_free_p; +  	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);  	if (p->o_arg.seqid == NULL) -		goto err_free; -	path_get(path); -	p->path = *path; +		goto err_free_label; +	nfs_sb_active(dentry->d_sb); +	p->dentry = dget(dentry);  	p->dir = parent;  	p->owner = sp;  	atomic_inc(&sp->so_count); -	p->o_arg.fh = NFS_FH(dir);  	p->o_arg.open_flags = flags;  	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); +	/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS +	 * will return permission denied for all bits until close */ +	if (!(flags & O_EXCL)) { +		/* ask server to check for all possible rights as results +		 * are cached */ +		p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | +				  NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; +	}  	p->o_arg.clientid = server->nfs_client->cl_clientid; -	p->o_arg.id = sp->so_owner_id.id; -	p->o_arg.name = &p->path.dentry->d_name; +	p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); +	p->o_arg.id.uniquifier = sp->so_seqid.owner_id; +	p->o_arg.name = &dentry->d_name;  	p->o_arg.server = server; -	p->o_arg.bitmask = server->attr_bitmask; -	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; -	if (flags & O_CREAT) { -		u32 *s; +	p->o_arg.bitmask = nfs4_bitmask(server, label); +	p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; +	p->o_arg.label = label; +	p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); +	switch (p->o_arg.claim) { +	case NFS4_OPEN_CLAIM_NULL: +	case NFS4_OPEN_CLAIM_DELEGATE_CUR: +	case NFS4_OPEN_CLAIM_DELEGATE_PREV: +		p->o_arg.fh = NFS_FH(dir); +		break; +	case NFS4_OPEN_CLAIM_PREVIOUS: +	case NFS4_OPEN_CLAIM_FH: +	case NFS4_OPEN_CLAIM_DELEG_CUR_FH: +	case 
NFS4_OPEN_CLAIM_DELEG_PREV_FH: +		p->o_arg.fh = NFS_FH(dentry->d_inode); +	} +	if (attrs != NULL && attrs->ia_valid != 0) { +		__u32 verf[2];  		p->o_arg.u.attrs = &p->attrs;  		memcpy(&p->attrs, attrs, sizeof(p->attrs)); -		s = (u32 *) p->o_arg.u.verifier.data; -		s[0] = jiffies; -		s[1] = current->pid; + +		verf[0] = jiffies; +		verf[1] = current->pid; +		memcpy(p->o_arg.u.verifier.data, verf, +				sizeof(p->o_arg.u.verifier.data));  	}  	p->c_arg.fh = &p->o_res.fh;  	p->c_arg.stateid = &p->o_res.stateid; @@ -780,7 +1041,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,  	nfs4_init_opendata_res(p);  	kref_init(&p->kref);  	return p; -err_free: + +err_free_label: +	nfs4_label_free(p->f_label); +err_free_p:  	kfree(p);  err:  	dput(parent); @@ -791,13 +1055,20 @@ static void nfs4_opendata_free(struct kref *kref)  {  	struct nfs4_opendata *p = container_of(kref,  			struct nfs4_opendata, kref); +	struct super_block *sb = p->dentry->d_sb;  	nfs_free_seqid(p->o_arg.seqid);  	if (p->state != NULL)  		nfs4_put_open_state(p->state);  	nfs4_put_state_owner(p->owner); + +	nfs4_label_free(p->f_label); +  	dput(p->dir); -	path_put(&p->path); +	dput(p->dentry); +	nfs_sb_deactive(sb); +	nfs_fattr_free_names(&p->f_attr); +	kfree(p->f_attr.mdsthreshold);  	kfree(p);  } @@ -819,7 +1090,7 @@ static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode  {  	int ret = 0; -	if (open_mode & O_EXCL) +	if (open_mode & (O_EXCL|O_TRUNC))  		goto out;  	switch (mode & (FMODE_READ|FMODE_WRITE)) {  		case FMODE_READ: @@ -840,10 +1111,14 @@ out:  static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)  { +	if (delegation == NULL) +		return 0;  	if ((delegation->type & fmode) != fmode)  		return 0;  	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))  		return 0; +	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) +		return 0;  	nfs_mark_delegation_referenced(delegation);  	return 1;  } @@ -863,11 
+1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)  	nfs4_state_set_mode_locked(state, state->state | fmode);  } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) +{ +	struct nfs_client *clp = state->owner->so_server->nfs_client; +	bool need_recover = false; + +	if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly) +		need_recover = true; +	if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly) +		need_recover = true; +	if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr) +		need_recover = true; +	if (need_recover) +		nfs4_state_mark_reclaim_nograce(clp, state); +} + +static bool nfs_need_update_open_stateid(struct nfs4_state *state, +		nfs4_stateid *stateid) +{ +	if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) +		return true; +	if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { +		nfs_test_and_clear_all_open_stateid(state); +		return true; +	} +	if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) +		return true; +	return false; +} + +static void nfs_clear_open_stateid_locked(struct nfs4_state *state, +		nfs4_stateid *stateid, fmode_t fmode)  { +	clear_bit(NFS_O_RDWR_STATE, &state->flags); +	switch (fmode & (FMODE_READ|FMODE_WRITE)) { +	case FMODE_WRITE: +		clear_bit(NFS_O_RDONLY_STATE, &state->flags); +		break; +	case FMODE_READ: +		clear_bit(NFS_O_WRONLY_STATE, &state->flags); +		break; +	case 0: +		clear_bit(NFS_O_RDONLY_STATE, &state->flags); +		clear_bit(NFS_O_WRONLY_STATE, &state->flags); +		clear_bit(NFS_OPEN_STATE, &state->flags); +	} +	if (stateid == NULL) +		return; +	if (!nfs_need_update_open_stateid(state, stateid)) +		return;  	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) -		memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); -	memcpy(state->open_stateid.data, 
stateid->data, sizeof(state->open_stateid.data)); +		nfs4_stateid_copy(&state->stateid, stateid); +	nfs4_stateid_copy(&state->open_stateid, stateid); +} + +static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ +	write_seqlock(&state->seqlock); +	nfs_clear_open_stateid_locked(state, stateid, fmode); +	write_sequnlock(&state->seqlock); +	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) +		nfs4_schedule_state_manager(state->owner->so_server->nfs_client); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{  	switch (fmode) {  		case FMODE_READ:  			set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -878,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *  		case FMODE_READ|FMODE_WRITE:  			set_bit(NFS_O_RDWR_STATE, &state->flags);  	} -} - -static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) -{ -	write_seqlock(&state->seqlock); -	nfs_set_open_stateid_locked(state, stateid, fmode); -	write_sequnlock(&state->seqlock); +	if (!nfs_need_update_open_stateid(state, stateid)) +		return; +	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +		nfs4_stateid_copy(&state->stateid, stateid); +	nfs4_stateid_copy(&state->open_stateid, stateid);  }  static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) @@ -895,7 +1228,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s  	 */  	write_seqlock(&state->seqlock);  	if (deleg_stateid != NULL) { -		memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); +		nfs4_stateid_copy(&state->stateid, deleg_stateid);  		set_bit(NFS_DELEGATED_STATE, &state->flags);  	}  	if (open_stateid != NULL) @@ -920,13 +1253,14 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat  		goto 
no_delegation;  	spin_lock(&deleg_cur->lock); -	if (nfsi->delegation != deleg_cur || +	if (rcu_dereference(nfsi->delegation) != deleg_cur || +	   test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||  	    (deleg_cur->type & fmode) != fmode)  		goto no_delegation_unlock;  	if (delegation == NULL)  		delegation = &deleg_cur->stateid; -	else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) +	else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation))  		goto no_delegation_unlock;  	nfs_mark_delegation_referenced(deleg_cur); @@ -941,6 +1275,8 @@ no_delegation:  		__update_open_stateid(state, open_stateid, NULL, fmode);  		ret = 1;  	} +	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) +		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);  	return ret;  } @@ -957,7 +1293,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo  		return;  	}  	rcu_read_unlock(); -	nfs_inode_return_delegation(inode); +	nfs4_inode_return_delegation(inode);  }  static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) @@ -965,7 +1301,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)  	struct nfs4_state *state = opendata->state;  	struct nfs_inode *nfsi = NFS_I(state->inode);  	struct nfs_delegation *delegation; -	int open_mode = opendata->o_arg.open_flags & O_EXCL; +	int open_mode = opendata->o_arg.open_flags;  	fmode_t fmode = opendata->o_arg.fmode;  	nfs4_stateid stateid;  	int ret = -EAGAIN; @@ -982,17 +1318,19 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)  		}  		rcu_read_lock();  		delegation = rcu_dereference(nfsi->delegation); -		if (delegation == NULL || -		    !can_open_delegated(delegation, fmode)) { +		if (!can_open_delegated(delegation, fmode)) {  			rcu_read_unlock();  			break;  		}  		/* Save the delegation */ -		memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); +		
nfs4_stateid_copy(&stateid, &delegation->stateid);  		rcu_read_unlock(); -		ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); -		if (ret != 0) -			goto out; +		nfs_release_seqid(opendata->o_arg.seqid); +		if (!opendata->is_recover) { +			ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); +			if (ret != 0) +				goto out; +		}  		ret = -EAGAIN;  		/* Try to update the stateid using the delegation */ @@ -1006,11 +1344,75 @@ out_return_state:  	return state;  } -static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +static void +nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) +{ +	struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client; +	struct nfs_delegation *delegation; +	int delegation_flags = 0; + +	rcu_read_lock(); +	delegation = rcu_dereference(NFS_I(state->inode)->delegation); +	if (delegation) +		delegation_flags = delegation->flags; +	rcu_read_unlock(); +	if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { +		pr_err_ratelimited("NFS: Broken NFSv4 server %s is " +				   "returning a delegation for " +				   "OPEN(CLAIM_DELEGATE_CUR)\n", +				   clp->cl_hostname); +	} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) +		nfs_inode_set_delegation(state->inode, +					 data->owner->so_cred, +					 &data->o_res); +	else +		nfs_inode_reclaim_delegation(state->inode, +					     data->owner->so_cred, +					     &data->o_res); +} + +/* + * Check the inode attributes against the CLAIM_PREVIOUS returned attributes + * and update the nfs4_state. 
+ */ +static struct nfs4_state * +_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) +{ +	struct inode *inode = data->state->inode; +	struct nfs4_state *state = data->state; +	int ret; + +	if (!data->rpc_done) { +		if (data->rpc_status) { +			ret = data->rpc_status; +			goto err; +		} +		/* cached opens have already been processed */ +		goto update; +	} + +	ret = nfs_refresh_inode(inode, &data->f_attr); +	if (ret) +		goto err; + +	if (data->o_res.delegation_type != 0) +		nfs4_opendata_check_deleg(data, state); +update: +	update_open_stateid(state, &data->o_res.stateid, NULL, +			    data->o_arg.fmode); +	atomic_inc(&state->count); + +	return state; +err: +	return ERR_PTR(ret); + +} + +static struct nfs4_state * +_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)  {  	struct inode *inode;  	struct nfs4_state *state = NULL; -	struct nfs_delegation *delegation;  	int ret;  	if (!data->rpc_done) { @@ -1021,7 +1423,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data  	ret = -EAGAIN;  	if (!(data->f_attr.valid & NFS_ATTR_FATTR))  		goto err; -	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); +	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);  	ret = PTR_ERR(inode);  	if (IS_ERR(inode))  		goto err; @@ -1029,28 +1431,13 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data  	state = nfs4_get_open_state(inode, data->owner);  	if (state == NULL)  		goto err_put_inode; -	if (data->o_res.delegation_type != 0) { -		int delegation_flags = 0; - -		rcu_read_lock(); -		delegation = rcu_dereference(NFS_I(inode)->delegation); -		if (delegation) -			delegation_flags = delegation->flags; -		rcu_read_unlock(); -		if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) -			nfs_inode_set_delegation(state->inode, -					data->owner->so_cred, -					&data->o_res); -		else -			nfs_inode_reclaim_delegation(state->inode, -					data->owner->so_cred, -			
		&data->o_res); -	} - +	if (data->o_res.delegation_type != 0) +		nfs4_opendata_check_deleg(data, state);  	update_open_stateid(state, &data->o_res.stateid, NULL,  			data->o_arg.fmode);  	iput(inode);  out: +	nfs_release_seqid(data->o_arg.seqid);  	return state;  err_put_inode:  	iput(inode); @@ -1058,6 +1445,14 @@ err:  	return ERR_PTR(ret);  } +static struct nfs4_state * +nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ +	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) +		return _nfs4_opendata_reclaim_to_nfs4_state(data); +	return _nfs4_opendata_to_nfs4_state(data); +} +  static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)  {  	struct nfs_inode *nfsi = NFS_I(state->inode); @@ -1075,11 +1470,13 @@ static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *  	return ERR_PTR(-ENOENT);  } -static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state) +static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, +		struct nfs4_state *state, enum open_claim_type4 claim)  {  	struct nfs4_opendata *opendata; -	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); +	opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, +			NULL, NULL, claim, GFP_NOFS);  	if (opendata == NULL)  		return ERR_PTR(-ENOMEM);  	opendata->state = state; @@ -1103,7 +1500,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod  	newstate = nfs4_opendata_to_nfs4_state(opendata);  	if (IS_ERR(newstate))  		return PTR_ERR(newstate); -	nfs4_close_state(&opendata->path, newstate, fmode); +	nfs4_close_state(newstate, fmode);  	*res = newstate;  	return 0;  } @@ -1113,11 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *  	struct nfs4_state *newstate;  	int ret; +	/* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ +	
clear_bit(NFS_O_RDWR_STATE, &state->flags); +	clear_bit(NFS_O_WRONLY_STATE, &state->flags); +	clear_bit(NFS_O_RDONLY_STATE, &state->flags);  	/* memory barrier prior to reading state->n_* */  	clear_bit(NFS_DELEGATED_STATE, &state->flags); +	clear_bit(NFS_OPEN_STATE, &state->flags);  	smp_rmb();  	if (state->n_rdwr != 0) { -		clear_bit(NFS_O_RDWR_STATE, &state->flags);  		ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);  		if (ret != 0)  			return ret; @@ -1125,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *  			return -ESTALE;  	}  	if (state->n_wronly != 0) { -		clear_bit(NFS_O_WRONLY_STATE, &state->flags);  		ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);  		if (ret != 0)  			return ret; @@ -1133,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *  			return -ESTALE;  	}  	if (state->n_rdonly != 0) { -		clear_bit(NFS_O_RDONLY_STATE, &state->flags);  		ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);  		if (ret != 0)  			return ret; @@ -1145,10 +1544,10 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *  	 * Check if we need to update the current stateid.  	 
*/  	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && -	    memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { +	    !nfs4_stateid_match(&state->stateid, &state->open_stateid)) {  		write_seqlock(&state->seqlock);  		if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) -			memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); +			nfs4_stateid_copy(&state->stateid, &state->open_stateid);  		write_sequnlock(&state->seqlock);  	}  	return 0; @@ -1165,11 +1564,10 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state  	fmode_t delegation_type = 0;  	int status; -	opendata = nfs4_open_recoverdata_alloc(ctx, state); +	opendata = nfs4_open_recoverdata_alloc(ctx, state, +			NFS4_OPEN_CLAIM_PREVIOUS);  	if (IS_ERR(opendata))  		return PTR_ERR(opendata); -	opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; -	opendata->o_arg.fh = NFS_FH(state->inode);  	rcu_read_lock();  	delegation = rcu_dereference(NFS_I(state->inode)->delegation);  	if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) @@ -1188,6 +1586,9 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state  	int err;  	do {  		err = _nfs4_do_open_reclaim(ctx, state); +		trace_nfs4_open_reclaim(ctx, 0, err); +		if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) +			continue;  		if (err != -NFS4ERR_DELAY)  			break;  		nfs4_handle_exception(server, err, &exception); @@ -1202,87 +1603,97 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta  	ctx = nfs4_state_find_open_context(state);  	if (IS_ERR(ctx)) -		return PTR_ERR(ctx); +		return -EAGAIN;  	ret = nfs4_do_open_reclaim(ctx, state);  	put_nfs_open_context(ctx);  	return ret;  } -static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs4_handle_delegation_recall_error(struct 
nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err)  { +	switch (err) { +		default: +			printk(KERN_ERR "NFS: %s: unhandled error " +					"%d.\n", __func__, err); +		case 0: +		case -ENOENT: +		case -ESTALE: +			break; +		case -NFS4ERR_BADSESSION: +		case -NFS4ERR_BADSLOT: +		case -NFS4ERR_BAD_HIGH_SLOT: +		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: +		case -NFS4ERR_DEADSESSION: +			set_bit(NFS_DELEGATED_STATE, &state->flags); +			nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); +			return -EAGAIN; +		case -NFS4ERR_STALE_CLIENTID: +		case -NFS4ERR_STALE_STATEID: +			set_bit(NFS_DELEGATED_STATE, &state->flags); +		case -NFS4ERR_EXPIRED: +			/* Don't recall a delegation if it was lost */ +			nfs4_schedule_lease_recovery(server->nfs_client); +			return -EAGAIN; +		case -NFS4ERR_MOVED: +			nfs4_schedule_migration_recovery(server); +			return -EAGAIN; +		case -NFS4ERR_LEASE_MOVED: +			nfs4_schedule_lease_moved_recovery(server->nfs_client); +			return -EAGAIN; +		case -NFS4ERR_DELEG_REVOKED: +		case -NFS4ERR_ADMIN_REVOKED: +		case -NFS4ERR_BAD_STATEID: +		case -NFS4ERR_OPENMODE: +			nfs_inode_find_state_and_recover(state->inode, +					stateid); +			nfs4_schedule_stateid_recovery(server, state); +			return 0; +		case -NFS4ERR_DELAY: +		case -NFS4ERR_GRACE: +			set_bit(NFS_DELEGATED_STATE, &state->flags); +			ssleep(1); +			return -EAGAIN; +		case -ENOMEM: +		case -NFS4ERR_DENIED: +			/* kill_proc(fl->fl_pid, SIGLOST, 1); */ +			return 0; +	} +	return err; +} + +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +{ +	struct nfs_server *server = NFS_SERVER(state->inode);  	struct nfs4_opendata *opendata; -	int ret; +	int err; -	opendata = nfs4_open_recoverdata_alloc(ctx, state); +	opendata = nfs4_open_recoverdata_alloc(ctx, state, +			NFS4_OPEN_CLAIM_DELEG_CUR_FH);  	if (IS_ERR(opendata))  		return PTR_ERR(opendata); -	opendata->o_arg.claim = 
NFS4_OPEN_CLAIM_DELEGATE_CUR; -	memcpy(opendata->o_arg.u.delegation.data, stateid->data, -			sizeof(opendata->o_arg.u.delegation.data)); -	ret = nfs4_open_recover(opendata, state); +	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); +	err = nfs4_open_recover(opendata, state);  	nfs4_opendata_put(opendata); -	return ret; +	return nfs4_handle_delegation_recall_error(server, state, stateid, err);  } -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)  { -	struct nfs4_exception exception = { }; -	struct nfs_server *server = NFS_SERVER(state->inode); -	int err; -	do { -		err = _nfs4_open_delegation_recall(ctx, state, stateid); -		switch (err) { -			case 0: -			case -ENOENT: -			case -ESTALE: -				goto out; -			case -NFS4ERR_BADSESSION: -			case -NFS4ERR_BADSLOT: -			case -NFS4ERR_BAD_HIGH_SLOT: -			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: -			case -NFS4ERR_DEADSESSION: -				nfs4_schedule_state_recovery( -					server->nfs_client); -				goto out; -			case -NFS4ERR_STALE_CLIENTID: -			case -NFS4ERR_STALE_STATEID: -			case -NFS4ERR_EXPIRED: -				/* Don't recall a delegation if it was lost */ -				nfs4_schedule_state_recovery(server->nfs_client); -				goto out; -			case -ERESTARTSYS: -				/* -				 * The show must go on: exit, but mark the -				 * stateid as needing recovery. -				 */ -			case -NFS4ERR_ADMIN_REVOKED: -			case -NFS4ERR_BAD_STATEID: -				nfs4_state_mark_reclaim_nograce(server->nfs_client, state); -			case -EKEYEXPIRED: -				/* -				 * User RPCSEC_GSS context has expired. -				 * We cannot recover this stateid now, so -				 * skip it and allow recovery thread to -				 * proceed. 
-				 */ -			case -ENOMEM: -				err = 0; -				goto out; -		} -		err = nfs4_handle_exception(server, err, &exception); -	} while (exception.retry); -out: -	return err; +	struct nfs4_opendata *data = calldata; + +	nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, +				&data->c_res.seq_res, task);  }  static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)  {  	struct nfs4_opendata *data = calldata; +	nfs40_sequence_done(task, &data->c_res.seq_res); +  	data->rpc_status = task->tk_status;  	if (data->rpc_status == 0) { -		memcpy(data->o_res.stateid.data, data->c_res.stateid.data, -				sizeof(data->o_res.stateid.data)); +		nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);  		nfs_confirm_seqid(&data->owner->so_seqid, 0);  		renew_lease(data->o_res.server, data->timestamp);  		data->rpc_done = 1; @@ -1302,12 +1713,13 @@ static void nfs4_open_confirm_release(void *calldata)  		goto out_free;  	state = nfs4_opendata_to_nfs4_state(data);  	if (!IS_ERR(state)) -		nfs4_close_state(&data->path, state, data->o_arg.fmode); +		nfs4_close_state(state, data->o_arg.fmode);  out_free:  	nfs4_opendata_put(data);  }  static const struct rpc_call_ops nfs4_open_confirm_ops = { +	.rpc_call_prepare = nfs4_open_confirm_prepare,  	.rpc_call_done = nfs4_open_confirm_done,  	.rpc_release = nfs4_open_confirm_release,  }; @@ -1335,6 +1747,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)  	};  	int status; +	nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1);  	kref_get(&data->kref);  	data->rpc_done = 0;  	data->rpc_status = 0; @@ -1356,9 +1769,10 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs4_opendata *data = calldata;  	struct nfs4_state_owner *sp = data->owner; +	struct nfs_client *clp = sp->so_server->nfs_client;  	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) -		return; +		goto out_wait;  	/*  	 * Check if we still need to send an OPEN call, or if we can use  	 
* a delegation instead. @@ -1370,36 +1784,46 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)  			goto out_no_action;  		rcu_read_lock();  		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); -		if (delegation != NULL && -		    test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) { -			rcu_read_unlock(); -			goto out_no_action; -		} +		if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && +		    data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && +		    can_open_delegated(delegation, data->o_arg.fmode)) +			goto unlock_no_action;  		rcu_read_unlock();  	} -	/* Update sequence id. */ -	data->o_arg.id = sp->so_owner_id.id; -	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; -	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { +	/* Update client id. */ +	data->o_arg.clientid = clp->cl_clientid; +	switch (data->o_arg.claim) { +	case NFS4_OPEN_CLAIM_PREVIOUS: +	case NFS4_OPEN_CLAIM_DELEG_CUR_FH: +	case NFS4_OPEN_CLAIM_DELEG_PREV_FH: +		data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; +	case NFS4_OPEN_CLAIM_FH:  		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];  		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);  	}  	data->timestamp = jiffies;  	if (nfs4_setup_sequence(data->o_arg.server,  				&data->o_arg.seq_args, -				&data->o_res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +				&data->o_res.seq_res, +				task) != 0) +		nfs_release_seqid(data->o_arg.seqid); + +	/* Set the create mode (note dependency on the session type) */ +	data->o_arg.createmode = NFS4_CREATE_UNCHECKED; +	if (data->o_arg.open_flags & O_EXCL) { +		data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE; +		if (nfs4_has_persistent_session(clp)) +			data->o_arg.createmode = NFS4_CREATE_GUARDED; +		else if (clp->cl_mvops->minor_version > 0) +			data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1; +	}  	return; +unlock_no_action: +	rcu_read_unlock();  out_no_action:  	task->tk_action = NULL; - -} - -static 
void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata) -{ -	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); -	nfs4_open_prepare(task, calldata); +out_wait: +	nfs4_sequence_done(task, &data->o_res.seq_res);  }  static void nfs4_open_done(struct rpc_task *task, void *calldata) @@ -1412,7 +1836,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)  		return;  	if (task->tk_status == 0) { -		switch (data->o_res.f_attr->mode & S_IFMT) { +		if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) { +			switch (data->o_res.f_attr->mode & S_IFMT) {  			case S_IFREG:  				break;  			case S_IFLNK: @@ -1423,6 +1848,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)  				break;  			default:  				data->rpc_status = -ENOTDIR; +			}  		}  		renew_lease(data->o_res.server, data->timestamp);  		if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) @@ -1447,7 +1873,7 @@ static void nfs4_open_release(void *calldata)  		goto out_free;  	state = nfs4_opendata_to_nfs4_state(data);  	if (!IS_ERR(state)) -		nfs4_close_state(&data->path, state, data->o_arg.fmode); +		nfs4_close_state(state, data->o_arg.fmode);  out_free:  	nfs4_opendata_put(data);  } @@ -1458,12 +1884,6 @@ static const struct rpc_call_ops nfs4_open_ops = {  	.rpc_release = nfs4_open_release,  }; -static const struct rpc_call_ops nfs4_recover_open_ops = { -	.rpc_call_prepare = nfs4_recover_open_prepare, -	.rpc_call_done = nfs4_open_done, -	.rpc_release = nfs4_open_release, -}; -  static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)  {  	struct inode *dir = data->dir->d_inode; @@ -1487,12 +1907,16 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)  	};  	int status; +	nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);  	kref_get(&data->kref);  	data->rpc_done = 0;  	data->rpc_status = 0;  	data->cancelled = 0; -	if (isrecover) -		task_setup_data.callback_ops = &nfs4_recover_open_ops; +	data->is_recover = 0; +	if 
(isrecover) { +		nfs4_set_sequence_privileged(&o_arg->seq_args); +		data->is_recover = 1; +	}  	task = rpc_run_task(&task_setup_data);          if (IS_ERR(task))                  return PTR_ERR(task); @@ -1517,7 +1941,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)  	if (status != 0 || !data->rpc_done)  		return status; -	nfs_refresh_inode(dir, o_res->dir_attr); +	nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);  	if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {  		status = _nfs4_proc_open_confirm(data); @@ -1528,6 +1952,43 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)  	return status;  } +static int nfs4_opendata_access(struct rpc_cred *cred, +				struct nfs4_opendata *opendata, +				struct nfs4_state *state, fmode_t fmode, +				int openflags) +{ +	struct nfs_access_entry cache; +	u32 mask; + +	/* access call failed or for some reason the server doesn't +	 * support any access modes -- defer access call until later */ +	if (opendata->o_res.access_supported == 0) +		return 0; + +	mask = 0; +	/* don't check MAY_WRITE - a newly created file may not have +	 * write mode bits, but POSIX allows the creating process to write. +	 * use openflags to check for exec, because fmode won't +	 * always have FMODE_EXEC set when file open for exec. */ +	if (openflags & __FMODE_EXEC) { +		/* ONLY check for exec rights */ +		mask = MAY_EXEC; +	} else if (fmode & FMODE_READ) +		mask = MAY_READ; + +	cache.cred = cred; +	cache.jiffies = jiffies; +	nfs_access_set_mask(&cache, opendata->o_res.access_result); +	nfs_access_add_cache(state->inode, &cache); + +	if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) +		return 0; + +	/* even though OPEN succeeded, access is denied. 
Close the file */ +	nfs4_close_state(state, fmode); +	return -EACCES; +} +  /*   * Note: On error, nfs4_proc_open will free the struct nfs4_opendata   */ @@ -1540,14 +2001,24 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)  	int status;  	status = nfs4_run_open_task(data, 0); -	if (status != 0 || !data->rpc_done) +	if (!data->rpc_done) +		return status; +	if (status != 0) { +		if (status == -NFS4ERR_BADNAME && +				!(o_arg->open_flags & O_CREAT)) +			return -ENOENT;  		return status; +	} + +	nfs_fattr_map_and_free_names(server, &data->f_attr);  	if (o_arg->open_flags & O_CREAT) {  		update_changeattr(dir, &o_res->cinfo); -		nfs_post_op_update_inode(dir, o_res->dir_attr); -	} else -		nfs_refresh_inode(dir, o_res->dir_attr); +		if (o_arg->open_flags & O_EXCL) +			data->file_created = 1; +		else if (o_res->cinfo.before != o_res->cinfo.after) +			data->file_created = 1; +	}  	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)  		server->caps &= ~NFS_CAP_POSIX_LOCK;  	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { @@ -1556,27 +2027,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)  			return status;  	}  	if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) -		_nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr); +		nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);  	return 0;  }  static int nfs4_recover_expired_lease(struct nfs_server *server)  { -	struct nfs_client *clp = server->nfs_client; -	unsigned int loop; -	int ret; - -	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { -		ret = nfs4_wait_clnt_recover(clp); -		if (ret != 0) -			break; -		if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && -		    !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) -			break; -		nfs4_schedule_state_recovery(clp); -		ret = -EIO; -	} -	return ret; +	return nfs4_client_recover_expired_lease(server->nfs_client);  }  /* @@ -1589,12 +2046,13 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s  	struct 
nfs4_opendata *opendata;  	int ret; -	opendata = nfs4_open_recoverdata_alloc(ctx, state); +	opendata = nfs4_open_recoverdata_alloc(ctx, state, +			NFS4_OPEN_CLAIM_FH);  	if (IS_ERR(opendata))  		return PTR_ERR(opendata);  	ret = nfs4_open_recover(opendata, state);  	if (ret == -ESTALE) -		d_drop(ctx->path.dentry); +		d_drop(ctx->dentry);  	nfs4_opendata_put(opendata);  	return ret;  } @@ -1607,6 +2065,9 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state  	do {  		err = _nfs4_open_expired(ctx, state); +		trace_nfs4_open_expired(ctx, 0, err); +		if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) +			continue;  		switch (err) {  		default:  			goto out; @@ -1627,12 +2088,103 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta  	ctx = nfs4_state_find_open_context(state);  	if (IS_ERR(ctx)) -		return PTR_ERR(ctx); +		return -EAGAIN;  	ret = nfs4_do_open_expired(ctx, state);  	put_nfs_open_context(ctx);  	return ret;  } +#if defined(CONFIG_NFS_V4_1) +static void nfs41_clear_delegation_stateid(struct nfs4_state *state) +{ +	struct nfs_server *server = NFS_SERVER(state->inode); +	nfs4_stateid *stateid = &state->stateid; +	struct nfs_delegation *delegation; +	struct rpc_cred *cred = NULL; +	int status = -NFS4ERR_BAD_STATEID; + +	/* If a state reset has been done, test_stateid is unneeded */ +	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +		return; + +	/* Get the delegation credential for use by test/free_stateid */ +	rcu_read_lock(); +	delegation = rcu_dereference(NFS_I(state->inode)->delegation); +	if (delegation != NULL && +	    nfs4_stateid_match(&delegation->stateid, stateid)) { +		cred = get_rpccred(delegation->cred); +		rcu_read_unlock(); +		status = nfs41_test_stateid(server, stateid, cred); +		trace_nfs4_test_delegation_stateid(state, NULL, status); +	} else +		rcu_read_unlock(); + +	if (status != NFS_OK) { +		/* Free the stateid unless the server explicitly +		 * informs us the 
stateid is unrecognized. */ +		if (status != -NFS4ERR_BAD_STATEID) +			nfs41_free_stateid(server, stateid, cred); +		nfs_remove_bad_delegation(state->inode); + +		write_seqlock(&state->seqlock); +		nfs4_stateid_copy(&state->stateid, &state->open_stateid); +		write_sequnlock(&state->seqlock); +		clear_bit(NFS_DELEGATED_STATE, &state->flags); +	} + +	if (cred != NULL) +		put_rpccred(cred); +} + +/** + * nfs41_check_open_stateid - possibly free an open stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. + */ +static int nfs41_check_open_stateid(struct nfs4_state *state) +{ +	struct nfs_server *server = NFS_SERVER(state->inode); +	nfs4_stateid *stateid = &state->open_stateid; +	struct rpc_cred *cred = state->owner->so_cred; +	int status; + +	/* If a state reset has been done, test_stateid is unneeded */ +	if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) && +	    (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) && +	    (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) +		return -NFS4ERR_BAD_STATEID; + +	status = nfs41_test_stateid(server, stateid, cred); +	trace_nfs4_test_open_stateid(state, NULL, status); +	if (status != NFS_OK) { +		/* Free the stateid unless the server explicitly +		 * informs us the stateid is unrecognized. 
*/ +		if (status != -NFS4ERR_BAD_STATEID) +			nfs41_free_stateid(server, stateid, cred); + +		clear_bit(NFS_O_RDONLY_STATE, &state->flags); +		clear_bit(NFS_O_WRONLY_STATE, &state->flags); +		clear_bit(NFS_O_RDWR_STATE, &state->flags); +		clear_bit(NFS_OPEN_STATE, &state->flags); +	} +	return status; +} + +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ +	int status; + +	nfs41_clear_delegation_stateid(state); +	status = nfs41_check_open_stateid(state); +	if (status != NFS_OK) +		status = nfs4_open_expired(sp, state); +	return status; +} +#endif +  /*   * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*   * fields corresponding to attributes that were used to store the verifier. @@ -1649,80 +2201,180 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct  		sattr->ia_valid |= ATTR_MTIME;  } +static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, +		fmode_t fmode, +		int flags, +		struct nfs_open_context *ctx) +{ +	struct nfs4_state_owner *sp = opendata->owner; +	struct nfs_server *server = sp->so_server; +	struct dentry *dentry; +	struct nfs4_state *state; +	unsigned int seq; +	int ret; + +	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + +	ret = _nfs4_proc_open(opendata); +	if (ret != 0) +		goto out; + +	state = nfs4_opendata_to_nfs4_state(opendata); +	ret = PTR_ERR(state); +	if (IS_ERR(state)) +		goto out; +	if (server->caps & NFS_CAP_POSIX_LOCK) +		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + +	dentry = opendata->dentry; +	if (dentry->d_inode == NULL) { +		/* FIXME: Is this d_drop() ever needed? 
*/ +		d_drop(dentry); +		dentry = d_add_unique(dentry, igrab(state->inode)); +		if (dentry == NULL) { +			dentry = opendata->dentry; +		} else if (dentry != ctx->dentry) { +			dput(ctx->dentry); +			ctx->dentry = dget(dentry); +		} +		nfs_set_verifier(dentry, +				nfs_save_change_attribute(opendata->dir->d_inode)); +	} + +	ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); +	if (ret != 0) +		goto out; + +	ctx->state = state; +	if (dentry->d_inode == state->inode) { +		nfs_inode_attach_open_context(ctx); +		if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) +			nfs4_schedule_stateid_recovery(server, state); +	} +out: +	return ret; +} +  /*   * Returns a referenced nfs4_state   */ -static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +static int _nfs4_do_open(struct inode *dir, +			struct nfs_open_context *ctx, +			int flags, +			struct iattr *sattr, +			struct nfs4_label *label, +			int *opened)  {  	struct nfs4_state_owner  *sp;  	struct nfs4_state     *state = NULL;  	struct nfs_server       *server = NFS_SERVER(dir);  	struct nfs4_opendata *opendata; +	struct dentry *dentry = ctx->dentry; +	struct rpc_cred *cred = ctx->cred; +	struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; +	fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); +	enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; +	struct nfs4_label *olabel = NULL;  	int status;  	/* Protect against reboot recovery conflicts */  	status = -ENOMEM; -	if (!(sp = nfs4_get_state_owner(server, cred))) { +	sp = nfs4_get_state_owner(server, cred, GFP_KERNEL); +	if (sp == NULL) {  		dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");  		goto out_err;  	}  	status = nfs4_recover_expired_lease(server);  	if (status != 0)  		goto err_put_state_owner; -	if (path->dentry->d_inode != NULL) -		nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); +	if 
(dentry->d_inode != NULL) +		nfs4_return_incompatible_delegation(dentry->d_inode, fmode);  	status = -ENOMEM; -	opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); +	if (dentry->d_inode) +		claim = NFS4_OPEN_CLAIM_FH; +	opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, +			label, claim, GFP_KERNEL);  	if (opendata == NULL)  		goto err_put_state_owner; -	if (path->dentry->d_inode != NULL) -		opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); +	if (label) { +		olabel = nfs4_label_alloc(server, GFP_KERNEL); +		if (IS_ERR(olabel)) { +			status = PTR_ERR(olabel); +			goto err_opendata_put; +		} +	} -	status = _nfs4_proc_open(opendata); -	if (status != 0) -		goto err_opendata_put; +	if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { +		if (!opendata->f_attr.mdsthreshold) { +			opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); +			if (!opendata->f_attr.mdsthreshold) +				goto err_free_label; +		} +		opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; +	} +	if (dentry->d_inode != NULL) +		opendata->state = nfs4_get_open_state(dentry->d_inode, sp); -	state = nfs4_opendata_to_nfs4_state(opendata); -	status = PTR_ERR(state); -	if (IS_ERR(state)) -		goto err_opendata_put; -	if (server->caps & NFS_CAP_POSIX_LOCK) -		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); +	status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); +	if (status != 0) +		goto err_free_label; +	state = ctx->state; -	if (opendata->o_arg.open_flags & O_EXCL) { +	if ((opendata->o_arg.open_flags & O_EXCL) && +	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {  		nfs4_exclusive_attrset(opendata, sattr);  		nfs_fattr_init(opendata->o_res.f_attr);  		status = nfs4_do_setattr(state->inode, cred,  				opendata->o_res.f_attr, sattr, -				state); -		if (status == 0) +				state, label, olabel); +		if (status == 0) {  			nfs_setattr_update_inode(state->inode, sattr); -		nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); 
+			nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); +			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); +		}  	} +	if (opendata->file_created) +		*opened |= FILE_CREATED; + +	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { +		*ctx_th = opendata->f_attr.mdsthreshold; +		opendata->f_attr.mdsthreshold = NULL; +	} + +	nfs4_label_free(olabel); +  	nfs4_opendata_put(opendata);  	nfs4_put_state_owner(sp); -	*res = state;  	return 0; +err_free_label: +	nfs4_label_free(olabel);  err_opendata_put:  	nfs4_opendata_put(opendata);  err_put_state_owner:  	nfs4_put_state_owner(sp);  out_err: -	*res = NULL;  	return status;  } -static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) +static struct nfs4_state *nfs4_do_open(struct inode *dir, +					struct nfs_open_context *ctx, +					int flags, +					struct iattr *sattr, +					struct nfs4_label *label, +					int *opened)  { +	struct nfs_server *server = NFS_SERVER(dir);  	struct nfs4_exception exception = { };  	struct nfs4_state *res;  	int status;  	do { -		status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res); +		status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); +		res = ctx->state; +		trace_nfs4_open_file(ctx, flags, status);  		if (status == 0)  			break;  		/* NOTE: BAD_SEQID means the server and client disagree about the @@ -1737,7 +2389,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmo  		 * the user though...  		 
*/  		if (status == -NFS4ERR_BAD_SEQID) { -			printk(KERN_WARNING "NFS: v4 server %s " +			pr_warn_ratelimited("NFS: v4 server %s "  					" returned a bad sequence-id error!\n",  					NFS_SERVER(dir)->nfs_client->cl_hostname);  			exception.retry = 1; @@ -1758,7 +2410,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmo  			exception.retry = 1;  			continue;  		} -		res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), +		if (nfs4_clear_cap_atomic_open_v1(server, status, &exception)) +			continue; +		res = ERR_PTR(nfs4_handle_exception(server,  					status, &exception));  	} while (exception.retry);  	return res; @@ -1766,7 +2420,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmo  static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  			    struct nfs_fattr *fattr, struct iattr *sattr, -			    struct nfs4_state *state) +			    struct nfs4_state *state, struct nfs4_label *ilabel, +			    struct nfs4_label *olabel)  {  	struct nfs_server *server = NFS_SERVER(inode);          struct nfs_setattrargs  arg = { @@ -1774,9 +2429,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,                  .iap            = sattr,  		.server		= server,  		.bitmask = server->attr_bitmask, +		.label		= ilabel,          };          struct nfs_setattrres  res = {  		.fattr		= fattr, +		.label		= olabel,  		.server		= server,          };          struct rpc_message msg = { @@ -1786,18 +2443,36 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  		.rpc_cred	= cred,          };  	unsigned long timestamp = jiffies; +	fmode_t fmode; +	bool truncate;  	int status; +	arg.bitmask = nfs4_bitmask(server, ilabel); +	if (ilabel) +		arg.bitmask = nfs4_bitmask(server, olabel); +  	nfs_fattr_init(fattr); -	if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { +	/* Servers should only apply open mode checks for file size changes */ +	truncate = (sattr->ia_valid & 
ATTR_SIZE) ? true : false; +	fmode = truncate ? FMODE_WRITE : FMODE_READ; + +	if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {  		/* Use that stateid */ -	} else if (state != NULL) { -		nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); +	} else if (truncate && state != NULL) { +		struct nfs_lockowner lockowner = { +			.l_owner = current->files, +			.l_pid = current->tgid, +		}; +		if (!nfs4_valid_open_stateid(state)) +			return -EBADF; +		if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, +				&lockowner) == -EIO) +			return -EBADF;  	} else -		memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); +		nfs4_stateid_copy(&arg.stateid, &zero_stateid); -	status = nfs4_call_sync(server, &msg, &arg, &res, 1); +	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);  	if (status == 0 && state != NULL)  		renew_lease(server, timestamp);  	return status; @@ -1805,71 +2480,88 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  			   struct nfs_fattr *fattr, struct iattr *sattr, -			   struct nfs4_state *state) +			   struct nfs4_state *state, struct nfs4_label *ilabel, +			   struct nfs4_label *olabel)  {  	struct nfs_server *server = NFS_SERVER(inode); -	struct nfs4_exception exception = { }; +	struct nfs4_exception exception = { +		.state = state, +		.inode = inode, +	};  	int err;  	do { -		err = nfs4_handle_exception(server, -				_nfs4_do_setattr(inode, cred, fattr, sattr, state), -				&exception); +		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); +		trace_nfs4_setattr(inode, err); +		switch (err) { +		case -NFS4ERR_OPENMODE: +			if (!(sattr->ia_valid & ATTR_SIZE)) { +				pr_warn_once("NFSv4: server %s is incorrectly " +						"applying open mode checks to " +						"a SETATTR that is not " +						"changing file size.\n", +						server->nfs_client->cl_hostname); +			} +			
if (state && !(state->state & FMODE_WRITE)) { +				err = -EBADF; +				if (sattr->ia_valid & ATTR_OPEN) +					err = -EACCES; +				goto out; +			} +		} +		err = nfs4_handle_exception(server, err, &exception);  	} while (exception.retry); +out:  	return err;  }  struct nfs4_closedata { -	struct path path;  	struct inode *inode;  	struct nfs4_state *state;  	struct nfs_closeargs arg;  	struct nfs_closeres res;  	struct nfs_fattr fattr;  	unsigned long timestamp; +	bool roc; +	u32 roc_barrier;  };  static void nfs4_free_closedata(void *data)  {  	struct nfs4_closedata *calldata = data;  	struct nfs4_state_owner *sp = calldata->state->owner; +	struct super_block *sb = calldata->state->inode->i_sb; +	if (calldata->roc) +		pnfs_roc_release(calldata->state->inode);  	nfs4_put_open_state(calldata->state);  	nfs_free_seqid(calldata->arg.seqid);  	nfs4_put_state_owner(sp); -	path_put(&calldata->path); +	nfs_sb_deactive(sb);  	kfree(calldata);  } -static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, -		fmode_t fmode) -{ -	spin_lock(&state->owner->so_lock); -	if (!(fmode & FMODE_READ)) -		clear_bit(NFS_O_RDONLY_STATE, &state->flags); -	if (!(fmode & FMODE_WRITE)) -		clear_bit(NFS_O_WRONLY_STATE, &state->flags); -	clear_bit(NFS_O_RDWR_STATE, &state->flags); -	spin_unlock(&state->owner->so_lock); -} -  static void nfs4_close_done(struct rpc_task *task, void *data)  {  	struct nfs4_closedata *calldata = data;  	struct nfs4_state *state = calldata->state;  	struct nfs_server *server = NFS_SERVER(calldata->inode); +	dprintk("%s: begin!\n", __func__);  	if (!nfs4_sequence_done(task, &calldata->res.seq_res))  		return; +	trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);          /* hmm. we are done with the inode, and in the process of freeing  	 * the state_owner. 
we keep this around to process errors  	 */  	switch (task->tk_status) {  		case 0: -			nfs_set_open_stateid(state, &calldata->res.stateid, 0); +			if (calldata->roc) +				pnfs_roc_set_barrier(state->inode, +						     calldata->roc_barrier); +			nfs_clear_open_stateid(state, &calldata->res.stateid, 0);  			renew_lease(server, calldata->timestamp); -			nfs4_close_clear_stateid_flags(state, -					calldata->arg.fmode); -			break; +			goto out_release; +		case -NFS4ERR_ADMIN_REVOKED:  		case -NFS4ERR_STALE_STATEID:  		case -NFS4ERR_OLD_STATEID:  		case -NFS4ERR_BAD_STATEID: @@ -1877,21 +2569,28 @@ static void nfs4_close_done(struct rpc_task *task, void *data)  			if (calldata->arg.fmode == 0)  				break;  		default: -			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) +			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {  				rpc_restart_call_prepare(task); +				goto out_release; +			}  	} +	nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); +out_release:  	nfs_release_seqid(calldata->arg.seqid);  	nfs_refresh_inode(calldata->inode, calldata->res.fattr); +	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);  }  static void nfs4_close_prepare(struct rpc_task *task, void *data)  {  	struct nfs4_closedata *calldata = data;  	struct nfs4_state *state = calldata->state; +	struct inode *inode = calldata->inode;  	int call_close = 0; +	dprintk("%s: begin!\n", __func__);  	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) -		return; +		goto out_wait;  	task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];  	calldata->arg.fmode = FMODE_READ|FMODE_WRITE; @@ -1909,24 +2608,37 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)  			calldata->arg.fmode &= ~FMODE_WRITE;  		}  	} +	if (!nfs4_valid_open_stateid(state)) +		call_close = 0;  	spin_unlock(&state->owner->so_lock);  	if (!call_close) {  		/* Note: exit _without_ calling nfs4_close_done */ -		task->tk_action = NULL; -		return; +		goto 
out_no_action;  	} -	if (calldata->arg.fmode == 0) +	if (calldata->arg.fmode == 0) {  		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; +		if (calldata->roc && +		    pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { +			nfs_release_seqid(calldata->arg.seqid); +			goto out_wait; +		    } +	}  	nfs_fattr_init(calldata->res.fattr);  	calldata->timestamp = jiffies; -	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), -				&calldata->arg.seq_args, &calldata->res.seq_res, -				1, task)) -		return; -	rpc_call_start(task); +	if (nfs4_setup_sequence(NFS_SERVER(inode), +				&calldata->arg.seq_args, +				&calldata->res.seq_res, +				task) != 0) +		nfs_release_seqid(calldata->arg.seqid); +	dprintk("%s: done!\n", __func__); +	return; +out_no_action: +	task->tk_action = NULL; +out_wait: +	nfs4_sequence_done(task, &calldata->res.seq_res);  }  static const struct rpc_call_ops nfs4_close_ops = { @@ -1946,7 +2658,7 @@ static const struct rpc_call_ops nfs4_close_ops = {   *   * NOTE: Caller must be holding the sp->so_owner semaphore!   
*/ -int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) +int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)  {  	struct nfs_server *server = NFS_SERVER(state->inode);  	struct nfs4_closedata *calldata; @@ -1965,9 +2677,13 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i  	};  	int status = -ENOMEM; +	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, +		&task_setup_data.rpc_client, &msg); +  	calldata = kzalloc(sizeof(*calldata), gfp_mask);  	if (calldata == NULL)  		goto out; +	nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);  	calldata->inode = state->inode;  	calldata->state = state;  	calldata->arg.fh = NFS_FH(state->inode); @@ -1981,11 +2697,11 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, i  	calldata->res.fattr = &calldata->fattr;  	calldata->res.seqid = calldata->arg.seqid;  	calldata->res.server = server; -	path_get(path); -	calldata->path = *path; +	calldata->roc = pnfs_roc(state->inode); +	nfs_sb_active(calldata->inode->i_sb); -	msg.rpc_argp = &calldata->arg, -	msg.rpc_resp = &calldata->res, +	msg.rpc_argp = &calldata->arg; +	msg.rpc_resp = &calldata->res;  	task_setup_data.callback_data = calldata;  	task = rpc_run_task(&task_setup_data);  	if (IS_ERR(task)) @@ -2004,16 +2720,22 @@ out:  }  static struct inode * -nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, +		int open_flags, struct iattr *attr, int *opened)  {  	struct nfs4_state *state; +	struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; + +	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);  	/* Protect against concurrent sillydeletes */ -	state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred); +	state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened); + +	
nfs4_label_release_security(label); +  	if (IS_ERR(state))  		return ERR_CAST(state); -	ctx->state = state; -	return igrab(state->inode); +	return state->inode;  }  static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) @@ -2021,11 +2743,15 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)  	if (ctx->state == NULL)  		return;  	if (is_sync) -		nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); +		nfs4_close_sync(ctx->state, ctx->mode);  	else -		nfs4_close_state(&ctx->path, ctx->state, ctx->mode); +		nfs4_close_state(ctx->state, ctx->mode);  } +#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) +#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) +  static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)  {  	struct nfs4_server_caps_arg args = { @@ -2039,15 +2765,29 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f  	};  	int status; -	status = nfs4_call_sync(server, &msg, &args, &res, 0); +	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);  	if (status == 0) { +		/* Sanity check the server answers */ +		switch (server->nfs_client->cl_minorversion) { +		case 0: +			res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; +			res.attr_bitmask[2] = 0; +			break; +		case 1: +			res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; +			break; +		case 2: +			res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; +		}  		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));  		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|  				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|  				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|  				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| -				NFS_CAP_CTIME|NFS_CAP_MTIME); -		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) +				NFS_CAP_CTIME|NFS_CAP_MTIME| +				NFS_CAP_SECURITY_LABEL); +		if 
(res.attr_bitmask[0] & FATTR4_WORD0_ACL && +				res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)  			server->caps |= NFS_CAP_ACLS;  		if (res.has_links != 0)  			server->caps |= NFS_CAP_HARDLINKS; @@ -2069,11 +2809,20 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f  			server->caps |= NFS_CAP_CTIME;  		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)  			server->caps |= NFS_CAP_MTIME; +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +		if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) +			server->caps |= NFS_CAP_SECURITY_LABEL; +#endif +		memcpy(server->attr_bitmask_nl, res.attr_bitmask, +				sizeof(server->attr_bitmask)); +		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;  		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));  		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;  		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; +		server->cache_consistency_bitmask[2] = 0;  		server->acl_bitmask = res.acl_bitmask; +		server->fh_expire_type = res.fh_expire_type;  	}  	return status; @@ -2094,8 +2843,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)  static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,  		struct nfs_fsinfo *info)  { +	u32 bitmask[3];  	struct nfs4_lookup_root_arg args = { -		.bitmask = nfs4_fattr_bitmap, +		.bitmask = bitmask,  	};  	struct nfs4_lookup_res res = {  		.server = server, @@ -2108,8 +2858,15 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,  		.rpc_resp = &res,  	}; +	bitmask[0] = nfs4_fattr_bitmap[0]; +	bitmask[1] = nfs4_fattr_bitmap[1]; +	/* +	 * Process the label in the upcoming getfattr +	 */ +	bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL; +  	nfs_fattr_init(info->fattr); -	return nfs4_call_sync(server, &msg, &args, &res, 0); +	return 
nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);  }  static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -2118,35 +2875,174 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,  	struct nfs4_exception exception = { };  	int err;  	do { -		err = nfs4_handle_exception(server, -				_nfs4_lookup_root(server, fhandle, info), -				&exception); +		err = _nfs4_lookup_root(server, fhandle, info); +		trace_nfs4_lookup_root(server, fhandle, info->fattr, err); +		switch (err) { +		case 0: +		case -NFS4ERR_WRONGSEC: +			goto out; +		default: +			err = nfs4_handle_exception(server, err, &exception); +		}  	} while (exception.retry); +out:  	return err;  } +static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, +				struct nfs_fsinfo *info, rpc_authflavor_t flavor) +{ +	struct rpc_auth_create_args auth_args = { +		.pseudoflavor = flavor, +	}; +	struct rpc_auth *auth; +	int ret; + +	auth = rpcauth_create(&auth_args, server->client); +	if (IS_ERR(auth)) { +		ret = -EACCES; +		goto out; +	} +	ret = nfs4_lookup_root(server, fhandle, info); +out: +	return ret; +} +  /* - * get the file handle for the "/" directory on the server + * Retry pseudoroot lookup with various security flavors.  We do this when: + * + *   NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC + *   NFSv4.1: the server does not support the SECINFO_NO_NAME operation + * + * Returns zero on success, or a negative NFS4ERR value, or a + * negative errno value.   
*/ -static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, +static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,  			      struct nfs_fsinfo *info)  { +	/* Per 3530bis 15.33.5 */ +	static const rpc_authflavor_t flav_array[] = { +		RPC_AUTH_GSS_KRB5P, +		RPC_AUTH_GSS_KRB5I, +		RPC_AUTH_GSS_KRB5, +		RPC_AUTH_UNIX,			/* courtesy */ +		RPC_AUTH_NULL, +	}; +	int status = -EPERM; +	size_t i; + +	if (server->auth_info.flavor_len > 0) { +		/* try each flavor specified by user */ +		for (i = 0; i < server->auth_info.flavor_len; i++) { +			status = nfs4_lookup_root_sec(server, fhandle, info, +						server->auth_info.flavors[i]); +			if (status == -NFS4ERR_WRONGSEC || status == -EACCES) +				continue; +			break; +		} +	} else { +		/* no flavors specified by user, try default list */ +		for (i = 0; i < ARRAY_SIZE(flav_array); i++) { +			status = nfs4_lookup_root_sec(server, fhandle, info, +						      flav_array[i]); +			if (status == -NFS4ERR_WRONGSEC || status == -EACCES) +				continue; +			break; +		} +	} + +	/* +	 * -EACCESS could mean that the user doesn't have correct permissions +	 * to access the mount.  It could also mean that we tried to mount +	 * with a gss auth flavor, but rpc.gssd isn't running.  Either way, +	 * existing mount programs don't handle -EACCES very well so it should +	 * be mapped to -EPERM instead. 
+	 */ +	if (status == -EACCES) +		status = -EPERM; +	return status; +} + +static int nfs4_do_find_root_sec(struct nfs_server *server, +		struct nfs_fh *fhandle, struct nfs_fsinfo *info) +{ +	int mv = server->nfs_client->cl_minorversion; +	return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); +} + +/** + * nfs4_proc_get_rootfh - get file handle for server's pseudoroot + * @server: initialized nfs_server handle + * @fhandle: we fill in the pseudo-fs root file handle + * @info: we fill in an FSINFO struct + * @auth_probe: probe the auth flavours + * + * Returns zero on success, or a negative errno. + */ +int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, +			 struct nfs_fsinfo *info, +			 bool auth_probe) +{  	int status; -	status = nfs4_lookup_root(server, fhandle, info); +	switch (auth_probe) { +	case false: +		status = nfs4_lookup_root(server, fhandle, info); +		if (status != -NFS4ERR_WRONGSEC) +			break; +	default: +		status = nfs4_do_find_root_sec(server, fhandle, info); +	} +  	if (status == 0)  		status = nfs4_server_capabilities(server, fhandle);  	if (status == 0)  		status = nfs4_do_fsinfo(server, fhandle, info); +  	return nfs4_map_errors(status);  } +static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, +			      struct nfs_fsinfo *info) +{ +	int error; +	struct nfs_fattr *fattr = info->fattr; +	struct nfs4_label *label = NULL; + +	error = nfs4_server_capabilities(server, mntfh); +	if (error < 0) { +		dprintk("nfs4_get_root: getcaps error = %d\n", -error); +		return error; +	} + +	label = nfs4_label_alloc(server, GFP_KERNEL); +	if (IS_ERR(label)) +		return PTR_ERR(label); + +	error = nfs4_proc_getattr(server, mntfh, fattr, label); +	if (error < 0) { +		dprintk("nfs4_get_root: getattr error = %d\n", -error); +		goto err_free_label; +	} + +	if (fattr->valid & NFS_ATTR_FATTR_FSID && +	    !nfs_fsid_equal(&server->fsid, &fattr->fsid)) +		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + 
+err_free_label: +	nfs4_label_free(label); + +	return error; +} +  /*   * Get locations and (maybe) other attributes of a referral.   * Note that we'll actually follow the referral later when   * we detect fsid mismatch in inode revalidation   */ -static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle) +static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, +			     const struct qstr *name, struct nfs_fattr *fattr, +			     struct nfs_fh *fhandle)  {  	int status = -ENOMEM;  	struct page *page = NULL; @@ -2159,20 +3055,26 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct  	if (locations == NULL)  		goto out; -	status = nfs4_proc_fs_locations(dir, name, locations, page); +	status = nfs4_proc_fs_locations(client, dir, name, locations, page);  	if (status != 0)  		goto out; -	/* Make sure server returned a different fsid for the referral */ + +	/* +	 * If the fsid didn't change, this is a migration event, not a +	 * referral.  Cause us to drop into the exception handler, which +	 * will kick off migration recovery. 
+	 */  	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { -		dprintk("%s: server did not return a different fsid for a referral at %s\n", __func__, name->name); -		status = -EIO; +		dprintk("%s: server did not return a different fsid for" +			" a referral at %s\n", __func__, name->name); +		status = -NFS4ERR_MOVED;  		goto out;  	} +	/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ +	nfs_fixup_referral_attributes(&locations->fattr); +	/* replace the lookup nfs_fattr with the locations nfs_fattr */  	memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); -	fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL; -	if (!fattr->mode) -		fattr->mode = S_IFDIR;  	memset(fhandle, 0, sizeof(struct nfs_fh));  out:  	if (page) @@ -2181,7 +3083,8 @@ out:  	return status;  } -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, +				struct nfs_fattr *fattr, struct nfs4_label *label)  {  	struct nfs4_getattr_arg args = {  		.fh = fhandle, @@ -2189,6 +3092,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,  	};  	struct nfs4_getattr_res res = {  		.fattr = fattr, +		.label = label,  		.server = server,  	};  	struct rpc_message msg = { @@ -2196,18 +3100,22 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,  		.rpc_argp = &args,  		.rpc_resp = &res,  	}; -	 + +	args.bitmask = nfs4_bitmask(server, label); +  	nfs_fattr_init(fattr); -	return nfs4_call_sync(server, &msg, &args, &res, 0); +	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);  } -static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, +				struct nfs_fattr *fattr, struct nfs4_label *label)  {  	struct nfs4_exception exception = { 
};  	int err;  	do { -		err = nfs4_handle_exception(server, -				_nfs4_proc_getattr(server, fhandle, fattr), +		err = _nfs4_proc_getattr(server, fhandle, fattr, label); +		trace_nfs4_getattr(server, fhandle, fattr, err); +		err = nfs4_handle_exception(server, err,  				&exception);  	} while (exception.retry);  	return err; @@ -2237,10 +3145,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,  	struct inode *inode = dentry->d_inode;  	struct rpc_cred *cred = NULL;  	struct nfs4_state *state = NULL; +	struct nfs4_label *label = NULL;  	int status; +	if (pnfs_ld_layoutret_on_setattr(inode)) +		pnfs_commit_and_return_layout(inode); +  	nfs_fattr_init(fattr); +	/* Deal with open(O_TRUNC) */ +	if (sattr->ia_valid & ATTR_OPEN) +		sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME); + +	/* Optimization: if the end result is no change, don't RPC */ +	if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) +		return 0; +  	/* Search for an existing open(O_WRITE) file */  	if (sattr->ia_valid & ATTR_FILE) {  		struct nfs_open_context *ctx; @@ -2252,25 +3172,34 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,  		}  	} -	status = nfs4_do_setattr(inode, cred, fattr, sattr, state); -	if (status == 0) +	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); +	if (IS_ERR(label)) +		return PTR_ERR(label); + +	status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); +	if (status == 0) {  		nfs_setattr_update_inode(inode, sattr); +		nfs_setsecurity(inode, fattr, label); +	} +	nfs4_label_free(label);  	return status;  } -static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *dirfh, +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,  		const struct qstr *name, struct nfs_fh *fhandle, -		struct nfs_fattr *fattr) +		struct nfs_fattr *fattr, struct nfs4_label *label)  { +	struct nfs_server *server = NFS_SERVER(dir);  	int		       status;  	struct nfs4_lookup_arg args = {  		.bitmask = 
server->attr_bitmask, -		.dir_fh = dirfh, +		.dir_fh = NFS_FH(dir),  		.name = name,  	};  	struct nfs4_lookup_res res = {  		.server = server,  		.fattr = fattr, +		.label = label,  		.fh = fhandle,  	};  	struct rpc_message msg = { @@ -2279,55 +3208,91 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d  		.rpc_resp = &res,  	}; +	args.bitmask = nfs4_bitmask(server, label); +  	nfs_fattr_init(fattr); -	dprintk("NFS call  lookupfh %s\n", name->name); -	status = nfs4_call_sync(server, &msg, &args, &res, 0); -	dprintk("NFS reply lookupfh: %d\n", status); +	dprintk("NFS call  lookup %s\n", name->name); +	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); +	dprintk("NFS reply lookup: %d\n", status);  	return status;  } -static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, -			      struct qstr *name, struct nfs_fh *fhandle, -			      struct nfs_fattr *fattr) +static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) +{ +	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | +		NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_MOUNTPOINT; +	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; +	fattr->nlink = 2; +} + +static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, +				   struct qstr *name, struct nfs_fh *fhandle, +				   struct nfs_fattr *fattr, struct nfs4_label *label)  {  	struct nfs4_exception exception = { }; +	struct rpc_clnt *client = *clnt;  	int err;  	do { -		err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); -		/* FIXME: !!!! 
*/ -		if (err == -NFS4ERR_MOVED) { -			err = -EREMOTE; +		err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label); +		trace_nfs4_lookup(dir, name, err); +		switch (err) { +		case -NFS4ERR_BADNAME: +			err = -ENOENT; +			goto out; +		case -NFS4ERR_MOVED: +			err = nfs4_get_referral(client, dir, name, fattr, fhandle); +			goto out; +		case -NFS4ERR_WRONGSEC: +			err = -EPERM; +			if (client != *clnt) +				goto out; +			client = nfs4_negotiate_security(client, dir, name); +			if (IS_ERR(client)) +				return PTR_ERR(client); + +			exception.retry = 1;  			break; +		default: +			err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);  		} -		err = nfs4_handle_exception(server, err, &exception);  	} while (exception.retry); + +out: +	if (err == 0) +		*clnt = client; +	else if (client != *clnt) +		rpc_shutdown_client(client); +  	return err;  } -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, -		struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, +			    struct nfs_fh *fhandle, struct nfs_fattr *fattr, +			    struct nfs4_label *label)  {  	int status; -	 -	dprintk("NFS call  lookup %s\n", name->name); -	status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); -	if (status == -NFS4ERR_MOVED) -		status = nfs4_get_referral(dir, name, fattr, fhandle); -	dprintk("NFS reply lookup: %d\n", status); +	struct rpc_clnt *client = NFS_CLIENT(dir); + +	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label); +	if (client != NFS_CLIENT(dir)) { +		rpc_shutdown_client(client); +		nfs_fixup_secinfo_attributes(fattr); +	}  	return status;  } -static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +struct rpc_clnt * +nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, +			    struct nfs_fh *fhandle, struct nfs_fattr *fattr)  { -	struct nfs4_exception exception = { }; -	
int err; -	do { -		err = nfs4_handle_exception(NFS_SERVER(dir), -				_nfs4_proc_lookup(dir, name, fhandle, fattr), -				&exception); -	} while (exception.retry); -	return err; +	struct rpc_clnt *client = NFS_CLIENT(dir); +	int status; + +	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); +	if (status < 0) +		return ERR_PTR(status); +	return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;  }  static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) @@ -2335,7 +3300,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry  	struct nfs_server *server = NFS_SERVER(inode);  	struct nfs4_accessargs args = {  		.fh = NFS_FH(inode), -		.bitmask = server->attr_bitmask, +		.bitmask = server->cache_consistency_bitmask,  	};  	struct nfs4_accessres res = {  		.server = server, @@ -2347,7 +3312,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry  		.rpc_cred = entry->cred,  	};  	int mode = entry->mask; -	int status; +	int status = 0;  	/*  	 * Determine which access bits we want to ask for... 
@@ -2370,15 +3335,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry  	if (res.fattr == NULL)  		return -ENOMEM; -	status = nfs4_call_sync(server, &msg, &args, &res, 0); +	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);  	if (!status) { -		entry->mask = 0; -		if (res.access & NFS4_ACCESS_READ) -			entry->mask |= MAY_READ; -		if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) -			entry->mask |= MAY_WRITE; -		if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) -			entry->mask |= MAY_EXEC; +		nfs_access_set_mask(entry, res.access);  		nfs_refresh_inode(inode, res.fattr);  	}  	nfs_free_fattr(res.fattr); @@ -2390,8 +3349,9 @@ static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)  	struct nfs4_exception exception = { };  	int err;  	do { -		err = nfs4_handle_exception(NFS_SERVER(inode), -				_nfs4_proc_access(inode, entry), +		err = _nfs4_proc_access(inode, entry); +		trace_nfs4_access(inode, err); +		err = nfs4_handle_exception(NFS_SERVER(inode), err,  				&exception);  	} while (exception.retry);  	return err; @@ -2414,9 +3374,7 @@ static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)   *   * In the case of WRITE, we also want to put the GETATTR after   * the operation -- in this case because we want to make sure - * we get the post-operation mtime and size.  This means that - * we can't use xdr_encode_pages() as written: we need a variant - * of it which would leave room in the 'tail' iovec. + * we get the post-operation mtime and size.   *   * Both of these changes to the XDR layer would in fact be quite   * minor, but I decided to leave them for a subsequent patch. 
@@ -2437,7 +3395,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,  		.rpc_resp = &res,  	}; -	return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); +	return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);  }  static int nfs4_proc_readlink(struct inode *inode, struct page *page, @@ -2446,59 +3404,42 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page,  	struct nfs4_exception exception = { };  	int err;  	do { -		err = nfs4_handle_exception(NFS_SERVER(inode), -				_nfs4_proc_readlink(inode, page, pgbase, pglen), +		err = _nfs4_proc_readlink(inode, page, pgbase, pglen); +		trace_nfs4_readlink(inode, err); +		err = nfs4_handle_exception(NFS_SERVER(inode), err,  				&exception);  	} while (exception.retry);  	return err;  }  /* - * Got race? - * We will need to arrange for the VFS layer to provide an atomic open. - * Until then, this create/open method is prone to inefficiency and race - * conditions due to the lookup, create, and open VFS calls from sys_open() - * placed on the wire. - * - * Given the above sorry state of affairs, I'm simply sending an OPEN. - * The file will be opened again in the subsequent VFS open call - * (nfs4_proc_file_open). - * - * The open for read will just hang around to be used by any process that - * opens the file O_RDONLY. This will all be resolved with the VFS changes. + * This is just for mknod.  open(O_CREAT) will always do ->open_context().   
*/ -  static int  nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, -                 int flags, struct nfs_open_context *ctx) +		 int flags)  { -	struct path my_path = { -		.dentry = dentry, -	}; -	struct path *path = &my_path; +	struct nfs4_label l, *ilabel = NULL; +	struct nfs_open_context *ctx;  	struct nfs4_state *state; -	struct rpc_cred *cred = NULL; -	fmode_t fmode = 0; +	int opened = 0;  	int status = 0; -	if (ctx != NULL) { -		cred = ctx->cred; -		path = &ctx->path; -		fmode = ctx->mode; -	} -	state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); -	d_drop(dentry); +	ctx = alloc_nfs_open_context(dentry, FMODE_READ); +	if (IS_ERR(ctx)) +		return PTR_ERR(ctx); + +	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); + +	sattr->ia_mode &= ~current_umask(); +	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);  	if (IS_ERR(state)) {  		status = PTR_ERR(state);  		goto out;  	} -	d_add(dentry, igrab(state->inode)); -	nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); -	if (ctx != NULL) -		ctx->state = state; -	else -		nfs4_close_sync(path, state, fmode);  out: +	nfs4_label_release_security(ilabel); +	put_nfs_open_context(ctx);  	return status;  } @@ -2507,9 +3448,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)  	struct nfs_server *server = NFS_SERVER(dir);  	struct nfs_removeargs args = {  		.fh = NFS_FH(dir), -		.name.len = name->len, -		.name.name = name->name, -		.bitmask = server->attr_bitmask, +		.name = *name,  	};  	struct nfs_removeres res = {  		.server = server, @@ -2519,19 +3458,11 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)  		.rpc_argp = &args,  		.rpc_resp = &res,  	}; -	int status = -ENOMEM; - -	res.dir_attr = nfs_alloc_fattr(); -	if (res.dir_attr == NULL) -		goto out; +	int status; -	status = nfs4_call_sync(server, &msg, &args, &res, 1); -	if (status == 0) { +	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, 
&res.seq_res, 1); +	if (status == 0)  		update_changeattr(dir, &res.cinfo); -		nfs_post_op_update_inode(dir, res.dir_attr); -	} -	nfs_free_fattr(res.dir_attr); -out:  	return status;  } @@ -2540,8 +3471,9 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name)  	struct nfs4_exception exception = { };  	int err;  	do { -		err = nfs4_handle_exception(NFS_SERVER(dir), -				_nfs4_proc_remove(dir, name), +		err = _nfs4_proc_remove(dir, name); +		trace_nfs4_remove(dir, name, err); +		err = nfs4_handle_exception(NFS_SERVER(dir), err,  				&exception);  	} while (exception.retry);  	return err; @@ -2553,22 +3485,31 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)  	struct nfs_removeargs *args = msg->rpc_argp;  	struct nfs_removeres *res = msg->rpc_resp; -	args->bitmask = server->cache_consistency_bitmask;  	res->server = server; -	res->seq_res.sr_slot = NULL;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; +	nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); + +	nfs_fattr_init(res->dir_attr); +} + +static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ +	nfs4_setup_sequence(NFS_SERVER(data->dir), +			&data->args.seq_args, +			&data->res.seq_res, +			task);  }  static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)  { -	struct nfs_removeres *res = task->tk_msg.rpc_resp; +	struct nfs_unlinkdata *data = task->tk_calldata; +	struct nfs_removeres *res = &data->res;  	if (!nfs4_sequence_done(task, &res->seq_res))  		return 0;  	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)  		return 0;  	update_changeattr(dir, &res->cinfo); -	nfs_post_op_update_inode(dir, res->dir_attr);  	return 1;  } @@ -2579,14 +3520,23 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)  	struct nfs_renameres *res = msg->rpc_resp;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; -	arg->bitmask = server->attr_bitmask;  	res->server = 
server; +	nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); +} + +static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ +	nfs4_setup_sequence(NFS_SERVER(data->old_dir), +			&data->args.seq_args, +			&data->res.seq_res, +			task);  }  static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  				 struct inode *new_dir)  { -	struct nfs_renameres *res = task->tk_msg.rpc_resp; +	struct nfs_renamedata *data = task->tk_calldata; +	struct nfs_renameres *res = &data->res;  	if (!nfs4_sequence_done(task, &res->seq_res))  		return 0; @@ -2594,65 +3544,10 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  		return 0;  	update_changeattr(old_dir, &res->old_cinfo); -	nfs_post_op_update_inode(old_dir, res->old_fattr);  	update_changeattr(new_dir, &res->new_cinfo); -	nfs_post_op_update_inode(new_dir, res->new_fattr);  	return 1;  } -static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, -		struct inode *new_dir, struct qstr *new_name) -{ -	struct nfs_server *server = NFS_SERVER(old_dir); -	struct nfs_renameargs arg = { -		.old_dir = NFS_FH(old_dir), -		.new_dir = NFS_FH(new_dir), -		.old_name = old_name, -		.new_name = new_name, -		.bitmask = server->attr_bitmask, -	}; -	struct nfs_renameres res = { -		.server = server, -	}; -	struct rpc_message msg = { -		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], -		.rpc_argp = &arg, -		.rpc_resp = &res, -	}; -	int status = -ENOMEM; -	 -	res.old_fattr = nfs_alloc_fattr(); -	res.new_fattr = nfs_alloc_fattr(); -	if (res.old_fattr == NULL || res.new_fattr == NULL) -		goto out; - -	status = nfs4_call_sync(server, &msg, &arg, &res, 1); -	if (!status) { -		update_changeattr(old_dir, &res.old_cinfo); -		nfs_post_op_update_inode(old_dir, res.old_fattr); -		update_changeattr(new_dir, &res.new_cinfo); -		nfs_post_op_update_inode(new_dir, res.new_fattr); -	} -out: -	nfs_free_fattr(res.new_fattr); -	nfs_free_fattr(res.old_fattr); 
-	return status; -} - -static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, -		struct inode *new_dir, struct qstr *new_name) -{ -	struct nfs4_exception exception = { }; -	int err; -	do { -		err = nfs4_handle_exception(NFS_SERVER(old_dir), -				_nfs4_proc_rename(old_dir, old_name, -					new_dir, new_name), -				&exception); -	} while (exception.retry); -	return err; -} -  static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)  {  	struct nfs_server *server = NFS_SERVER(inode); @@ -2664,6 +3559,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *  	};  	struct nfs4_link_res res = {  		.server = server, +		.label = NULL,  	};  	struct rpc_message msg = {  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], @@ -2673,18 +3569,28 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *  	int status = -ENOMEM;  	res.fattr = nfs_alloc_fattr(); -	res.dir_attr = nfs_alloc_fattr(); -	if (res.fattr == NULL || res.dir_attr == NULL) +	if (res.fattr == NULL)  		goto out; -	status = nfs4_call_sync(server, &msg, &arg, &res, 1); +	res.label = nfs4_label_alloc(server, GFP_KERNEL); +	if (IS_ERR(res.label)) { +		status = PTR_ERR(res.label); +		goto out; +	} +	arg.bitmask = nfs4_bitmask(server, res.label); + +	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);  	if (!status) {  		update_changeattr(dir, &res.cinfo); -		nfs_post_op_update_inode(dir, res.dir_attr); -		nfs_post_op_update_inode(inode, res.fattr); +		status = nfs_post_op_update_inode(inode, res.fattr); +		if (!status) +			nfs_setsecurity(inode, res.fattr, res.label);  	} + + +	nfs4_label_free(res.label); +  out: -	nfs_free_fattr(res.dir_attr);  	nfs_free_fattr(res.fattr);  	return status;  } @@ -2707,7 +3613,7 @@ struct nfs4_createdata {  	struct nfs4_create_res res;  	struct nfs_fh fh;  	struct nfs_fattr fattr; -	struct nfs_fattr dir_fattr; +	struct nfs4_label *label;  };  static 
struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, @@ -2719,6 +3625,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,  	if (data != NULL) {  		struct nfs_server *server = NFS_SERVER(dir); +		data->label = nfs4_label_alloc(server, GFP_KERNEL); +		if (IS_ERR(data->label)) +			goto out_free; +  		data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];  		data->msg.rpc_argp = &data->arg;  		data->msg.rpc_resp = &data->res; @@ -2727,36 +3637,39 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,  		data->arg.name = name;  		data->arg.attrs = sattr;  		data->arg.ftype = ftype; -		data->arg.bitmask = server->attr_bitmask; +		data->arg.bitmask = nfs4_bitmask(server, data->label);  		data->res.server = server;  		data->res.fh = &data->fh;  		data->res.fattr = &data->fattr; -		data->res.dir_fattr = &data->dir_fattr; +		data->res.label = data->label;  		nfs_fattr_init(data->res.fattr); -		nfs_fattr_init(data->res.dir_fattr);  	}  	return data; +out_free: +	kfree(data); +	return NULL;  }  static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)  { -	int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg, -				    &data->arg, &data->res, 1); +	int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, +				    &data->arg.seq_args, &data->res.seq_res, 1);  	if (status == 0) {  		update_changeattr(dir, &data->res.dir_cinfo); -		nfs_post_op_update_inode(dir, data->res.dir_fattr); -		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); +		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);  	}  	return status;  }  static void nfs4_free_createdata(struct nfs4_createdata *data)  { +	nfs4_label_free(data->label);  	kfree(data);  }  static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, -		struct page *page, unsigned int len, struct iattr *sattr) +		struct page *page, unsigned int len, struct 
iattr *sattr, +		struct nfs4_label *label)  {  	struct nfs4_createdata *data;  	int status = -ENAMETOOLONG; @@ -2772,6 +3685,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,  	data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];  	data->arg.u.symlink.pages = &page;  	data->arg.u.symlink.len = len; +	data->arg.label = label;  	status = nfs4_do_create(dir, dentry, data); @@ -2784,18 +3698,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,  		struct page *page, unsigned int len, struct iattr *sattr)  {  	struct nfs4_exception exception = { }; +	struct nfs4_label l, *label = NULL;  	int err; + +	label = nfs4_label_init_security(dir, dentry, sattr, &l); +  	do { -		err = nfs4_handle_exception(NFS_SERVER(dir), -				_nfs4_proc_symlink(dir, dentry, page, -							len, sattr), +		err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label); +		trace_nfs4_symlink(dir, &dentry->d_name, err); +		err = nfs4_handle_exception(NFS_SERVER(dir), err,  				&exception);  	} while (exception.retry); + +	nfs4_label_release_security(label);  	return err;  }  static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, -		struct iattr *sattr) +		struct iattr *sattr, struct nfs4_label *label)  {  	struct nfs4_createdata *data;  	int status = -ENOMEM; @@ -2804,6 +3724,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,  	if (data == NULL)  		goto out; +	data->arg.label = label;  	status = nfs4_do_create(dir, dentry, data);  	nfs4_free_createdata(data); @@ -2815,12 +3736,20 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,  		struct iattr *sattr)  {  	struct nfs4_exception exception = { }; +	struct nfs4_label l, *label = NULL;  	int err; + +	label = nfs4_label_init_security(dir, dentry, sattr, &l); + +	sattr->ia_mode &= ~current_umask();  	do { -		err = nfs4_handle_exception(NFS_SERVER(dir), -				_nfs4_proc_mkdir(dir, dentry, sattr), +		err = _nfs4_proc_mkdir(dir, dentry, 
sattr, label); +		trace_nfs4_mkdir(dir, &dentry->d_name, err); +		err = nfs4_handle_exception(NFS_SERVER(dir), err,  				&exception);  	} while (exception.retry); +	nfs4_label_release_security(label); +  	return err;  } @@ -2845,15 +3774,16 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,  	};  	int			status; -	dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __func__, -			dentry->d_parent->d_name.name, -			dentry->d_name.name, +	dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, +			dentry,  			(unsigned long long)cookie); -	nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); +	nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);  	res.pgbase = args.pgbase; -	status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); -	if (status == 0) -		memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); +	status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); +	if (status >= 0) { +		memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); +		status += args.pgbase; +	}  	nfs_invalidate_atime(dir); @@ -2867,24 +3797,22 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,  	struct nfs4_exception exception = { };  	int err;  	do { -		err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), -				_nfs4_proc_readdir(dentry, cred, cookie, -					pages, count, plus), +		err = _nfs4_proc_readdir(dentry, cred, cookie, +				pages, count, plus); +		trace_nfs4_readdir(dentry->d_inode, err); +		err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err,  				&exception);  	} while (exception.retry);  	return err;  }  static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, -		struct iattr *sattr, dev_t rdev) +		struct iattr *sattr, struct nfs4_label *label, dev_t rdev)  {  	struct nfs4_createdata *data;  	int mode = sattr->ia_mode;  	int status = -ENOMEM; -	BUG_ON(!(sattr->ia_valid & ATTR_MODE)); -	
BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); -  	data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);  	if (data == NULL)  		goto out; @@ -2900,10 +3828,14 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,  		data->arg.ftype = NF4CHR;  		data->arg.u.device.specdata1 = MAJOR(rdev);  		data->arg.u.device.specdata2 = MINOR(rdev); +	} else if (!S_ISSOCK(mode)) { +		status = -EINVAL; +		goto out_free;  	} -	 -	status = nfs4_do_create(dir, dentry, data); +	data->arg.label = label; +	status = nfs4_do_create(dir, dentry, data); +out_free:  	nfs4_free_createdata(data);  out:  	return status; @@ -2913,12 +3845,21 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,  		struct iattr *sattr, dev_t rdev)  {  	struct nfs4_exception exception = { }; +	struct nfs4_label l, *label = NULL;  	int err; + +	label = nfs4_label_init_security(dir, dentry, sattr, &l); + +	sattr->ia_mode &= ~current_umask();  	do { -		err = nfs4_handle_exception(NFS_SERVER(dir), -				_nfs4_proc_mknod(dir, dentry, sattr, rdev), +		err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev); +		trace_nfs4_mknod(dir, &dentry->d_name, err); +		err = nfs4_handle_exception(NFS_SERVER(dir), err,  				&exception);  	} while (exception.retry); + +	nfs4_label_release_security(label); +  	return err;  } @@ -2939,7 +3880,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,  	};  	nfs_fattr_init(fsstat->fattr); -	return  nfs4_call_sync(server, &msg, &args, &res, 0); +	return  nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);  }  static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) @@ -2970,26 +3911,45 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,  		.rpc_resp = &res,  	}; -	return nfs4_call_sync(server, &msg, &args, &res, 0); +	return nfs4_call_sync(server->client, server, &msg, 
&args.seq_args, &res.seq_res, 0);  }  static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)  {  	struct nfs4_exception exception = { }; +	unsigned long now = jiffies;  	int err;  	do { -		err = nfs4_handle_exception(server, -				_nfs4_do_fsinfo(server, fhandle, fsinfo), -				&exception); +		err = _nfs4_do_fsinfo(server, fhandle, fsinfo); +		trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); +		if (err == 0) { +			struct nfs_client *clp = server->nfs_client; + +			spin_lock(&clp->cl_lock); +			clp->cl_lease_time = fsinfo->lease_time * HZ; +			clp->cl_last_renewal = now; +			spin_unlock(&clp->cl_lock); +			break; +		} +		err = nfs4_handle_exception(server, err, &exception);  	} while (exception.retry);  	return err;  }  static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)  { +	int error; +  	nfs_fattr_init(fsinfo->fattr); -	return nfs4_do_fsinfo(server, fhandle, fsinfo); +	error = nfs4_do_fsinfo(server, fhandle, fsinfo); +	if (error == 0) { +		/* block layout checks this! 
*/ +		server->pnfs_blksize = fsinfo->blksize; +		set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); +	} + +	return error;  }  static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -3015,7 +3975,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle  	}  	nfs_fattr_init(pathconf->fattr); -	return nfs4_call_sync(server, &msg, &args, &res, 0); +	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);  }  static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -3032,83 +3992,227 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,  	return err;  } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +int nfs4_set_rw_stateid(nfs4_stateid *stateid, +		const struct nfs_open_context *ctx, +		const struct nfs_lock_context *l_ctx, +		fmode_t fmode)  { -	struct nfs_server *server = NFS_SERVER(data->inode); +	const struct nfs_lockowner *lockowner = NULL; -	dprintk("--> %s\n", __func__); +	if (l_ctx != NULL) +		lockowner = &l_ctx->lockowner; +	return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); +} +EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); -	if (!nfs4_sequence_done(task, &data->res.seq_res)) -		return -EAGAIN; +static bool nfs4_stateid_is_current(nfs4_stateid *stateid, +		const struct nfs_open_context *ctx, +		const struct nfs_lock_context *l_ctx, +		fmode_t fmode) +{ +	nfs4_stateid current_stateid; + +	/* If the current stateid represents a lost lock, then exit */ +	if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO) +		return true; +	return nfs4_stateid_match(stateid, &current_stateid); +} + +static bool nfs4_error_stateid_expired(int err) +{ +	switch (err) { +	case -NFS4ERR_DELEG_REVOKED: +	case -NFS4ERR_ADMIN_REVOKED: +	case -NFS4ERR_BAD_STATEID: +	case -NFS4ERR_STALE_STATEID: +	case -NFS4ERR_OLD_STATEID: +	case -NFS4ERR_OPENMODE: +	case -NFS4ERR_EXPIRED: +		return true; 
+	} +	return false; +} + +void __nfs4_read_done_cb(struct nfs_pgio_data *data) +{ +	nfs_invalidate_atime(data->header->inode); +} + +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) +{ +	struct nfs_server *server = NFS_SERVER(data->header->inode); +	trace_nfs4_read(data, task->tk_status);  	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { -		nfs_restart_rpc(task, server->nfs_client); +		rpc_restart_call_prepare(task);  		return -EAGAIN;  	} -	nfs_invalidate_atime(data->inode); +	__nfs4_read_done_cb(data);  	if (task->tk_status > 0)  		renew_lease(server, data->timestamp);  	return 0;  } -static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static bool nfs4_read_stateid_changed(struct rpc_task *task, +		struct nfs_pgio_args *args)  { -	data->timestamp   = jiffies; -	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + +	if (!nfs4_error_stateid_expired(task->tk_status) || +		nfs4_stateid_is_current(&args->stateid, +				args->context, +				args->lock_context, +				FMODE_READ)) +		return false; +	rpc_restart_call_prepare(task); +	return true;  } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)  { -	struct inode *inode = data->inode; -	 + +	dprintk("--> %s\n", __func__); +  	if (!nfs4_sequence_done(task, &data->res.seq_res))  		return -EAGAIN; +	if (nfs4_read_stateid_changed(task, &data->args)) +		return -EAGAIN; +	return data->pgio_done_cb ? 
data->pgio_done_cb(task, data) : +				    nfs4_read_done_cb(task, data); +} + +static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +{ +	data->timestamp   = jiffies; +	data->pgio_done_cb = nfs4_read_done_cb; +	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; +	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); +} +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) +{ +	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), +			&data->args.seq_args, +			&data->res.seq_res, +			task)) +		return 0; +	if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, +				data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) +		return -EIO; +	if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) +		return -EIO; +	return 0; +} + +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) +{ +	struct inode *inode = data->header->inode; +	 +	trace_nfs4_write(data, task->tk_status);  	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { -		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); +		rpc_restart_call_prepare(task);  		return -EAGAIN;  	}  	if (task->tk_status >= 0) {  		renew_lease(NFS_SERVER(inode), data->timestamp); -		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); +		nfs_post_op_update_inode_force_wcc(inode, &data->fattr);  	}  	return 0;  } -static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static bool nfs4_write_stateid_changed(struct rpc_task *task, +		struct nfs_pgio_args *args)  { -	struct nfs_server *server = NFS_SERVER(data->inode); -	data->args.bitmask = server->cache_consistency_bitmask; +	if (!nfs4_error_stateid_expired(task->tk_status) || +		nfs4_stateid_is_current(&args->stateid, +				args->context, +				args->lock_context, +				FMODE_WRITE)) +		return false; +	rpc_restart_call_prepare(task); +	return true; +} + +static 
int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) +{ +	if (!nfs4_sequence_done(task, &data->res.seq_res)) +		return -EAGAIN; +	if (nfs4_write_stateid_changed(task, &data->args)) +		return -EAGAIN; +	return data->pgio_done_cb ? data->pgio_done_cb(task, data) : +		nfs4_write_done_cb(task, data); +} + +static +bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) +{ +	const struct nfs_pgio_header *hdr = data->header; + +	/* Don't request attributes for pNFS or O_DIRECT writes */ +	if (data->ds_clp != NULL || hdr->dreq != NULL) +		return false; +	/* Otherwise, request attributes if and only if we don't hold +	 * a delegation +	 */ +	return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; +} + +static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +{ +	struct nfs_server *server = NFS_SERVER(data->header->inode); + +	if (!nfs4_write_need_cache_consistency_data(data)) { +		data->args.bitmask = NULL; +		data->res.fattr = NULL; +	} else +		data->args.bitmask = server->cache_consistency_bitmask; + +	if (!data->pgio_done_cb) +		data->pgio_done_cb = nfs4_write_done_cb;  	data->res.server = server;  	data->timestamp   = jiffies;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; +	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); +} + +static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ +	nfs4_setup_sequence(NFS_SERVER(data->inode), +			&data->args.seq_args, +			&data->res.seq_res, +			task);  } -static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)  {  	struct inode *inode = data->inode; -	 -	if (!nfs4_sequence_done(task, &data->res.seq_res)) -		return -EAGAIN; +	trace_nfs4_commit(data, task->tk_status);  	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { -		nfs_restart_rpc(task, 
NFS_SERVER(inode)->nfs_client); +		rpc_restart_call_prepare(task);  		return -EAGAIN;  	} -	nfs_refresh_inode(inode, data->res.fattr);  	return 0;  } -static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data) +{ +	if (!nfs4_sequence_done(task, &data->res.seq_res)) +		return -EAGAIN; +	return data->commit_done_cb(task, data); +} + +static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)  {  	struct nfs_server *server = NFS_SERVER(data->inode); -	 -	data->args.bitmask = server->cache_consistency_bitmask; + +	if (data->commit_done_cb == NULL) +		data->commit_done_cb = nfs4_commit_done_cb;  	data->res.server = server;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; +	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);  }  struct nfs4_renewdata { @@ -3137,11 +4241,22 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)  	struct nfs_client *clp = data->client;  	unsigned long timestamp = data->timestamp; -	if (task->tk_status < 0) { +	trace_nfs4_renew_async(clp, task->tk_status); +	switch (task->tk_status) { +	case 0: +		break; +	case -NFS4ERR_LEASE_MOVED: +		nfs4_schedule_lease_moved_recovery(clp); +		break; +	default:  		/* Unless we're shutting down, schedule state recovery! 
*/ -		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) -			nfs4_schedule_state_recovery(clp); -		return; +		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) +			return; +		if (task->tk_status != NFS4ERR_CB_PATH_DOWN) { +			nfs4_schedule_lease_recovery(clp); +			return; +		} +		nfs4_schedule_path_down_recovery(clp);  	}  	do_renew_lease(clp, timestamp);  } @@ -3151,7 +4266,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {  	.rpc_release = nfs4_renew_release,  }; -int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)  {  	struct rpc_message msg = {  		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -3160,18 +4275,20 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)  	};  	struct nfs4_renewdata *data; +	if (renew_flags == 0) +		return 0;  	if (!atomic_inc_not_zero(&clp->cl_count))  		return -EIO; -	data = kmalloc(sizeof(*data), GFP_KERNEL); +	data = kmalloc(sizeof(*data), GFP_NOFS);  	if (data == NULL)  		return -ENOMEM;  	data->client = clp;  	data->timestamp = jiffies; -	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, +	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT,  			&nfs4_renew_ops, data);  } -int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)  {  	struct rpc_message msg = {  		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -3181,7 +4298,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)  	unsigned long now = jiffies;  	int status; -	status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); +	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);  	if (status < 0)  		return status;  	do_renew_lease(clp, now); @@ -3190,28 +4307,42 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)  static inline int 
nfs4_server_supports_acls(struct nfs_server *server)  { -	return (server->caps & NFS_CAP_ACLS) -		&& (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) -		&& (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); +	return server->caps & NFS_CAP_ACLS;  } -/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE, and that - * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE) bytes on +/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that + * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_SIZE) bytes on   * the stack.   */ -#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) +#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) -static void buf_to_pages(const void *buf, size_t buflen, +static int buf_to_pages_noslab(const void *buf, size_t buflen,  		struct page **pages, unsigned int *pgbase)  { -	const void *p = buf; +	struct page *newpage, **spages; +	int rc = 0; +	size_t len; +	spages = pages; -	*pgbase = offset_in_page(buf); -	p -= *pgbase; -	while (p < buf + buflen) { -		*(pages++) = virt_to_page(p); -		p += PAGE_CACHE_SIZE; -	} +	do { +		len = min_t(size_t, PAGE_SIZE, buflen); +		newpage = alloc_page(GFP_KERNEL); + +		if (newpage == NULL) +			goto unwind; +		memcpy(page_address(newpage), buf, len); +                buf += len; +                buflen -= len; +		*pages++ = newpage; +		rc++; +	} while (buflen != 0); + +	return rc; + +unwind: +	for(; rc > 0; rc--) +		__free_page(spages[rc-1]); +	return -ENOMEM;  }  struct nfs4_cached_acl { @@ -3260,16 +4391,17 @@ out:  	return ret;  } -static void nfs4_write_cached_acl(struct inode *inode, const char *buf, size_t acl_len) +static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len)  {  	struct nfs4_cached_acl *acl; +	size_t buflen = sizeof(*acl) + acl_len; -	if (buf && acl_len <= PAGE_SIZE) { -		acl = kmalloc(sizeof(*acl) + acl_len, GFP_KERNEL); +	if (buflen <= PAGE_SIZE) { +		acl = kmalloc(buflen, GFP_KERNEL);  		if 
(acl == NULL)  			goto out;  		acl->cached = 1; -		memcpy(acl->data, buf, acl_len); +		_copy_from_pages(acl->data, pages, pgbase, acl_len);  	} else {  		acl = kmalloc(sizeof(*acl), GFP_KERNEL);  		if (acl == NULL) @@ -3281,9 +4413,19 @@ out:  	nfs4_set_cached_acl(inode, acl);  } +/* + * The getxattr API returns the required buffer length when called with a + * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating + * the required buf.  On a NULL buf, we send a page of data to the server + * guessing that the ACL request can be serviced by a page. If so, we cache + * up to the page of ACL data, and the 2nd call to getxattr is serviced by + * the cache. If not so, we throw away the page, and cache the required + * length. The next getxattr call will then produce another round trip to + * the server, this time with the input buf of the required size. + */  static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)  { -	struct page *pages[NFS4ACL_MAXPAGES]; +	struct page *pages[NFS4ACL_MAXPAGES] = {NULL, };  	struct nfs_getaclargs args = {  		.fh = NFS_FH(inode),  		.acl_pages = pages, @@ -3292,47 +4434,66 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu  	struct nfs_getaclres res = {  		.acl_len = buflen,  	}; -	void *resp_buf;  	struct rpc_message msg = {  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],  		.rpc_argp = &args,  		.rpc_resp = &res,  	}; -	struct page *localpage = NULL; -	int ret; +	unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); +	int ret = -ENOMEM, i; + +	/* As long as we're doing a round trip to the server anyway, +	 * let's be prepared for a page of acl data. */ +	if (npages == 0) +		npages = 1; +	if (npages > ARRAY_SIZE(pages)) +		return -ERANGE; -	if (buflen < PAGE_SIZE) { -		/* As long as we're doing a round trip to the server anyway, -		 * let's be prepared for a page of acl data. 
*/ -		localpage = alloc_page(GFP_KERNEL); -		resp_buf = page_address(localpage); -		if (localpage == NULL) -			return -ENOMEM; -		args.acl_pages[0] = localpage; -		args.acl_pgbase = 0; -		args.acl_len = PAGE_SIZE; -	} else { -		resp_buf = buf; -		buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); +	for (i = 0; i < npages; i++) { +		pages[i] = alloc_page(GFP_KERNEL); +		if (!pages[i]) +			goto out_free;  	} -	ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); + +	/* for decoding across pages */ +	res.acl_scratch = alloc_page(GFP_KERNEL); +	if (!res.acl_scratch) +		goto out_free; + +	args.acl_len = npages * PAGE_SIZE; +	args.acl_pgbase = 0; + +	dprintk("%s  buf %p buflen %zu npages %d args.acl_len %zu\n", +		__func__, buf, buflen, npages, args.acl_len); +	ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), +			     &msg, &args.seq_args, &res.seq_res, 0);  	if (ret)  		goto out_free; -	if (res.acl_len > args.acl_len) -		nfs4_write_cached_acl(inode, NULL, res.acl_len); -	else -		nfs4_write_cached_acl(inode, resp_buf, res.acl_len); -	if (buf) { + +	/* Handle the case where the passed-in buffer is too short */ +	if (res.acl_flags & NFS4_ACL_TRUNC) { +		/* Did the user only issue a request for the acl length? 
*/ +		if (buf == NULL) +			goto out_ok;  		ret = -ERANGE; -		if (res.acl_len > buflen) +		goto out_free; +	} +	nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); +	if (buf) { +		if (res.acl_len > buflen) { +			ret = -ERANGE;  			goto out_free; -		if (localpage) -			memcpy(buf, resp_buf, res.acl_len); +		} +		_copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);  	} +out_ok:  	ret = res.acl_len;  out_free: -	if (localpage) -		__free_page(localpage); +	for (i = 0; i < npages; i++) +		if (pages[i]) +			__free_page(pages[i]); +	if (res.acl_scratch) +		__free_page(res.acl_scratch);  	return ret;  } @@ -3342,6 +4503,7 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl  	ssize_t ret;  	do {  		ret = __nfs4_get_acl_uncached(inode, buf, buflen); +		trace_nfs4_get_acl(inode, ret);  		if (ret >= 0)  			break;  		ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); @@ -3359,8 +4521,12 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)  	ret = nfs_revalidate_inode(server, inode);  	if (ret < 0)  		return ret; +	if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) +		nfs_zap_acl_cache(inode);  	ret = nfs4_read_cached_acl(inode, buf, buflen);  	if (ret != -ENOENT) +		/* -ENOENT is returned if there is no ACL or if there is an ACL +		 * but no cached acl data, just the acl length */  		return ret;  	return nfs4_get_acl_uncached(inode, buf, buflen);  } @@ -3380,13 +4546,33 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl  		.rpc_argp	= &arg,  		.rpc_resp	= &res,  	}; -	int ret; +	unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); +	int ret, i;  	if (!nfs4_server_supports_acls(server))  		return -EOPNOTSUPP; -	nfs_inode_return_delegation(inode); -	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); -	ret = nfs4_call_sync(server, &msg, &arg, &res, 1); +	if (npages > ARRAY_SIZE(pages)) +		return -ERANGE; +	i = 
buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); +	if (i < 0) +		return i; +	nfs4_inode_return_delegation(inode); +	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + +	/* +	 * Free each page after tx, so the only ref left is +	 * held by the network stack +	 */ +	for (; i > 0; i--) +		put_page(pages[i-1]); + +	/* +	 * Acl update can result in inode attribute update. +	 * so mark the attribute cache invalid. +	 */ +	spin_lock(&inode->i_lock); +	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; +	spin_unlock(&inode->i_lock);  	nfs_access_zap_cache(inode);  	nfs_zap_acl_cache(inode);  	return ret; @@ -3397,14 +4583,166 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen  	struct nfs4_exception exception = { };  	int err;  	do { -		err = nfs4_handle_exception(NFS_SERVER(inode), -				__nfs4_proc_set_acl(inode, buf, buflen), +		err = __nfs4_proc_set_acl(inode, buf, buflen); +		trace_nfs4_set_acl(inode, err); +		err = nfs4_handle_exception(NFS_SERVER(inode), err, +				&exception); +	} while (exception.retry); +	return err; +} + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static int _nfs4_get_security_label(struct inode *inode, void *buf, +					size_t buflen) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_fattr fattr; +	struct nfs4_label label = {0, 0, buflen, buf}; + +	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; +	struct nfs4_getattr_arg arg = { +		.fh		= NFS_FH(inode), +		.bitmask	= bitmask, +	}; +	struct nfs4_getattr_res res = { +		.fattr		= &fattr, +		.label		= &label, +		.server		= server, +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_GETATTR], +		.rpc_argp	= &arg, +		.rpc_resp	= &res, +	}; +	int ret; + +	nfs_fattr_init(&fattr); + +	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); +	if (ret) +		return ret; +	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) +		return -ENOENT; +	if 
(buflen < label.len) +		return -ERANGE; +	return 0; +} + +static int nfs4_get_security_label(struct inode *inode, void *buf, +					size_t buflen) +{ +	struct nfs4_exception exception = { }; +	int err; + +	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) +		return -EOPNOTSUPP; + +	do { +		err = _nfs4_get_security_label(inode, buf, buflen); +		trace_nfs4_get_security_label(inode, err); +		err = nfs4_handle_exception(NFS_SERVER(inode), err, +				&exception); +	} while (exception.retry); +	return err; +} + +static int _nfs4_do_set_security_label(struct inode *inode, +		struct nfs4_label *ilabel, +		struct nfs_fattr *fattr, +		struct nfs4_label *olabel) +{ + +	struct iattr sattr = {0}; +	struct nfs_server *server = NFS_SERVER(inode); +	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; +	struct nfs_setattrargs arg = { +		.fh             = NFS_FH(inode), +		.iap            = &sattr, +		.server		= server, +		.bitmask	= bitmask, +		.label		= ilabel, +	}; +	struct nfs_setattrres res = { +		.fattr		= fattr, +		.label		= olabel, +		.server		= server, +	}; +	struct rpc_message msg = { +		.rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], +		.rpc_argp       = &arg, +		.rpc_resp       = &res, +	}; +	int status; + +	nfs4_stateid_copy(&arg.stateid, &zero_stateid); + +	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); +	if (status) +		dprintk("%s failed: %d\n", __func__, status); + +	return status; +} + +static int nfs4_do_set_security_label(struct inode *inode, +		struct nfs4_label *ilabel, +		struct nfs_fattr *fattr, +		struct nfs4_label *olabel) +{ +	struct nfs4_exception exception = { }; +	int err; + +	do { +		err = _nfs4_do_set_security_label(inode, ilabel, +				fattr, olabel); +		trace_nfs4_set_security_label(inode, err); +		err = nfs4_handle_exception(NFS_SERVER(inode), err,  				&exception);  	} while (exception.retry);  	return err;  }  static int +nfs4_set_security_label(struct dentry *dentry, const void *buf, 
size_t buflen) +{ +	struct nfs4_label ilabel, *olabel = NULL; +	struct nfs_fattr fattr; +	struct rpc_cred *cred; +	struct inode *inode = dentry->d_inode; +	int status; + +	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) +		return -EOPNOTSUPP; + +	nfs_fattr_init(&fattr); + +	ilabel.pi = 0; +	ilabel.lfs = 0; +	ilabel.label = (char *)buf; +	ilabel.len = buflen; + +	cred = rpc_lookup_cred(); +	if (IS_ERR(cred)) +		return PTR_ERR(cred); + +	olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); +	if (IS_ERR(olabel)) { +		status = -PTR_ERR(olabel); +		goto out; +	} + +	status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel); +	if (status == 0) +		nfs_setsecurity(inode, &fattr, olabel); + +	nfs4_label_free(olabel); +out: +	put_rpccred(cred); +	return status; +} +#endif	/* CONFIG_NFS_V4_SECURITY_LABEL */ + + +static int  nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)  {  	struct nfs_client *clp = server->nfs_client; @@ -3412,17 +4750,34 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,  	if (task->tk_status >= 0)  		return 0;  	switch(task->tk_status) { +		case -NFS4ERR_DELEG_REVOKED:  		case -NFS4ERR_ADMIN_REVOKED:  		case -NFS4ERR_BAD_STATEID: +			if (state == NULL) +				break; +			nfs_remove_bad_delegation(state->inode);  		case -NFS4ERR_OPENMODE:  			if (state == NULL)  				break; -			nfs4_state_mark_reclaim_nograce(clp, state); -			goto do_state_recovery; +			if (nfs4_schedule_stateid_recovery(server, state) < 0) +				goto recovery_failed; +			goto wait_on_recovery; +		case -NFS4ERR_EXPIRED: +			if (state != NULL) { +				if (nfs4_schedule_stateid_recovery(server, state) < 0) +					goto recovery_failed; +			}  		case -NFS4ERR_STALE_STATEID:  		case -NFS4ERR_STALE_CLIENTID: -		case -NFS4ERR_EXPIRED: -			goto do_state_recovery; +			nfs4_schedule_lease_recovery(clp); +			goto wait_on_recovery; +		case -NFS4ERR_MOVED: +			if 
(nfs4_schedule_migration_recovery(server) < 0) +				goto recovery_failed; +			goto wait_on_recovery; +		case -NFS4ERR_LEASE_MOVED: +			nfs4_schedule_lease_moved_recovery(clp); +			goto wait_on_recovery;  #if defined(CONFIG_NFS_V4_1)  		case -NFS4ERR_BADSESSION:  		case -NFS4ERR_BADSLOT: @@ -3433,32 +4788,109 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,  		case -NFS4ERR_SEQ_MISORDERED:  			dprintk("%s ERROR %d, Reset session\n", __func__,  				task->tk_status); -			nfs4_schedule_state_recovery(clp); -			task->tk_status = 0; -			return -EAGAIN; +			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); +			goto wait_on_recovery;  #endif /* CONFIG_NFS_V4_1 */  		case -NFS4ERR_DELAY:  			nfs_inc_server_stats(server, NFSIOS_DELAY);  		case -NFS4ERR_GRACE: -		case -EKEYEXPIRED:  			rpc_delay(task, NFS4_POLL_RETRY_MAX); -			task->tk_status = 0; -			return -EAGAIN; +		case -NFS4ERR_RETRY_UNCACHED_REP:  		case -NFS4ERR_OLD_STATEID: -			task->tk_status = 0; -			return -EAGAIN; +			goto restart_call;  	}  	task->tk_status = nfs4_map_errors(task->tk_status);  	return 0; -do_state_recovery: +recovery_failed: +	task->tk_status = -EIO; +	return 0; +wait_on_recovery:  	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); -	nfs4_schedule_state_recovery(clp);  	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)  		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); +	if (test_bit(NFS_MIG_FAILED, &server->mig_status)) +		goto recovery_failed; +restart_call:  	task->tk_status = 0;  	return -EAGAIN;  } +static void nfs4_init_boot_verifier(const struct nfs_client *clp, +				    nfs4_verifier *bootverf) +{ +	__be32 verf[2]; + +	if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { +		/* An impossible timestamp guarantees this value +		 * will never match a generated boot time. 
*/ +		verf[0] = 0; +		verf[1] = cpu_to_be32(NSEC_PER_SEC + 1); +	} else { +		struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); +		verf[0] = cpu_to_be32(nn->boot_time.tv_sec); +		verf[1] = cpu_to_be32(nn->boot_time.tv_nsec); +	} +	memcpy(bootverf->data, verf, sizeof(bootverf->data)); +} + +static unsigned int +nfs4_init_nonuniform_client_string(const struct nfs_client *clp, +				   char *buf, size_t len) +{ +	unsigned int result; + +	rcu_read_lock(); +	result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", +				clp->cl_ipaddr, +				rpc_peeraddr2str(clp->cl_rpcclient, +							RPC_DISPLAY_ADDR), +				rpc_peeraddr2str(clp->cl_rpcclient, +							RPC_DISPLAY_PROTO)); +	rcu_read_unlock(); +	return result; +} + +static unsigned int +nfs4_init_uniform_client_string(const struct nfs_client *clp, +				char *buf, size_t len) +{ +	const char *nodename = clp->cl_rpcclient->cl_nodename; + +	if (nfs4_client_id_uniquifier[0] != '\0') +		return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", +				clp->rpc_ops->version, +				clp->cl_minorversion, +				nfs4_client_id_uniquifier, +				nodename); +	return scnprintf(buf, len, "Linux NFSv%u.%u %s", +				clp->rpc_ops->version, clp->cl_minorversion, +				nodename); +} + +/* + * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback + * services.  Advertise one based on the address family of the + * clientaddr. + */ +static unsigned int +nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len) +{ +	if (strchr(clp->cl_ipaddr, ':') != NULL) +		return scnprintf(buf, len, "tcp6"); +	else +		return scnprintf(buf, len, "tcp"); +} + +/** + * nfs4_proc_setclientid - Negotiate client ID + * @clp: state data structure + * @program: RPC program for NFSv4 callback service + * @port: IP port number for NFS4 callback service + * @cred: RPC credential to use for this call + * @res: where to place the result + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. 
+ */  int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,  		unsigned short port, struct rpc_cred *cred,  		struct nfs4_setclientid_res *res) @@ -3467,6 +4899,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,  	struct nfs4_setclientid setclientid = {  		.sc_verifier = &sc_verifier,  		.sc_prog = program, +		.sc_cb_ident = clp->cl_cb_ident,  	};  	struct rpc_message msg = {  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], @@ -3474,91 +4907,66 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,  		.rpc_resp = res,  		.rpc_cred = cred,  	}; -	__be32 *p; -	int loop = 0;  	int status; -	p = (__be32*)sc_verifier.data; -	*p++ = htonl((u32)clp->cl_boot_time.tv_sec); -	*p = htonl((u32)clp->cl_boot_time.tv_nsec); - -	for(;;) { -		setclientid.sc_name_len = scnprintf(setclientid.sc_name, -				sizeof(setclientid.sc_name), "%s/%s %s %s %u", -				clp->cl_ipaddr, -				rpc_peeraddr2str(clp->cl_rpcclient, -							RPC_DISPLAY_ADDR), -				rpc_peeraddr2str(clp->cl_rpcclient, -							RPC_DISPLAY_PROTO), -				clp->cl_rpcclient->cl_auth->au_ops->au_name, -				clp->cl_id_uniquifier); -		setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, -				sizeof(setclientid.sc_netid), -				rpc_peeraddr2str(clp->cl_rpcclient, -							RPC_DISPLAY_NETID)); -		setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, +	/* nfs_client_id4 */ +	nfs4_init_boot_verifier(clp, &sc_verifier); +	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) +		setclientid.sc_name_len = +				nfs4_init_uniform_client_string(clp, +						setclientid.sc_name, +						sizeof(setclientid.sc_name)); +	else +		setclientid.sc_name_len = +				nfs4_init_nonuniform_client_string(clp, +						setclientid.sc_name, +						sizeof(setclientid.sc_name)); +	/* cb_client4 */ +	setclientid.sc_netid_len = +				nfs4_init_callback_netid(clp, +						setclientid.sc_netid, +						sizeof(setclientid.sc_netid)); +	setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,  				
sizeof(setclientid.sc_uaddr), "%s.%u.%u",  				clp->cl_ipaddr, port >> 8, port & 255); -		status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); -		if (status != -NFS4ERR_CLID_INUSE) -			break; -		if (signalled()) -			break; -		if (loop++ & 1) -			ssleep(clp->cl_lease_time + 1); -		else -			if (++clp->cl_id_uniquifier == 0) -				break; -	} +	dprintk("NFS call  setclientid auth=%s, '%.*s'\n", +		clp->cl_rpcclient->cl_auth->au_ops->au_name, +		setclientid.sc_name_len, setclientid.sc_name); +	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); +	trace_nfs4_setclientid(clp, status); +	dprintk("NFS reply setclientid: %d\n", status);  	return status;  } -static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, +/** + * nfs4_proc_setclientid_confirm - Confirm client ID + * @clp: state data structure + * @res: result of a previous SETCLIENTID + * @cred: RPC credential to use for this call + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + */ +int nfs4_proc_setclientid_confirm(struct nfs_client *clp,  		struct nfs4_setclientid_res *arg,  		struct rpc_cred *cred)  { -	struct nfs_fsinfo fsinfo;  	struct rpc_message msg = {  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],  		.rpc_argp = arg, -		.rpc_resp = &fsinfo,  		.rpc_cred = cred,  	}; -	unsigned long now;  	int status; -	now = jiffies; -	status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); -	if (status == 0) { -		spin_lock(&clp->cl_lock); -		clp->cl_lease_time = fsinfo.lease_time * HZ; -		clp->cl_last_renewal = now; -		spin_unlock(&clp->cl_lock); -	} +	dprintk("NFS call  setclientid_confirm auth=%s, (client ID %llx)\n", +		clp->cl_rpcclient->cl_auth->au_ops->au_name, +		clp->cl_clientid); +	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); +	trace_nfs4_setclientid_confirm(clp, status); +	dprintk("NFS reply setclientid_confirm: %d\n", status);  	return status;  } -int nfs4_proc_setclientid_confirm(struct nfs_client *clp, -		struct 
nfs4_setclientid_res *arg, -		struct rpc_cred *cred) -{ -	long timeout = 0; -	int err; -	do { -		err = _nfs4_proc_setclientid_confirm(clp, arg, cred); -		switch (err) { -			case 0: -				return err; -			case -NFS4ERR_RESOURCE: -				/* The IBM lawyers misread another document! */ -			case -NFS4ERR_DELAY: -				err = nfs4_delay(clp->cl_rpcclient, &timeout); -		} -	} while (err == 0); -	return err; -} -  struct nfs4_delegreturndata {  	struct nfs4_delegreturnargs args;  	struct nfs4_delegreturnres res; @@ -3576,16 +4984,23 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)  	if (!nfs4_sequence_done(task, &data->res.seq_res))  		return; +	trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);  	switch (task->tk_status) { -	case -NFS4ERR_STALE_STATEID: -	case -NFS4ERR_EXPIRED:  	case 0:  		renew_lease(data->res.server, data->timestamp);  		break; +	case -NFS4ERR_ADMIN_REVOKED: +	case -NFS4ERR_DELEG_REVOKED: +	case -NFS4ERR_BAD_STATEID: +	case -NFS4ERR_OLD_STATEID: +	case -NFS4ERR_STALE_STATEID: +	case -NFS4ERR_EXPIRED: +		task->tk_status = 0; +		break;  	default:  		if (nfs4_async_handle_error(task, data->res.server, NULL) ==  				-EAGAIN) { -			nfs_restart_rpc(task, data->res.server->nfs_client); +			rpc_restart_call_prepare(task);  			return;  		}  	} @@ -3597,25 +5012,20 @@ static void nfs4_delegreturn_release(void *calldata)  	kfree(calldata);  } -#if defined(CONFIG_NFS_V4_1)  static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)  {  	struct nfs4_delegreturndata *d_data;  	d_data = (struct nfs4_delegreturndata *)data; -	if (nfs4_setup_sequence(d_data->res.server, -				&d_data->args.seq_args, -				&d_data->res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +	nfs4_setup_sequence(d_data->res.server, +			&d_data->args.seq_args, +			&d_data->res.seq_res, +			task);  } -#endif /* CONFIG_NFS_V4_1 */  static const struct rpc_call_ops nfs4_delegreturn_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = 
nfs4_delegreturn_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs4_delegreturn_done,  	.rpc_release = nfs4_delegreturn_release,  }; @@ -3640,11 +5050,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co  	data = kzalloc(sizeof(*data), GFP_NOFS);  	if (data == NULL)  		return -ENOMEM; +	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);  	data->args.fhandle = &data->fh;  	data->args.stateid = &data->stateid; -	data->args.bitmask = server->attr_bitmask; +	data->args.bitmask = server->cache_consistency_bitmask;  	nfs_copy_fh(&data->fh, NFS_FH(inode)); -	memcpy(&data->stateid, stateid, sizeof(data->stateid)); +	nfs4_stateid_copy(&data->stateid, stateid);  	data->res.fattr = &data->fattr;  	data->res.server = server;  	nfs_fattr_init(data->res.fattr); @@ -3652,8 +5063,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co  	data->rpc_status = 0;  	task_setup_data.callback_data = data; -	msg.rpc_argp = &data->args, -	msg.rpc_resp = &data->res, +	msg.rpc_argp = &data->args; +	msg.rpc_resp = &data->res;  	task = rpc_run_task(&task_setup_data);  	if (IS_ERR(task))  		return PTR_ERR(task); @@ -3663,9 +5074,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co  	if (status != 0)  		goto out;  	status = data->rpc_status; -	if (status != 0) -		goto out; -	nfs_refresh_inode(inode, &data->fattr); +	if (status == 0) +		nfs_post_op_update_inode_force_wcc(inode, &data->fattr); +	else +		nfs_refresh_inode(inode, &data->fattr);  out:  	rpc_put_task(task);  	return status; @@ -3678,6 +5090,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4  	int err;  	do {  		err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); +		trace_nfs4_delegreturn(inode, err);  		switch (err) {  			case -NFS4ERR_STALE_STATEID:  			case -NFS4ERR_EXPIRED: @@ -3698,7 +5111,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, 
const nfs4  static unsigned long  nfs4_set_lock_task_retry(unsigned long timeout)  { -	schedule_timeout_killable(timeout); +	freezable_schedule_timeout_killable_unsafe(timeout);  	timeout <<= 1;  	if (timeout > NFS4_LOCK_MAXTIMEOUT)  		return NFS4_LOCK_MAXTIMEOUT; @@ -3731,8 +5144,9 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock  	if (status != 0)  		goto out;  	lsp = request->fl_u.nfs4_fl.owner; -	arg.lock_owner.id = lsp->ls_id.id; -	status = nfs4_call_sync(server, &msg, &arg, &res, 1); +	arg.lock_owner.id = lsp->ls_seqid.owner_id; +	arg.lock_owner.s_dev = server->s_dev; +	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);  	switch (status) {  		case 0:  			request->fl_type = F_UNLCK; @@ -3741,6 +5155,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock  			status = 0;  	}  	request->fl_ops->fl_release_private(request); +	request->fl_ops = NULL;  out:  	return status;  } @@ -3751,8 +5166,9 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *  	int err;  	do { -		err = nfs4_handle_exception(NFS_SERVER(state->inode), -				_nfs4_proc_getlk(state, cmd, request), +		err = _nfs4_proc_getlk(state, cmd, request); +		trace_nfs4_get_lock(request, state, cmd, err); +		err = nfs4_handle_exception(NFS_SERVER(state->inode), err,  				&exception);  	} while (exception.retry);  	return err; @@ -3826,9 +5242,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)  		return;  	switch (task->tk_status) {  		case 0: -			memcpy(calldata->lsp->ls_stateid.data, -					calldata->res.stateid.data, -					sizeof(calldata->lsp->ls_stateid.data)); +			nfs4_stateid_copy(&calldata->lsp->ls_stateid, +					&calldata->res.stateid);  			renew_lease(calldata->server, calldata->timestamp);  			break;  		case -NFS4ERR_BAD_STATEID: @@ -3838,9 +5253,9 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)  			break;  		default:  			if 
(nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) -				nfs_restart_rpc(task, -						 calldata->server->nfs_client); +				rpc_restart_call_prepare(task);  	} +	nfs_release_seqid(calldata->arg.seqid);  }  static void nfs4_locku_prepare(struct rpc_task *task, void *data) @@ -3848,18 +5263,22 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)  	struct nfs4_unlockdata *calldata = data;  	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) -		return; -	if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { +		goto out_wait; +	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {  		/* Note: exit _without_ running nfs4_locku_done */ -		task->tk_action = NULL; -		return; +		goto out_no_action;  	}  	calldata->timestamp = jiffies;  	if (nfs4_setup_sequence(calldata->server,  				&calldata->arg.seq_args, -				&calldata->res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +				&calldata->res.seq_res, +				task) != 0) +		nfs_release_seqid(calldata->arg.seqid); +	return; +out_no_action: +	task->tk_action = NULL; +out_wait: +	nfs4_sequence_done(task, &calldata->res.seq_res);  }  static const struct rpc_call_ops nfs4_locku_ops = { @@ -3886,6 +5305,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,  		.flags = RPC_TASK_ASYNC,  	}; +	nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, +		NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); +  	/* Ensure this is an unlock - when canceling a lock, the  	 * canceled lock is passed in, and it won't be an unlock.  	 
*/ @@ -3897,15 +5319,18 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,  		return ERR_PTR(-ENOMEM);  	} -	msg.rpc_argp = &data->arg, -	msg.rpc_resp = &data->res, +	nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); +	msg.rpc_argp = &data->arg; +	msg.rpc_resp = &data->res;  	task_setup_data.callback_data = data;  	return rpc_run_task(&task_setup_data);  }  static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)  { -	struct nfs_inode *nfsi = NFS_I(state->inode); +	struct inode *inode = state->inode; +	struct nfs4_state_owner *sp = state->owner; +	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_seqid *seqid;  	struct nfs4_lock_state *lsp;  	struct rpc_task *task; @@ -3915,18 +5340,23 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *  	status = nfs4_set_lock_state(state, request);  	/* Unlock _before_ we do the RPC call */  	request->fl_flags |= FL_EXISTS; +	/* Exclude nfs_delegation_claim_locks() */ +	mutex_lock(&sp->so_delegreturn_mutex); +	/* Exclude nfs4_reclaim_open_stateid() - note nesting! */  	down_read(&nfsi->rwsem);  	if (do_vfs_lock(request->fl_file, request) == -ENOENT) {  		up_read(&nfsi->rwsem); +		mutex_unlock(&sp->so_delegreturn_mutex);  		goto out;  	}  	up_read(&nfsi->rwsem); +	mutex_unlock(&sp->so_delegreturn_mutex);  	if (status != 0)  		goto out;  	/* Is this a delegated lock? 
*/ -	if (test_bit(NFS_DELEGATED_STATE, &state->flags)) -		goto out;  	lsp = request->fl_u.nfs4_fl.owner; +	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) +		goto out;  	seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);  	status = -ENOMEM;  	if (seqid == NULL) @@ -3939,6 +5369,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *  	rpc_put_task(task);  out:  	request->fl_flags = fl_flags; +	trace_nfs4_unlock(request, state, F_SETLK, status);  	return status;  } @@ -3976,7 +5407,8 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,  		goto out_free_seqid;  	p->arg.lock_stateid = &lsp->ls_stateid;  	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; -	p->arg.lock_owner.id = lsp->ls_id.id; +	p->arg.lock_owner.id = lsp->ls_seqid.owner_id; +	p->arg.lock_owner.s_dev = server->s_dev;  	p->res.lock_seqid = p->arg.lock_seqid;  	p->lsp = lsp;  	p->server = server; @@ -3998,31 +5430,37 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)  	dprintk("%s: begin!\n", __func__);  	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) -		return; +		goto out_wait;  	/* Do we need to do an open_to_lock_owner? 
*/  	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { -		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) -			return; -		data->arg.open_stateid = &state->stateid; +		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { +			goto out_release_lock_seqid; +		} +		data->arg.open_stateid = &state->open_stateid;  		data->arg.new_lock_owner = 1;  		data->res.open_seqid = data->arg.open_seqid;  	} else  		data->arg.new_lock_owner = 0; +	if (!nfs4_valid_open_stateid(state)) { +		data->rpc_status = -EBADF; +		task->tk_action = NULL; +		goto out_release_open_seqid; +	}  	data->timestamp = jiffies;  	if (nfs4_setup_sequence(data->server,  				&data->arg.seq_args, -				&data->res.seq_res, 1, task)) +				&data->res.seq_res, +				task) == 0)  		return; -	rpc_call_start(task); +out_release_open_seqid: +	nfs_release_seqid(data->arg.open_seqid); +out_release_lock_seqid: +	nfs_release_seqid(data->arg.lock_seqid); +out_wait: +	nfs4_sequence_done(task, &data->res.seq_res);  	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);  } -static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata) -{ -	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); -	nfs4_lock_prepare(task, calldata); -} -  static void nfs4_lock_done(struct rpc_task *task, void *calldata)  {  	struct nfs4_lockdata *data = calldata; @@ -4040,10 +5478,9 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)  			goto out;  	}  	if (data->rpc_status == 0) { -		memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, -					sizeof(data->lsp->ls_stateid.data)); -		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; -		renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); +		nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); +		set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags); +		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);  	}  out:  	dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); 
@@ -4060,7 +5497,7 @@ static void nfs4_lock_release(void *calldata)  		task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,  				data->arg.lock_seqid);  		if (!IS_ERR(task)) -			rpc_put_task(task); +			rpc_put_task_async(task);  		dprintk("%s: cancelling lock!\n", __func__);  	} else  		nfs_free_seqid(data->arg.lock_seqid); @@ -4076,31 +5513,20 @@ static const struct rpc_call_ops nfs4_lock_ops = {  	.rpc_release = nfs4_lock_release,  }; -static const struct rpc_call_ops nfs4_recover_lock_ops = { -	.rpc_call_prepare = nfs4_recover_lock_prepare, -	.rpc_call_done = nfs4_lock_done, -	.rpc_release = nfs4_lock_release, -}; -  static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)  { -	struct nfs_client *clp = server->nfs_client; -	struct nfs4_state *state = lsp->ls_state; -  	switch (error) {  	case -NFS4ERR_ADMIN_REVOKED:  	case -NFS4ERR_BAD_STATEID: -	case -NFS4ERR_EXPIRED: -		if (new_lock_owner != 0 || -		   (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) -			nfs4_state_mark_reclaim_nograce(clp, state);  		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; +		if (new_lock_owner != 0 || +		   test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) +			nfs4_schedule_stateid_recovery(server, lsp->ls_state);  		break;  	case -NFS4ERR_STALE_STATEID: -		if (new_lock_owner != 0 || -		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) -			nfs4_state_mark_reclaim_reboot(clp, state);  		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; +	case -NFS4ERR_EXPIRED: +		nfs4_schedule_lease_recovery(server->nfs_client);  	};  } @@ -4129,14 +5555,15 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f  		return -ENOMEM;  	if (IS_SETLKW(cmd))  		data->arg.block = 1; +	nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); +	msg.rpc_argp = &data->arg; +	msg.rpc_resp = &data->res; +	task_setup_data.callback_data = data;  	if (recovery_type > NFS_LOCK_NEW) {  		if (recovery_type == NFS_LOCK_RECLAIM)  		
	data->arg.reclaim = NFS_LOCK_RECLAIM; -		task_setup_data.callback_ops = &nfs4_recover_lock_ops; +		nfs4_set_sequence_privileged(&data->arg.seq_args);  	} -	msg.rpc_argp = &data->arg, -	msg.rpc_resp = &data->res, -	task_setup_data.callback_data = data;  	task = rpc_run_task(&task_setup_data);  	if (IS_ERR(task))  		return PTR_ERR(task); @@ -4156,7 +5583,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f  static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request)  {  	struct nfs_server *server = NFS_SERVER(state->inode); -	struct nfs4_exception exception = { }; +	struct nfs4_exception exception = { +		.inode = state->inode, +	};  	int err;  	do { @@ -4164,6 +5593,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request  		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)  			return 0;  		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); +		trace_nfs4_lock_reclaim(request, state, F_SETLK, err);  		if (err != -NFS4ERR_DELAY)  			break;  		nfs4_handle_exception(server, err, &exception); @@ -4174,16 +5604,23 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request  static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)  {  	struct nfs_server *server = NFS_SERVER(state->inode); -	struct nfs4_exception exception = { }; +	struct nfs4_exception exception = { +		.inode = state->inode, +	};  	int err;  	err = nfs4_set_lock_state(state, request);  	if (err != 0)  		return err; +	if (!recover_lost_locks) { +		set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags); +		return 0; +	}  	do {  		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)  			return 0;  		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); +		trace_nfs4_lock_expired(request, state, F_SETLK, err);  		switch (err) {  		default:  			goto out; @@ -4197,10 +5634,63 @@ out:  	return err;  } +#if defined(CONFIG_NFS_V4_1) +/** + * 
nfs41_check_expired_locks - possibly free a lock stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. + */ +static int nfs41_check_expired_locks(struct nfs4_state *state) +{ +	int status, ret = -NFS4ERR_BAD_STATEID; +	struct nfs4_lock_state *lsp; +	struct nfs_server *server = NFS_SERVER(state->inode); + +	list_for_each_entry(lsp, &state->lock_states, ls_locks) { +		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { +			struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + +			status = nfs41_test_stateid(server, +					&lsp->ls_stateid, +					cred); +			trace_nfs4_test_lock_stateid(state, lsp, status); +			if (status != NFS_OK) { +				/* Free the stateid unless the server +				 * informs us the stateid is unrecognized. */ +				if (status != -NFS4ERR_BAD_STATEID) +					nfs41_free_stateid(server, +							&lsp->ls_stateid, +							cred); +				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); +				ret = status; +			} +		} +	}; + +	return ret; +} + +static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ +	int status = NFS_OK; + +	if (test_bit(LK_STATE_IN_USE, &state->flags)) +		status = nfs41_check_expired_locks(state); +	if (status != NFS_OK) +		status = nfs4_lock_expired(state, request); +	return status; +} +#endif +  static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)  { +	struct nfs4_state_owner *sp = state->owner;  	struct nfs_inode *nfsi = NFS_I(state->inode);  	unsigned char fl_flags = request->fl_flags; +	unsigned int seq;  	int status = -ENOLCK;  	if ((fl_flags & FL_POSIX) && @@ -4222,13 +5712,21 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock  		status = do_vfs_lock(request->fl_file, request);  		goto out_unlock;  	} +	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); +	up_read(&nfsi->rwsem);  	status = _nfs4_do_setlk(state, cmd, request, 
NFS_LOCK_NEW);  	if (status != 0) +		goto out; +	down_read(&nfsi->rwsem); +	if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { +		status = -NFS4ERR_DELAY;  		goto out_unlock; +	}  	/* Note: we always want to sleep here! */  	request->fl_flags = fl_flags | FL_SLEEP;  	if (do_vfs_lock(request->fl_file, request) < 0) -		printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); +		printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock " +			"manager!\n", __func__);  out_unlock:  	up_read(&nfsi->rwsem);  out: @@ -4238,11 +5736,15 @@ out:  static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)  { -	struct nfs4_exception exception = { }; +	struct nfs4_exception exception = { +		.state = state, +		.inode = state->inode, +	};  	int err;  	do {  		err = _nfs4_proc_setlk(state, cmd, request); +		trace_nfs4_set_lock(request, state, cmd, err);  		if (err == -NFS4ERR_DENIED)  			err = -EAGAIN;  		err = nfs4_handle_exception(NFS_SERVER(state->inode), @@ -4283,6 +5785,20 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)  	if (state == NULL)  		return -ENOLCK; +	/* +	 * Don't rely on the VFS having checked the file open mode, +	 * since it won't do this for flock() locks. 
+	 */ +	switch (request->fl_type) { +	case F_RDLCK: +		if (!(filp->f_mode & FMODE_READ)) +			return -EBADF; +		break; +	case F_WRLCK: +		if (!(filp->f_mode & FMODE_WRITE)) +			return -EBADF; +	} +  	do {  		status = nfs4_proc_setlk(state, cmd, request);  		if ((status != -EAGAIN) || IS_SETLK(cmd)) @@ -4295,157 +5811,200 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)  	return status;  } -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)  {  	struct nfs_server *server = NFS_SERVER(state->inode); -	struct nfs4_exception exception = { };  	int err;  	err = nfs4_set_lock_state(state, fl);  	if (err != 0) -		goto out; -	do { -		err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); -		switch (err) { -			default: -				printk(KERN_ERR "%s: unhandled error %d.\n", -						__func__, err); -			case 0: -			case -ESTALE: -				goto out; -			case -NFS4ERR_EXPIRED: -			case -NFS4ERR_STALE_CLIENTID: -			case -NFS4ERR_STALE_STATEID: -			case -NFS4ERR_BADSESSION: -			case -NFS4ERR_BADSLOT: -			case -NFS4ERR_BAD_HIGH_SLOT: -			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: -			case -NFS4ERR_DEADSESSION: -				nfs4_schedule_state_recovery(server->nfs_client); -				goto out; -			case -ERESTARTSYS: -				/* -				 * The show must go on: exit, but mark the -				 * stateid as needing recovery. -				 */ -			case -NFS4ERR_ADMIN_REVOKED: -			case -NFS4ERR_BAD_STATEID: -			case -NFS4ERR_OPENMODE: -				nfs4_state_mark_reclaim_nograce(server->nfs_client, state); -				err = 0; -				goto out; -			case -EKEYEXPIRED: -				/* -				 * User RPCSEC_GSS context has expired. -				 * We cannot recover this stateid now, so -				 * skip it and allow recovery thread to -				 * proceed. 
-				 */ -				err = 0; -				goto out; -			case -ENOMEM: -			case -NFS4ERR_DENIED: -				/* kill_proc(fl->fl_pid, SIGLOST, 1); */ -				err = 0; -				goto out; -			case -NFS4ERR_DELAY: -				break; -		} -		err = nfs4_handle_exception(server, err, &exception); -	} while (exception.retry); -out: -	return err; +		return err; +	err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); +	return nfs4_handle_delegation_recall_error(server, state, stateid, err); +} + +struct nfs_release_lockowner_data { +	struct nfs4_lock_state *lsp; +	struct nfs_server *server; +	struct nfs_release_lockowner_args args; +	struct nfs_release_lockowner_res res; +	unsigned long timestamp; +}; + +static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) +{ +	struct nfs_release_lockowner_data *data = calldata; +	nfs40_setup_sequence(data->server, +				&data->args.seq_args, &data->res.seq_res, task); +	data->timestamp = jiffies; +} + +static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) +{ +	struct nfs_release_lockowner_data *data = calldata; +	struct nfs_server *server = data->server; + +	nfs40_sequence_done(task, &data->res.seq_res); + +	switch (task->tk_status) { +	case 0: +		renew_lease(server, data->timestamp); +		break; +	case -NFS4ERR_STALE_CLIENTID: +	case -NFS4ERR_EXPIRED: +	case -NFS4ERR_LEASE_MOVED: +	case -NFS4ERR_DELAY: +		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) +			rpc_restart_call_prepare(task); +	}  }  static void nfs4_release_lockowner_release(void *calldata)  { +	struct nfs_release_lockowner_data *data = calldata; +	nfs4_free_lock_state(data->server, data->lsp);  	kfree(calldata);  } -const struct rpc_call_ops nfs4_release_lockowner_ops = { +static const struct rpc_call_ops nfs4_release_lockowner_ops = { +	.rpc_call_prepare = nfs4_release_lockowner_prepare, +	.rpc_call_done = nfs4_release_lockowner_done,  	.rpc_release = nfs4_release_lockowner_release,  }; -void nfs4_release_lockowner(const struct nfs4_lock_state 
*lsp) +static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)  { -	struct nfs_server *server = lsp->ls_state->owner->so_server; -	struct nfs_release_lockowner_args *args; +	struct nfs_release_lockowner_data *data;  	struct rpc_message msg = {  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],  	};  	if (server->nfs_client->cl_mvops->minor_version != 0) -		return; -	args = kmalloc(sizeof(*args), GFP_NOFS); -	if (!args) -		return; -	args->lock_owner.clientid = server->nfs_client->cl_clientid; -	args->lock_owner.id = lsp->ls_id.id; -	msg.rpc_argp = args; -	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); +		return -EINVAL; + +	data = kmalloc(sizeof(*data), GFP_NOFS); +	if (!data) +		return -ENOMEM; +	data->lsp = lsp; +	data->server = server; +	data->args.lock_owner.clientid = server->nfs_client->cl_clientid; +	data->args.lock_owner.id = lsp->ls_seqid.owner_id; +	data->args.lock_owner.s_dev = server->s_dev; + +	msg.rpc_argp = &data->args; +	msg.rpc_resp = &data->res; +	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); +	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); +	return 0;  }  #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" -int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, -		size_t buflen, int flags) +static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, +				   const void *buf, size_t buflen, +				   int flags, int type)  { -	struct inode *inode = dentry->d_inode; - -	if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) -		return -EOPNOTSUPP; +	if (strcmp(key, "") != 0) +		return -EINVAL; -	return nfs4_proc_set_acl(inode, buf, buflen); +	return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);  } -/* The getxattr man page suggests returning -ENODATA for unknown attributes, - * and that's what we'll do for e.g. user attributes that haven't been set. 
- * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported - * attributes in kernel-managed attribute namespaces. */ -ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf, -		size_t buflen) +static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, +				   void *buf, size_t buflen, int type)  { -	struct inode *inode = dentry->d_inode; - -	if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) -		return -EOPNOTSUPP; +	if (strcmp(key, "") != 0) +		return -EINVAL; -	return nfs4_proc_get_acl(inode, buf, buflen); +	return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);  } -ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) +static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, +				       size_t list_len, const char *name, +				       size_t name_len, int type)  { -	size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; +	size_t len = sizeof(XATTR_NAME_NFSV4_ACL);  	if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))  		return 0; -	if (buf && buflen < len) -		return -ERANGE; -	if (buf) -		memcpy(buf, XATTR_NAME_NFSV4_ACL, len); + +	if (list && len <= list_len) +		memcpy(list, XATTR_NAME_NFSV4_ACL, len); +	return len; +} + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline int nfs4_server_supports_labels(struct nfs_server *server) +{ +	return server->caps & NFS_CAP_SECURITY_LABEL; +} + +static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key, +				   const void *buf, size_t buflen, +				   int flags, int type) +{ +	if (security_ismaclabel(key)) +		return nfs4_set_security_label(dentry, buf, buflen); + +	return -EOPNOTSUPP; +} + +static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key, +				   void *buf, size_t buflen, int type) +{ +	if (security_ismaclabel(key)) +		return nfs4_get_security_label(dentry->d_inode, buf, buflen); +	return -EOPNOTSUPP; +} + +static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, +				       
size_t list_len, const char *name, +				       size_t name_len, int type) +{ +	size_t len = 0; + +	if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) { +		len = security_inode_listsecurity(dentry->d_inode, NULL, 0); +		if (list && len <= list_len) +			security_inode_listsecurity(dentry->d_inode, list, len); +	}  	return len;  } +static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { +	.prefix = XATTR_SECURITY_PREFIX, +	.list	= nfs4_xattr_list_nfs4_label, +	.get	= nfs4_xattr_get_nfs4_label, +	.set	= nfs4_xattr_set_nfs4_label, +}; +#endif + + +/* + * nfs_fhget will use either the mounted_on_fileid or the fileid + */  static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)  { -	if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) && -		(fattr->valid & NFS_ATTR_FATTR_FSID) && -		(fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) +	if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || +	       (fattr->valid & NFS_ATTR_FATTR_FILEID)) && +	      (fattr->valid & NFS_ATTR_FATTR_FSID) && +	      (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)))  		return;  	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | -		NFS_ATTR_FATTR_NLINK; +		NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL;  	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;  	fattr->nlink = 2;  } -int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, -		struct nfs4_fs_locations *fs_locations, struct page *page) +static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, +				   const struct qstr *name, +				   struct nfs4_fs_locations *fs_locations, +				   struct page *page)  {  	struct nfs_server *server = NFS_SERVER(dir); -	u32 bitmask[2] = { +	u32 bitmask[3] = {  		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, -		[1] = FATTR4_WORD1_MOUNTED_ON_FILEID,  	};  	struct nfs4_fs_locations_arg args = {  		.dir_fh = NFS_FH(dir), @@ -4464,33 +6023,613 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,  	int status;  	
dprintk("%s: start\n", __func__); + +	/* Ask for the fileid of the absent filesystem if mounted_on_fileid +	 * is not supported */ +	if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) +		bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; +	else +		bitmask[0] |= FATTR4_WORD0_FILEID; +  	nfs_fattr_init(&fs_locations->fattr);  	fs_locations->server = server;  	fs_locations->nlocations = 0; -	status = nfs4_call_sync(server, &msg, &args, &res, 0); -	nfs_fixup_referral_attributes(&fs_locations->fattr); +	status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0);  	dprintk("%s: returned status = %d\n", __func__, status);  	return status;  } +int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, +			   const struct qstr *name, +			   struct nfs4_fs_locations *fs_locations, +			   struct page *page) +{ +	struct nfs4_exception exception = { }; +	int err; +	do { +		err = _nfs4_proc_fs_locations(client, dir, name, +				fs_locations, page); +		trace_nfs4_get_fs_locations(dir, name, err); +		err = nfs4_handle_exception(NFS_SERVER(dir), err, +				&exception); +	} while (exception.retry); +	return err; +} + +/* + * This operation also signals the server that this client is + * performing migration recovery.  The server can stop returning + * NFS4ERR_LEASE_MOVED to this client.  A RENEW operation is + * appended to this compound to identify the client ID which is + * performing recovery. 
+ */ +static int _nfs40_proc_get_locations(struct inode *inode, +				     struct nfs4_fs_locations *locations, +				     struct page *page, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct rpc_clnt *clnt = server->client; +	u32 bitmask[2] = { +		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, +	}; +	struct nfs4_fs_locations_arg args = { +		.clientid	= server->nfs_client->cl_clientid, +		.fh		= NFS_FH(inode), +		.page		= page, +		.bitmask	= bitmask, +		.migration	= 1,		/* skip LOOKUP */ +		.renew		= 1,		/* append RENEW */ +	}; +	struct nfs4_fs_locations_res res = { +		.fs_locations	= locations, +		.migration	= 1, +		.renew		= 1, +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], +		.rpc_argp	= &args, +		.rpc_resp	= &res, +		.rpc_cred	= cred, +	}; +	unsigned long now = jiffies; +	int status; + +	nfs_fattr_init(&locations->fattr); +	locations->server = server; +	locations->nlocations = 0; + +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(clnt, server, &msg, +					&args.seq_args, &res.seq_res); +	if (status) +		return status; + +	renew_lease(server, now); +	return 0; +} +  #ifdef CONFIG_NFS_V4_1 +  /* - * nfs4_proc_exchange_id() + * This operation also signals the server that this client is + * performing migration recovery.  The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID + * performing this operation is identified in the SEQUENCE + * operation in this compound.   * - * Since the clientid has expired, all compounds using sessions - * associated with the stale clientid will be returning - * NFS4ERR_BADSESSION in the sequence operation, and will therefore - * be in some phase of session reset. + * When the client supports GETATTR(fs_locations_info), it can + * be plumbed in here.   
*/ -int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +static int _nfs41_proc_get_locations(struct inode *inode, +				     struct nfs4_fs_locations *locations, +				     struct page *page, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct rpc_clnt *clnt = server->client; +	u32 bitmask[2] = { +		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, +	}; +	struct nfs4_fs_locations_arg args = { +		.fh		= NFS_FH(inode), +		.page		= page, +		.bitmask	= bitmask, +		.migration	= 1,		/* skip LOOKUP */ +	}; +	struct nfs4_fs_locations_res res = { +		.fs_locations	= locations, +		.migration	= 1, +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], +		.rpc_argp	= &args, +		.rpc_resp	= &res, +		.rpc_cred	= cred, +	}; +	int status; + +	nfs_fattr_init(&locations->fattr); +	locations->server = server; +	locations->nlocations = 0; + +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(clnt, server, &msg, +					&args.seq_args, &res.seq_res); +	if (status == NFS4_OK && +	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) +		status = -NFS4ERR_LEASE_MOVED; +	return status; +} + +#endif	/* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_get_locations - discover locations for a migrated FSID + * @inode: inode on FSID that is migrating + * @locations: result of query + * @page: buffer + * @cred: credential to use for this operation + * + * Returns NFS4_OK on success, a negative NFS4ERR status code if the + * operation failed, or a negative errno if a local error occurred. + * + * On success, "locations" is filled in, but if the server has + * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not + * asserted. + * + * -NFS4ERR_LEASE_MOVED is returned if the server still has leases + * from this client that require migration recovery. 
+ */ +int nfs4_proc_get_locations(struct inode *inode, +			    struct nfs4_fs_locations *locations, +			    struct page *page, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = server->nfs_client; +	const struct nfs4_mig_recovery_ops *ops = +					clp->cl_mvops->mig_recovery_ops; +	struct nfs4_exception exception = { }; +	int status; + +	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, +		(unsigned long long)server->fsid.major, +		(unsigned long long)server->fsid.minor, +		clp->cl_hostname); +	nfs_display_fhandle(NFS_FH(inode), __func__); + +	do { +		status = ops->get_locations(inode, locations, page, cred); +		if (status != -NFS4ERR_DELAY) +			break; +		nfs4_handle_exception(server, status, &exception); +	} while (exception.retry); +	return status; +} + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery.  The server can stop + * returning NFS4ERR_LEASE_MOVED to this client.  A RENEW operation + * is appended to this compound to identify the client ID which is + * performing recovery. 
+ */ +static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; +	struct rpc_clnt *clnt = server->client; +	struct nfs4_fsid_present_arg args = { +		.fh		= NFS_FH(inode), +		.clientid	= clp->cl_clientid, +		.renew		= 1,		/* append RENEW */ +	}; +	struct nfs4_fsid_present_res res = { +		.renew		= 1, +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], +		.rpc_argp	= &args, +		.rpc_resp	= &res, +		.rpc_cred	= cred, +	}; +	unsigned long now = jiffies; +	int status; + +	res.fh = nfs_alloc_fhandle(); +	if (res.fh == NULL) +		return -ENOMEM; + +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(clnt, server, &msg, +						&args.seq_args, &res.seq_res); +	nfs_free_fhandle(res.fh); +	if (status) +		return status; + +	do_renew_lease(clp, now); +	return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery.  The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client.  The client ID performing + * this operation is identified in the SEQUENCE operation in this + * compound. 
+ */ +static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct rpc_clnt *clnt = server->client; +	struct nfs4_fsid_present_arg args = { +		.fh		= NFS_FH(inode), +	}; +	struct nfs4_fsid_present_res res = { +	}; +	struct rpc_message msg = { +		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], +		.rpc_argp	= &args, +		.rpc_resp	= &res, +		.rpc_cred	= cred, +	}; +	int status; + +	res.fh = nfs_alloc_fhandle(); +	if (res.fh == NULL) +		return -ENOMEM; + +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(clnt, server, &msg, +						&args.seq_args, &res.seq_res); +	nfs_free_fhandle(res.fh); +	if (status == NFS4_OK && +	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) +		status = -NFS4ERR_LEASE_MOVED; +	return status; +} + +#endif	/* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_fsid_present - Is this FSID present or absent on server? + * @inode: inode on FSID to check + * @cred: credential to use for this operation + * + * Server indicates whether the FSID is present, moved, or not + * recognized.  This operation is necessary to clear a LEASE_MOVED + * condition for this client ID. + * + * Returns NFS4_OK if the FSID is present on this server, + * -NFS4ERR_MOVED if the FSID is no longer present, a negative + *  NFS4ERR code if some error occurred on the server, or a + *  negative errno if a local failure occurred. 
+ */ +int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ +	struct nfs_server *server = NFS_SERVER(inode); +	struct nfs_client *clp = server->nfs_client; +	const struct nfs4_mig_recovery_ops *ops = +					clp->cl_mvops->mig_recovery_ops; +	struct nfs4_exception exception = { }; +	int status; + +	dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, +		(unsigned long long)server->fsid.major, +		(unsigned long long)server->fsid.minor, +		clp->cl_hostname); +	nfs_display_fhandle(NFS_FH(inode), __func__); + +	do { +		status = ops->fsid_present(inode, cred); +		if (status != -NFS4ERR_DELAY) +			break; +		nfs4_handle_exception(server, status, &exception); +	} while (exception.retry); +	return status; +} + +/** + * If 'use_integrity' is true and the state managment nfs_client + * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient + * and the machine credential as per RFC3530bis and RFC5661 Security + * Considerations sections. Otherwise, just use the user cred with the + * filesystem's rpc_client. 
+ */ +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) +{ +	int status; +	struct nfs4_secinfo_arg args = { +		.dir_fh = NFS_FH(dir), +		.name   = name, +	}; +	struct nfs4_secinfo_res res = { +		.flavors     = flavors, +	}; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO], +		.rpc_argp = &args, +		.rpc_resp = &res, +	}; +	struct rpc_clnt *clnt = NFS_SERVER(dir)->client; +	struct rpc_cred *cred = NULL; + +	if (use_integrity) { +		clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; +		cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); +		msg.rpc_cred = cred; +	} + +	dprintk("NFS call  secinfo %s\n", name->name); + +	nfs4_state_protect(NFS_SERVER(dir)->nfs_client, +		NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + +	status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, +				&res.seq_res, 0); +	dprintk("NFS reply  secinfo: %d\n", status); + +	if (cred) +		put_rpccred(cred); + +	return status; +} + +int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, +		      struct nfs4_secinfo_flavors *flavors) +{ +	struct nfs4_exception exception = { }; +	int err; +	do { +		err = -NFS4ERR_WRONGSEC; + +		/* try to use integrity protection with machine cred */ +		if (_nfs4_is_integrity_protected(NFS_SERVER(dir)->nfs_client)) +			err = _nfs4_proc_secinfo(dir, name, flavors, true); + +		/* +		 * if unable to use integrity protection, or SECINFO with +		 * integrity protection returns NFS4ERR_WRONGSEC (which is +		 * disallowed by spec, but exists in deployed servers) use +		 * the current filesystem's rpc_client and the user cred. 
+		 */ +		if (err == -NFS4ERR_WRONGSEC) +			err = _nfs4_proc_secinfo(dir, name, flavors, false); + +		trace_nfs4_secinfo(dir, name, err); +		err = nfs4_handle_exception(NFS_SERVER(dir), err, +				&exception); +	} while (exception.retry); +	return err; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Check the exchange flags returned by the server for invalid flags, having + * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or + * DS flags set. + */ +static int nfs4_check_cl_exchange_flags(u32 flags) +{ +	if (flags & ~EXCHGID4_FLAG_MASK_R) +		goto out_inval; +	if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && +	    (flags & EXCHGID4_FLAG_USE_NON_PNFS)) +		goto out_inval; +	if (!(flags & (EXCHGID4_FLAG_MASK_PNFS))) +		goto out_inval; +	return NFS_OK; +out_inval: +	return -NFS4ERR_INVAL; +} + +static bool +nfs41_same_server_scope(struct nfs41_server_scope *a, +			struct nfs41_server_scope *b) +{ +	if (a->server_scope_sz == b->server_scope_sz && +	    memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) +		return true; + +	return false; +} + +/* + * nfs4_proc_bind_conn_to_session() + * + * The 4.1 client currently uses the same TCP connection for the + * fore and backchannel. 
+ */ +int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) +{ +	int status; +	struct nfs41_bind_conn_to_session_res res; +	struct rpc_message msg = { +		.rpc_proc = +			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION], +		.rpc_argp = clp, +		.rpc_resp = &res, +		.rpc_cred = cred, +	}; + +	dprintk("--> %s\n", __func__); + +	res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); +	if (unlikely(res.session == NULL)) { +		status = -ENOMEM; +		goto out; +	} + +	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); +	trace_nfs4_bind_conn_to_session(clp, status); +	if (status == 0) { +		if (memcmp(res.session->sess_id.data, +		    clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { +			dprintk("NFS: %s: Session ID mismatch\n", __func__); +			status = -EIO; +			goto out_session; +		} +		if (res.dir != NFS4_CDFS4_BOTH) { +			dprintk("NFS: %s: Unexpected direction from server\n", +				__func__); +			status = -EIO; +			goto out_session; +		} +		if (res.use_conn_in_rdma_mode) { +			dprintk("NFS: %s: Server returned RDMA mode = true\n", +				__func__); +			status = -EIO; +			goto out_session; +		} +	} +out_session: +	kfree(res.session); +out: +	dprintk("<-- %s status= %d\n", __func__, status); +	return status; +} + +/* + * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map + * and operations we'd like to see to enable certain features in the allow map + */ +static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { +	.how = SP4_MACH_CRED, +	.enforce.u.words = { +		[1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | +		      1 << (OP_EXCHANGE_ID - 32) | +		      1 << (OP_CREATE_SESSION - 32) | +		      1 << (OP_DESTROY_SESSION - 32) | +		      1 << (OP_DESTROY_CLIENTID - 32) +	}, +	.allow.u.words = { +		[0] = 1 << (OP_CLOSE) | +		      1 << (OP_LOCKU) | +		      1 << (OP_COMMIT), +		[1] = 1 << (OP_SECINFO - 32) | +		      1 << (OP_SECINFO_NO_NAME - 32) | +		      1 << (OP_TEST_STATEID - 32) | 
+		      1 << (OP_FREE_STATEID - 32) | +		      1 << (OP_WRITE - 32) +	} +}; + +/* + * Select the state protection mode for client `clp' given the server results + * from exchange_id in `sp'. + * + * Returns 0 on success, negative errno otherwise. + */ +static int nfs4_sp4_select_mode(struct nfs_client *clp, +				 struct nfs41_state_protection *sp) +{ +	static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = { +		[1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | +		      1 << (OP_EXCHANGE_ID - 32) | +		      1 << (OP_CREATE_SESSION - 32) | +		      1 << (OP_DESTROY_SESSION - 32) | +		      1 << (OP_DESTROY_CLIENTID - 32) +	}; +	unsigned int i; + +	if (sp->how == SP4_MACH_CRED) { +		/* Print state protect result */ +		dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n"); +		for (i = 0; i <= LAST_NFS4_OP; i++) { +			if (test_bit(i, sp->enforce.u.longs)) +				dfprintk(MOUNT, "  enforce op %d\n", i); +			if (test_bit(i, sp->allow.u.longs)) +				dfprintk(MOUNT, "  allow op %d\n", i); +		} + +		/* make sure nothing is on enforce list that isn't supported */ +		for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) { +			if (sp->enforce.u.words[i] & ~supported_enforce[i]) { +				dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); +				return -EINVAL; +			} +		} + +		/* +		 * Minimal mode - state operations are allowed to use machine +		 * credential.  Note this already happens by default, so the +		 * client doesn't have to do anything more than the negotiation. +		 * +		 * NOTE: we don't care if EXCHANGE_ID is in the list - +		 *       we're already using the machine cred for exchange_id +		 *       and will never use a different cred. 
+		 */ +		if (test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) && +		    test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) && +		    test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) && +		    test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) { +			dfprintk(MOUNT, "sp4_mach_cred:\n"); +			dfprintk(MOUNT, "  minimal mode enabled\n"); +			set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags); +		} else { +			dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); +			return -EINVAL; +		} + +		if (test_bit(OP_CLOSE, sp->allow.u.longs) && +		    test_bit(OP_LOCKU, sp->allow.u.longs)) { +			dfprintk(MOUNT, "  cleanup mode enabled\n"); +			set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); +		} + +		if (test_bit(OP_SECINFO, sp->allow.u.longs) && +		    test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { +			dfprintk(MOUNT, "  secinfo mode enabled\n"); +			set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags); +		} + +		if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) && +		    test_bit(OP_FREE_STATEID, sp->allow.u.longs)) { +			dfprintk(MOUNT, "  stateid mode enabled\n"); +			set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags); +		} + +		if (test_bit(OP_WRITE, sp->allow.u.longs)) { +			dfprintk(MOUNT, "  write mode enabled\n"); +			set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags); +		} + +		if (test_bit(OP_COMMIT, sp->allow.u.longs)) { +			dfprintk(MOUNT, "  commit mode enabled\n"); +			set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags); +		} +	} + +	return 0; +} + +/* + * _nfs4_proc_exchange_id() + * + * Wrapper for EXCHANGE_ID operation. 
+ */ +static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, +	u32 sp4_how)  {  	nfs4_verifier verifier;  	struct nfs41_exchange_id_args args = { +		.verifier = &verifier,  		.client = clp, -		.flags = clp->cl_exchange_flags, +#ifdef CONFIG_NFS_V4_1_MIGRATION +		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | +			 EXCHGID4_FLAG_BIND_PRINC_STATEID | +			 EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else +		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | +			 EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif  	};  	struct nfs41_exchange_id_res res = { -		.client = clp, +		0  	};  	int status;  	struct rpc_message msg = { @@ -4499,43 +6638,195 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)  		.rpc_resp = &res,  		.rpc_cred = cred,  	}; -	__be32 *p; -	dprintk("--> %s\n", __func__); -	BUG_ON(clp == NULL); +	nfs4_init_boot_verifier(clp, &verifier); +	args.id_len = nfs4_init_uniform_client_string(clp, args.id, +							sizeof(args.id)); +	dprintk("NFS call  exchange_id auth=%s, '%.*s'\n", +		clp->cl_rpcclient->cl_auth->au_ops->au_name, +		args.id_len, args.id); + +	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), +					GFP_NOFS); +	if (unlikely(res.server_owner == NULL)) { +		status = -ENOMEM; +		goto out; +	} + +	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), +					GFP_NOFS); +	if (unlikely(res.server_scope == NULL)) { +		status = -ENOMEM; +		goto out_server_owner; +	} + +	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); +	if (unlikely(res.impl_id == NULL)) { +		status = -ENOMEM; +		goto out_server_scope; +	} -	/* Remove server-only flags */ -	args.flags &= ~EXCHGID4_FLAG_CONFIRMED_R; +	switch (sp4_how) { +	case SP4_NONE: +		args.state_protect.how = SP4_NONE; +		break; -	p = (u32 *)verifier.data; -	*p++ = htonl((u32)clp->cl_boot_time.tv_sec); -	*p = htonl((u32)clp->cl_boot_time.tv_nsec); -	args.verifier = &verifier; +	case SP4_MACH_CRED: +		args.state_protect = nfs4_sp4_mach_cred_request; +		break; -	while 
(1) { -		args.id_len = scnprintf(args.id, sizeof(args.id), -					"%s/%s %u", -					clp->cl_ipaddr, -					rpc_peeraddr2str(clp->cl_rpcclient, -							 RPC_DISPLAY_ADDR), -					clp->cl_id_uniquifier); +	default: +		/* unsupported! */ +		WARN_ON_ONCE(1); +		status = -EINVAL; +		goto out_server_scope; +	} -		status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); +	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); +	trace_nfs4_exchange_id(clp, status); +	if (status == 0) +		status = nfs4_check_cl_exchange_flags(res.flags); -		if (status != -NFS4ERR_CLID_INUSE) -			break; +	if (status == 0) +		status = nfs4_sp4_select_mode(clp, &res.state_protect); -		if (signalled()) -			break; +	if (status == 0) { +		clp->cl_clientid = res.clientid; +		clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); +		if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) +			clp->cl_seqid = res.seqid; + +		kfree(clp->cl_serverowner); +		clp->cl_serverowner = res.server_owner; +		res.server_owner = NULL; + +		/* use the most recent implementation id */ +		kfree(clp->cl_implid); +		clp->cl_implid = res.impl_id; + +		if (clp->cl_serverscope != NULL && +		    !nfs41_same_server_scope(clp->cl_serverscope, +					     res.server_scope)) { +			dprintk("%s: server_scope mismatch detected\n", +				__func__); +			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); +			kfree(clp->cl_serverscope); +			clp->cl_serverscope = NULL; +		} -		if (++clp->cl_id_uniquifier == 0) -			break; +		if (clp->cl_serverscope == NULL) { +			clp->cl_serverscope = res.server_scope; +			goto out; +		} +	} else +		kfree(res.impl_id); + +out_server_owner: +	kfree(res.server_owner); +out_server_scope: +	kfree(res.server_scope); +out: +	if (clp->cl_implid != NULL) +		dprintk("NFS reply exchange_id: Server Implementation ID: " +			"domain: %s, name: %s, date: %llu,%u\n", +			clp->cl_implid->domain, clp->cl_implid->name, +			clp->cl_implid->date.seconds, +			clp->cl_implid->date.nseconds); +	dprintk("NFS reply 
exchange_id: %d\n", status); +	return status; +} + +/* + * nfs4_proc_exchange_id() + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + * + * Since the clientid has expired, all compounds using sessions + * associated with the stale clientid will be returning + * NFS4ERR_BADSESSION in the sequence operation, and will therefore + * be in some phase of session reset. + * + * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used. + */ +int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +{ +	rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor; +	int status; + +	/* try SP4_MACH_CRED if krb5i/p	*/ +	if (authflavor == RPC_AUTH_GSS_KRB5I || +	    authflavor == RPC_AUTH_GSS_KRB5P) { +		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); +		if (!status) +			return 0;  	} -	dprintk("<-- %s status= %d\n", __func__, status); +	/* try SP4_NONE */ +	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); +} + +static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, +		struct rpc_cred *cred) +{ +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID], +		.rpc_argp = clp, +		.rpc_cred = cred, +	}; +	int status; + +	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); +	trace_nfs4_destroy_clientid(clp, status); +	if (status) +		dprintk("NFS: Got error %d from the server %s on " +			"DESTROY_CLIENTID.", status, clp->cl_hostname);  	return status;  } +static int nfs4_proc_destroy_clientid(struct nfs_client *clp, +		struct rpc_cred *cred) +{ +	unsigned int loop; +	int ret; + +	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { +		ret = _nfs4_proc_destroy_clientid(clp, cred); +		switch (ret) { +		case -NFS4ERR_DELAY: +		case -NFS4ERR_CLIENTID_BUSY: +			ssleep(1); +			break; +		default: +			return ret; +		} +	} +	return 0; +} + +int nfs4_destroy_clientid(struct nfs_client *clp) +{ +	struct rpc_cred *cred; +	int ret = 0; + +	if 
(clp->cl_mvops->minor_version < 1) +		goto out; +	if (clp->cl_exchange_flags == 0) +		goto out; +	if (clp->cl_preserve_clid) +		goto out; +	cred = nfs4_get_clid_cred(clp); +	ret = nfs4_proc_destroy_clientid(clp, cred); +	if (cred) +		put_rpccred(cred); +	switch (ret) { +	case 0: +	case -NFS4ERR_STALE_CLIENTID: +		clp->cl_exchange_flags = 0; +	} +out: +	return ret; +} +  struct nfs4_get_lease_time_data {  	struct nfs4_get_lease_time_args *args;  	struct nfs4_get_lease_time_res *res; @@ -4545,20 +6836,16 @@ struct nfs4_get_lease_time_data {  static void nfs4_get_lease_time_prepare(struct rpc_task *task,  					void *calldata)  { -	int ret;  	struct nfs4_get_lease_time_data *data =  			(struct nfs4_get_lease_time_data *)calldata;  	dprintk("--> %s\n", __func__); -	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);  	/* just setup sequence, do not trigger session recovery  	   since we're invoked within one */ -	ret = nfs41_setup_sequence(data->clp->cl_session, -				   &data->args->la_seq_args, -				   &data->res->lr_seq_res, 0, task); - -	BUG_ON(ret == -EAGAIN); -	rpc_call_start(task); +	nfs41_setup_sequence(data->clp->cl_session, +			&data->args->la_seq_args, +			&data->res->lr_seq_res, +			task);  	dprintk("<-- %s\n", __func__);  } @@ -4580,13 +6867,15 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)  		dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status);  		rpc_delay(task, NFS4_POLL_RETRY_MIN);  		task->tk_status = 0; -		nfs_restart_rpc(task, data->clp); +		/* fall through */ +	case -NFS4ERR_RETRY_UNCACHED_REP: +		rpc_restart_call_prepare(task);  		return;  	}  	dprintk("<-- %s\n", __func__);  } -struct rpc_call_ops nfs4_get_lease_time_ops = { +static const struct rpc_call_ops nfs4_get_lease_time_ops = {  	.rpc_call_prepare = nfs4_get_lease_time_prepare,  	.rpc_call_done = nfs4_get_lease_time_done,  }; @@ -4612,10 +6901,13 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)  		.rpc_client = 
clp->cl_rpcclient,  		.rpc_message = &msg,  		.callback_ops = &nfs4_get_lease_time_ops, -		.callback_data = &data +		.callback_data = &data, +		.flags = RPC_TASK_TIMEOUT,  	};  	int status; +	nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); +	nfs4_set_sequence_privileged(&args.la_seq_args);  	dprintk("--> %s\n", __func__);  	task = rpc_run_task(&task_setup); @@ -4631,170 +6923,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)  }  /* - * Reset a slot table - */ -static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, -				 int ivalue) -{ -	struct nfs4_slot *new = NULL; -	int i; -	int ret = 0; - -	dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, -		max_reqs, tbl->max_slots); - -	/* Does the newly negotiated max_reqs match the existing slot table? */ -	if (max_reqs != tbl->max_slots) { -		ret = -ENOMEM; -		new = kmalloc(max_reqs * sizeof(struct nfs4_slot), -			      GFP_NOFS); -		if (!new) -			goto out; -		ret = 0; -		kfree(tbl->slots); -	} -	spin_lock(&tbl->slot_tbl_lock); -	if (new) { -		tbl->slots = new; -		tbl->max_slots = max_reqs; -	} -	for (i = 0; i < tbl->max_slots; ++i) -		tbl->slots[i].seq_nr = ivalue; -	spin_unlock(&tbl->slot_tbl_lock); -	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, -		tbl, tbl->slots, tbl->max_slots); -out: -	dprintk("<-- %s: return %d\n", __func__, ret); -	return ret; -} - -/* - * Reset the forechannel and backchannel slot tables - */ -static int nfs4_reset_slot_tables(struct nfs4_session *session) -{ -	int status; - -	status = nfs4_reset_slot_table(&session->fc_slot_table, -			session->fc_attrs.max_reqs, 1); -	if (status) -		return status; - -	status = nfs4_reset_slot_table(&session->bc_slot_table, -			session->bc_attrs.max_reqs, 0); -	return status; -} - -/* Destroy the slot table */ -static void nfs4_destroy_slot_tables(struct nfs4_session *session) -{ -	if (session->fc_slot_table.slots != NULL) { -		kfree(session->fc_slot_table.slots); -		
session->fc_slot_table.slots = NULL; -	} -	if (session->bc_slot_table.slots != NULL) { -		kfree(session->bc_slot_table.slots); -		session->bc_slot_table.slots = NULL; -	} -	return; -} - -/* - * Initialize slot table - */ -static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, -		int max_slots, int ivalue) -{ -	struct nfs4_slot *slot; -	int ret = -ENOMEM; - -	BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE); - -	dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); - -	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); -	if (!slot) -		goto out; -	ret = 0; - -	spin_lock(&tbl->slot_tbl_lock); -	tbl->max_slots = max_slots; -	tbl->slots = slot; -	tbl->highest_used_slotid = -1;  /* no slot is currently used */ -	spin_unlock(&tbl->slot_tbl_lock); -	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, -		tbl, tbl->slots, tbl->max_slots); -out: -	dprintk("<-- %s: return %d\n", __func__, ret); -	return ret; -} - -/* - * Initialize the forechannel and backchannel tables - */ -static int nfs4_init_slot_tables(struct nfs4_session *session) -{ -	struct nfs4_slot_table *tbl; -	int status = 0; - -	tbl = &session->fc_slot_table; -	if (tbl->slots == NULL) { -		status = nfs4_init_slot_table(tbl, -				session->fc_attrs.max_reqs, 1); -		if (status) -			return status; -	} - -	tbl = &session->bc_slot_table; -	if (tbl->slots == NULL) { -		status = nfs4_init_slot_table(tbl, -				session->bc_attrs.max_reqs, 0); -		if (status) -			nfs4_destroy_slot_tables(session); -	} - -	return status; -} - -struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) -{ -	struct nfs4_session *session; -	struct nfs4_slot_table *tbl; - -	session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); -	if (!session) -		return NULL; - -	init_completion(&session->complete); - -	tbl = &session->fc_slot_table; -	tbl->highest_used_slotid = -1; -	spin_lock_init(&tbl->slot_tbl_lock); -	rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table"); - -	tbl = &session->bc_slot_table; -	
tbl->highest_used_slotid = -1; -	spin_lock_init(&tbl->slot_tbl_lock); -	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); - -	session->session_state = 1<<NFS4_SESSION_INITING; - -	session->clp = clp; -	return session; -} - -void nfs4_destroy_session(struct nfs4_session *session) -{ -	nfs4_proc_destroy_session(session); -	dprintk("%s Destroy backchannel for xprt %p\n", -		__func__, session->clp->cl_rpcclient->cl_xprt); -	xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt, -				NFS41_BC_MIN_CALLBACKS); -	nfs4_destroy_slot_tables(session); -	kfree(session); -} - -/*   * Initialize the values to be used by the client in CREATE_SESSION   * If nfs4_init_session set the fore channel request and response sizes,   * use them. @@ -4805,20 +6933,16 @@ void nfs4_destroy_session(struct nfs4_session *session)   */  static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)  { -	struct nfs4_session *session = args->client->cl_session; -	unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz, -		     mxresp_sz = session->fc_attrs.max_resp_sz; +	unsigned int max_rqst_sz, max_resp_sz; + +	max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; +	max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; -	if (mxrqst_sz == 0) -		mxrqst_sz = NFS_MAX_FILE_IO_SIZE; -	if (mxresp_sz == 0) -		mxresp_sz = NFS_MAX_FILE_IO_SIZE;  	/* Fore channel attributes */ -	args->fc_attrs.headerpadsz = 0; -	args->fc_attrs.max_rqst_sz = mxrqst_sz; -	args->fc_attrs.max_resp_sz = mxresp_sz; +	args->fc_attrs.max_rqst_sz = max_rqst_sz; +	args->fc_attrs.max_resp_sz = max_resp_sz;  	args->fc_attrs.max_ops = NFS4_MAX_OPS; -	args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; +	args->fc_attrs.max_reqs = max_session_slots;  	dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "  		"max_ops=%u max_reqs=%u\n", @@ -4827,7 +6951,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)  		args->fc_attrs.max_ops, 
args->fc_attrs.max_reqs);  	/* Back channel attributes */ -	args->bc_attrs.headerpadsz = 0;  	args->bc_attrs.max_rqst_sz = PAGE_SIZE;  	args->bc_attrs.max_resp_sz = PAGE_SIZE;  	args->bc_attrs.max_resp_sz_cached = 0; @@ -4847,8 +6970,6 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args  	struct nfs4_channel_attrs *sent = &args->fc_attrs;  	struct nfs4_channel_attrs *rcvd = &session->fc_attrs; -	if (rcvd->headerpadsz > sent->headerpadsz) -		return -EINVAL;  	if (rcvd->max_resp_sz > sent->max_resp_sz)  		return -EINVAL;  	/* @@ -4861,6 +6982,8 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args  		return -EINVAL;  	if (rcvd->max_reqs == 0)  		return -EINVAL; +	if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE) +		rcvd->max_reqs = NFS4_MAX_SLOT_TABLE;  	return 0;  } @@ -4876,9 +6999,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args  	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)  		return -EINVAL;  	/* These would render the backchannel useless: */ -	if (rcvd->max_ops  == 0) +	if (rcvd->max_ops != sent->max_ops)  		return -EINVAL; -	if (rcvd->max_reqs == 0) +	if (rcvd->max_reqs != sent->max_reqs)  		return -EINVAL;  	return 0;  } @@ -4894,7 +7017,8 @@ static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,  	return nfs4_verify_back_channel_attrs(args, session);  } -static int _nfs4_proc_create_session(struct nfs_client *clp) +static int _nfs4_proc_create_session(struct nfs_client *clp, +		struct rpc_cred *cred)  {  	struct nfs4_session *session = clp->cl_session;  	struct nfs41_create_session_args args = { @@ -4908,18 +7032,19 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],  		.rpc_argp = &args,  		.rpc_resp = &res, +		.rpc_cred = cred,  	};  	int status;  	nfs4_init_channel_attrs(&args);  	args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); -	status = 
rpc_call_sync(session->clp->cl_rpcclient, &msg, 0); +	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); +	trace_nfs4_create_session(clp, status); -	if (!status) +	if (!status) {  		/* Verify the session's negotiated channel_attrs values */  		status = nfs4_verify_channel_attrs(&args, session); -	if (!status) {  		/* Increment the clientid slot sequence id */  		clp->cl_seqid++;  	} @@ -4932,7 +7057,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp)   * It is the responsibility of the caller to verify the session is   * expired before calling this routine.   */ -int nfs4_proc_create_session(struct nfs_client *clp) +int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred)  {  	int status;  	unsigned *ptr; @@ -4940,17 +7065,13 @@ int nfs4_proc_create_session(struct nfs_client *clp)  	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); -	status = _nfs4_proc_create_session(clp); +	status = _nfs4_proc_create_session(clp, cred);  	if (status)  		goto out; -	/* Init and reset the fore channel */ -	status = nfs4_init_slot_tables(session); -	dprintk("slot table initialization returned %d\n", status); -	if (status) -		goto out; -	status = nfs4_reset_slot_tables(session); -	dprintk("slot table reset returned %d\n", status); +	/* Init or reset the session slot tables */ +	status = nfs4_setup_session_slot_tables(session); +	dprintk("slot table setup returned %d\n", status);  	if (status)  		goto out; @@ -4966,10 +7087,15 @@ out:   * Issue the over-the-wire RPC DESTROY_SESSION.   * The caller must serialize access to this routine.   
*/ -int nfs4_proc_destroy_session(struct nfs4_session *session) +int nfs4_proc_destroy_session(struct nfs4_session *session, +		struct rpc_cred *cred)  { +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION], +		.rpc_argp = session, +		.rpc_cred = cred, +	};  	int status = 0; -	struct rpc_message msg;  	dprintk("--> nfs4_proc_destroy_session\n"); @@ -4977,51 +7103,17 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)  	if (session->clp->cl_cons_state != NFS_CS_READY)  		return status; -	msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION]; -	msg.rpc_argp = session; -	msg.rpc_resp = NULL; -	msg.rpc_cred = NULL; -	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, 0); +	status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); +	trace_nfs4_destroy_session(session->clp, status);  	if (status) -		printk(KERN_WARNING -			"Got error %d from the server on DESTROY_SESSION. " +		dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "  			"Session has been destroyed regardless...\n", status);  	dprintk("<-- nfs4_proc_destroy_session\n");  	return status;  } -int nfs4_init_session(struct nfs_server *server) -{ -	struct nfs_client *clp = server->nfs_client; -	struct nfs4_session *session; -	unsigned int rsize, wsize; -	int ret; - -	if (!nfs4_has_session(clp)) -		return 0; - -	session = clp->cl_session; -	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) -		return 0; - -	rsize = server->rsize; -	if (rsize == 0) -		rsize = NFS_MAX_FILE_IO_SIZE; -	wsize = server->wsize; -	if (wsize == 0) -		wsize = NFS_MAX_FILE_IO_SIZE; - -	session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; -	session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; - -	ret = nfs4_recover_expired_lease(server); -	if (!ret) -		ret = nfs4_check_client_ready(clp); -	return ret; -} -  /*   * Renew the cl_session lease.   
*/ @@ -5049,7 +7141,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client  		rpc_delay(task, NFS4_POLL_RETRY_MAX);  		return -EAGAIN;  	default: -		nfs4_schedule_state_recovery(clp); +		nfs4_schedule_lease_recovery(clp);  	}  	return 0;  } @@ -5062,6 +7154,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)  	if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))  		return; +	trace_nfs4_sequence(clp, task->tk_status);  	if (task->tk_status < 0) {  		dprintk("%s ERROR %d\n", __func__, task->tk_status);  		if (atomic_read(&clp->cl_count) == 1) @@ -5087,9 +7180,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)  	args = task->tk_msg.rpc_argp;  	res = task->tk_msg.rpc_resp; -	if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) -		return; -	rpc_call_start(task); +	nfs41_setup_sequence(clp->cl_session, args, res, task);  }  static const struct rpc_call_ops nfs41_sequence_ops = { @@ -5098,7 +7189,9 @@ static const struct rpc_call_ops nfs41_sequence_ops = {  	.rpc_release = nfs41_sequence_release,  }; -static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, +		struct rpc_cred *cred, +		bool is_privileged)  {  	struct nfs4_sequence_data *calldata;  	struct rpc_message msg = { @@ -5109,7 +7202,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_  		.rpc_client = clp->cl_rpcclient,  		.rpc_message = &msg,  		.callback_ops = &nfs41_sequence_ops, -		.flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, +		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,  	};  	if (!atomic_inc_not_zero(&clp->cl_count)) @@ -5119,6 +7212,9 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_  		nfs_put_client(clp);  		return ERR_PTR(-ENOMEM);  	} +	nfs4_init_sequence(&calldata->args, &calldata->res, 0); +	if (is_privileged) +		
nfs4_set_sequence_privileged(&calldata->args);  	msg.rpc_argp = &calldata->args;  	msg.rpc_resp = &calldata->res;  	calldata->clp = clp; @@ -5127,16 +7223,18 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_  	return rpc_run_task(&task_setup_data);  } -static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)  {  	struct rpc_task *task;  	int ret = 0; -	task = _nfs41_proc_sequence(clp, cred); +	if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) +		return 0; +	task = _nfs41_proc_sequence(clp, cred, false);  	if (IS_ERR(task))  		ret = PTR_ERR(task);  	else -		rpc_put_task(task); +		rpc_put_task_async(task);  	dprintk("<-- %s status=%d\n", __func__, ret);  	return ret;  } @@ -5146,14 +7244,19 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)  	struct rpc_task *task;  	int ret; -	task = _nfs41_proc_sequence(clp, cred); +	task = _nfs41_proc_sequence(clp, cred, true);  	if (IS_ERR(task)) {  		ret = PTR_ERR(task);  		goto out;  	}  	ret = rpc_wait_for_completion_task(task); -	if (!ret) +	if (!ret) { +		struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + +		if (task->tk_status == 0) +			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);  		ret = task->tk_status; +	}  	rpc_put_task(task);  out:  	dprintk("<-- %s status=%d\n", __func__, ret); @@ -5170,13 +7273,10 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)  {  	struct nfs4_reclaim_complete_data *calldata = data; -	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); -	if (nfs41_setup_sequence(calldata->clp->cl_session, -				&calldata->arg.seq_args, -				&calldata->res.seq_res, 0, task)) -		return; - -	rpc_call_start(task); +	nfs41_setup_sequence(calldata->clp->cl_session, +			&calldata->arg.seq_args, +			&calldata->res.seq_res, +			task);  }  static int 
nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) @@ -5188,9 +7288,11 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf  		break;  	case -NFS4ERR_DELAY:  		rpc_delay(task, NFS4_POLL_RETRY_MAX); +		/* fall through */ +	case -NFS4ERR_RETRY_UNCACHED_REP:  		return -EAGAIN;  	default: -		nfs4_schedule_state_recovery(clp); +		nfs4_schedule_lease_recovery(clp);  	}  	return 0;  } @@ -5205,6 +7307,7 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)  	if (!nfs41_sequence_done(task, res))  		return; +	trace_nfs4_reclaim_complete(clp, task->tk_status);  	if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {  		rpc_restart_call_prepare(task);  		return; @@ -5228,12 +7331,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {  /*   * Issue a global reclaim complete.   */ -static int nfs41_proc_reclaim_complete(struct nfs_client *clp) +static int nfs41_proc_reclaim_complete(struct nfs_client *clp, +		struct rpc_cred *cred)  {  	struct nfs4_reclaim_complete_data *calldata;  	struct rpc_task *task;  	struct rpc_message msg = {  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], +		.rpc_cred = cred,  	};  	struct rpc_task_setup task_setup_data = {  		.rpc_client = clp->cl_rpcclient, @@ -5250,6 +7355,8 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)  	calldata->clp = clp;  	calldata->arg.one_fs = 0; +	nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); +	nfs4_set_sequence_privileged(&calldata->arg.seq_args);  	msg.rpc_argp = &calldata->arg;  	msg.rpc_resp = &calldata->res;  	task_setup_data.callback_data = calldata; @@ -5258,6 +7365,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)  		status = PTR_ERR(task);  		goto out;  	} +	status = nfs4_wait_for_completion_rpc_task(task); +	if (status == 0) +		status = task->tk_status;  	rpc_put_task(task);  	return 0;  out: @@ -5269,51 +7379,153 @@ 
static void  nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs4_layoutget *lgp = calldata; -	struct inode *ino = lgp->args.inode; -	struct nfs_server *server = NFS_SERVER(ino); +	struct nfs_server *server = NFS_SERVER(lgp->args.inode); +	struct nfs4_session *session = nfs4_get_session(server);  	dprintk("--> %s\n", __func__); -	if (nfs4_setup_sequence(server, &lgp->args.seq_args, -				&lgp->res.seq_res, 0, task)) +	/* Note the is a race here, where a CB_LAYOUTRECALL can come in +	 * right now covering the LAYOUTGET we are about to send. +	 * However, that is not so catastrophic, and there seems +	 * to be no way to prevent it completely. +	 */ +	if (nfs41_setup_sequence(session, &lgp->args.seq_args, +				&lgp->res.seq_res, task))  		return; -	rpc_call_start(task); +	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, +					  NFS_I(lgp->args.inode)->layout, +					  lgp->args.ctx->state)) { +		rpc_exit(task, NFS4_OK); +	}  }  static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)  {  	struct nfs4_layoutget *lgp = calldata; -	struct nfs_server *server = NFS_SERVER(lgp->args.inode); +	struct inode *inode = lgp->args.inode; +	struct nfs_server *server = NFS_SERVER(inode); +	struct pnfs_layout_hdr *lo; +	struct nfs4_state *state = NULL; +	unsigned long timeo, now, giveup; -	dprintk("--> %s\n", __func__); +	dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); -	if (!nfs4_sequence_done(task, &lgp->res.seq_res)) -		return; +	if (!nfs41_sequence_done(task, &lgp->res.seq_res)) +		goto out;  	switch (task->tk_status) {  	case 0: -		break; +		goto out; +	/* +	 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client +	 * (or clients) writing to the same RAID stripe +	 */  	case -NFS4ERR_LAYOUTTRYLATER: +	/* +	 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall +	 * existing layout before getting a new one). 
+	 */  	case -NFS4ERR_RECALLCONFLICT: -		task->tk_status = -NFS4ERR_DELAY; -		/* Fall through */ -	default: -		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { +		timeo = rpc_get_timeout(task->tk_client); +		giveup = lgp->args.timestamp + timeo; +		now = jiffies; +		if (time_after(giveup, now)) { +			unsigned long delay; + +			/* Delay for: +			 * - Not less then NFS4_POLL_RETRY_MIN. +			 * - One last time a jiffie before we give up +			 * - exponential backoff (time_now minus start_attempt) +			 */ +			delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, +				    min((giveup - now - 1), +					now - lgp->args.timestamp)); + +			dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", +				__func__, delay); +			rpc_delay(task, delay); +			task->tk_status = 0;  			rpc_restart_call_prepare(task); -			return; +			goto out; /* Do not call nfs4_async_handle_error() */ +		} +		break; +	case -NFS4ERR_EXPIRED: +	case -NFS4ERR_BAD_STATEID: +		spin_lock(&inode->i_lock); +		lo = NFS_I(inode)->layout; +		if (!lo || list_empty(&lo->plh_segs)) { +			spin_unlock(&inode->i_lock); +			/* If the open stateid was bad, then recover it. */ +			state = lgp->args.ctx->state; +		} else { +			LIST_HEAD(head); + +			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); +			spin_unlock(&inode->i_lock); +			/* Mark the bad layout state as invalid, then +			 * retry using the open stateid. 
*/ +			pnfs_free_lseg_list(&head);  		}  	} -	lgp->status = task->tk_status; +	if (nfs4_async_handle_error(task, server, state) == -EAGAIN) +		rpc_restart_call_prepare(task); +out:  	dprintk("<-- %s\n", __func__);  } +static size_t max_response_pages(struct nfs_server *server) +{ +	u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; +	return nfs_page_array_len(0, max_resp_sz); +} + +static void nfs4_free_pages(struct page **pages, size_t size) +{ +	int i; + +	if (!pages) +		return; + +	for (i = 0; i < size; i++) { +		if (!pages[i]) +			break; +		__free_page(pages[i]); +	} +	kfree(pages); +} + +static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) +{ +	struct page **pages; +	int i; + +	pages = kcalloc(size, sizeof(struct page *), gfp_flags); +	if (!pages) { +		dprintk("%s: can't alloc array of %zu pages\n", __func__, size); +		return NULL; +	} + +	for (i = 0; i < size; i++) { +		pages[i] = alloc_page(gfp_flags); +		if (!pages[i]) { +			dprintk("%s: failed to allocate page\n", __func__); +			nfs4_free_pages(pages, size); +			return NULL; +		} +	} + +	return pages; +} +  static void nfs4_layoutget_release(void *calldata)  {  	struct nfs4_layoutget *lgp = calldata; +	struct inode *inode = lgp->args.inode; +	struct nfs_server *server = NFS_SERVER(inode); +	size_t max_pages = max_response_pages(server);  	dprintk("--> %s\n", __func__); -	put_layout_hdr(lgp->args.inode); -	if (lgp->res.layout.buf != NULL) -		free_page((unsigned long) lgp->res.layout.buf); +	nfs4_free_pages(lgp->args.layout.pages, max_pages); +	pnfs_put_layout_hdr(NFS_I(inode)->layout);  	put_nfs_open_context(lgp->args.ctx);  	kfree(calldata);  	dprintk("<-- %s\n", __func__); @@ -5325,14 +7537,18 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {  	.rpc_release = nfs4_layoutget_release,  }; -int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) +struct pnfs_layout_segment * +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)  { -	struct nfs_server 
*server = NFS_SERVER(lgp->args.inode); +	struct inode *inode = lgp->args.inode; +	struct nfs_server *server = NFS_SERVER(inode); +	size_t max_pages = max_response_pages(server);  	struct rpc_task *task;  	struct rpc_message msg = {  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],  		.rpc_argp = &lgp->args,  		.rpc_resp = &lgp->res, +		.rpc_cred = lgp->cred,  	};  	struct rpc_task_setup task_setup_data = {  		.rpc_client = server->client, @@ -5341,35 +7557,186 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)  		.callback_data = lgp,  		.flags = RPC_TASK_ASYNC,  	}; +	struct pnfs_layout_segment *lseg = NULL;  	int status = 0;  	dprintk("--> %s\n", __func__); -	lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); -	if (lgp->res.layout.buf == NULL) { +	lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); +	if (!lgp->args.layout.pages) {  		nfs4_layoutget_release(lgp); -		return -ENOMEM; +		return ERR_PTR(-ENOMEM);  	} +	lgp->args.layout.pglen = max_pages * PAGE_SIZE; +	lgp->args.timestamp = jiffies; +	lgp->res.layoutp = &lgp->args.layout;  	lgp->res.seq_res.sr_slot = NULL; +	nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + +	/* nfs4_layoutget_release calls pnfs_put_layout_hdr */ +	pnfs_get_layout_hdr(NFS_I(inode)->layout); +  	task = rpc_run_task(&task_setup_data);  	if (IS_ERR(task)) -		return PTR_ERR(task); +		return ERR_CAST(task);  	status = nfs4_wait_for_completion_rpc_task(task); -	if (status != 0) -		goto out; -	status = lgp->status; -	if (status != 0) -		goto out; -	status = pnfs_layout_process(lgp); -out: +	if (status == 0) +		status = task->tk_status; +	trace_nfs4_layoutget(lgp->args.ctx, +			&lgp->args.range, +			&lgp->res.range, +			status); +	/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ +	if (status == 0 && lgp->res.layoutp->len) +		lseg = pnfs_layout_process(lgp);  	rpc_put_task(task);  	dprintk("<-- %s status=%d\n", __func__, status); +	if (status) +		return ERR_PTR(status); +	return 
lseg; +} + +static void +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) +{ +	struct nfs4_layoutreturn *lrp = calldata; + +	dprintk("--> %s\n", __func__); +	nfs41_setup_sequence(lrp->clp->cl_session, +			&lrp->args.seq_args, +			&lrp->res.seq_res, +			task); +} + +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) +{ +	struct nfs4_layoutreturn *lrp = calldata; +	struct nfs_server *server; + +	dprintk("--> %s\n", __func__); + +	if (!nfs41_sequence_done(task, &lrp->res.seq_res)) +		return; + +	server = NFS_SERVER(lrp->args.inode); +	switch (task->tk_status) { +	default: +		task->tk_status = 0; +	case 0: +		break; +	case -NFS4ERR_DELAY: +		if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) +			break; +		rpc_restart_call_prepare(task); +		return; +	} +	dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutreturn_release(void *calldata) +{ +	struct nfs4_layoutreturn *lrp = calldata; +	struct pnfs_layout_hdr *lo = lrp->args.layout; + +	dprintk("--> %s\n", __func__); +	spin_lock(&lo->plh_inode->i_lock); +	if (lrp->res.lrs_present) +		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); +	lo->plh_block_lgets--; +	spin_unlock(&lo->plh_inode->i_lock); +	pnfs_put_layout_hdr(lrp->args.layout); +	kfree(calldata); +	dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { +	.rpc_call_prepare = nfs4_layoutreturn_prepare, +	.rpc_call_done = nfs4_layoutreturn_done, +	.rpc_release = nfs4_layoutreturn_release, +}; + +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) +{ +	struct rpc_task *task; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], +		.rpc_argp = &lrp->args, +		.rpc_resp = &lrp->res, +		.rpc_cred = lrp->cred, +	}; +	struct rpc_task_setup task_setup_data = { +		.rpc_client = NFS_SERVER(lrp->args.inode)->client, +		.rpc_message = &msg, +		.callback_ops = &nfs4_layoutreturn_call_ops, +		.callback_data = lrp, +	}; +	int status; + +	
dprintk("--> %s\n", __func__); +	nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); +	task = rpc_run_task(&task_setup_data); +	if (IS_ERR(task)) +		return PTR_ERR(task); +	status = task->tk_status; +	trace_nfs4_layoutreturn(lrp->args.inode, status); +	dprintk("<-- %s status=%d\n", __func__, status); +	rpc_put_task(task);  	return status;  } +/* + * Retrieve the list of Data Server devices from the MDS. + */ +static int _nfs4_getdevicelist(struct nfs_server *server, +				    const struct nfs_fh *fh, +				    struct pnfs_devicelist *devlist) +{ +	struct nfs4_getdevicelist_args args = { +		.fh = fh, +		.layoutclass = server->pnfs_curr_ld->id, +	}; +	struct nfs4_getdevicelist_res res = { +		.devlist = devlist, +	}; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], +		.rpc_argp = &args, +		.rpc_resp = &res, +	}; +	int status; + +	dprintk("--> %s\n", __func__); +	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, +				&res.seq_res, 0); +	dprintk("<-- %s status=%d\n", __func__, status); +	return status; +} + +int nfs4_proc_getdevicelist(struct nfs_server *server, +			    const struct nfs_fh *fh, +			    struct pnfs_devicelist *devlist) +{ +	struct nfs4_exception exception = { }; +	int err; + +	do { +		err = nfs4_handle_exception(server, +				_nfs4_getdevicelist(server, fh, devlist), +				&exception); +	} while (exception.retry); + +	dprintk("%s: err=%d, num_devs=%u\n", __func__, +		err, devlist->num_devs); + +	return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); +  static int -_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +_nfs4_proc_getdeviceinfo(struct nfs_server *server, +		struct pnfs_device *pdev, +		struct rpc_cred *cred)  {  	struct nfs4_getdeviceinfo_args args = {  		.pdev = pdev, @@ -5381,101 +7748,596 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],  		
.rpc_argp = &args,  		.rpc_resp = &res, +		.rpc_cred = cred,  	};  	int status;  	dprintk("--> %s\n", __func__); -	status = nfs4_call_sync(server, &msg, &args, &res, 0); +	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);  	dprintk("<-- %s status=%d\n", __func__, status);  	return status;  } -int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +int nfs4_proc_getdeviceinfo(struct nfs_server *server, +		struct pnfs_device *pdev, +		struct rpc_cred *cred)  {  	struct nfs4_exception exception = { };  	int err;  	do {  		err = nfs4_handle_exception(server, -					_nfs4_proc_getdeviceinfo(server, pdev), +					_nfs4_proc_getdeviceinfo(server, pdev, cred),  					&exception);  	} while (exception.retry);  	return err;  }  EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) +{ +	struct nfs4_layoutcommit_data *data = calldata; +	struct nfs_server *server = NFS_SERVER(data->args.inode); +	struct nfs4_session *session = nfs4_get_session(server); + +	nfs41_setup_sequence(session, +			&data->args.seq_args, +			&data->res.seq_res, +			task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ +	struct nfs4_layoutcommit_data *data = calldata; +	struct nfs_server *server = NFS_SERVER(data->args.inode); + +	if (!nfs41_sequence_done(task, &data->res.seq_res)) +		return; + +	switch (task->tk_status) { /* Just ignore these failures */ +	case -NFS4ERR_DELEG_REVOKED: /* layout was recalled */ +	case -NFS4ERR_BADIOMODE:     /* no IOMODE_RW layout for range */ +	case -NFS4ERR_BADLAYOUT:     /* no layout */ +	case -NFS4ERR_GRACE:	    /* loca_recalim always false */ +		task->tk_status = 0; +	case 0: +		break; +	default: +		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { +			rpc_restart_call_prepare(task); +			return; +		} +	} +} + +static void nfs4_layoutcommit_release(void *calldata) +{ +	struct 
nfs4_layoutcommit_data *data = calldata; + +	pnfs_cleanup_layoutcommit(data); +	nfs_post_op_update_inode_force_wcc(data->args.inode, +					   data->res.fattr); +	put_rpccred(data->cred); +	kfree(data); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { +	.rpc_call_prepare = nfs4_layoutcommit_prepare, +	.rpc_call_done = nfs4_layoutcommit_done, +	.rpc_release = nfs4_layoutcommit_release, +}; + +int +nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) +{ +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], +		.rpc_argp = &data->args, +		.rpc_resp = &data->res, +		.rpc_cred = data->cred, +	}; +	struct rpc_task_setup task_setup_data = { +		.task = &data->task, +		.rpc_client = NFS_CLIENT(data->args.inode), +		.rpc_message = &msg, +		.callback_ops = &nfs4_layoutcommit_ops, +		.callback_data = data, +		.flags = RPC_TASK_ASYNC, +	}; +	struct rpc_task *task; +	int status = 0; + +	dprintk("NFS: %4d initiating layoutcommit call. 
sync %d " +		"lbw: %llu inode %lu\n", +		data->task.tk_pid, sync, +		data->args.lastbytewritten, +		data->args.inode->i_ino); + +	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); +	task = rpc_run_task(&task_setup_data); +	if (IS_ERR(task)) +		return PTR_ERR(task); +	if (sync == false) +		goto out; +	status = nfs4_wait_for_completion_rpc_task(task); +	if (status != 0) +		goto out; +	status = task->tk_status; +	trace_nfs4_layoutcommit(data->args.inode, status); +out: +	dprintk("%s: status %d\n", __func__, status); +	rpc_put_task(task); +	return status; +} + +/** + * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if + * possible) as per RFC3530bis and RFC5661 Security Considerations sections + */ +static int +_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, +		    struct nfs_fsinfo *info, +		    struct nfs4_secinfo_flavors *flavors, bool use_integrity) +{ +	struct nfs41_secinfo_no_name_args args = { +		.style = SECINFO_STYLE_CURRENT_FH, +	}; +	struct nfs4_secinfo_res res = { +		.flavors = flavors, +	}; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME], +		.rpc_argp = &args, +		.rpc_resp = &res, +	}; +	struct rpc_clnt *clnt = server->client; +	struct rpc_cred *cred = NULL; +	int status; + +	if (use_integrity) { +		clnt = server->nfs_client->cl_rpcclient; +		cred = nfs4_get_clid_cred(server->nfs_client); +		msg.rpc_cred = cred; +	} + +	dprintk("--> %s\n", __func__); +	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, +				&res.seq_res, 0); +	dprintk("<-- %s status=%d\n", __func__, status); + +	if (cred) +		put_rpccred(cred); + +	return status; +} + +static int +nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, +			   struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ +	struct nfs4_exception exception = { }; +	int err; +	do { +		/* first try using integrity protection */ +		err = -NFS4ERR_WRONGSEC; + +		/* try to 
use integrity protection with machine cred */ +		if (_nfs4_is_integrity_protected(server->nfs_client)) +			err = _nfs41_proc_secinfo_no_name(server, fhandle, info, +							  flavors, true); + +		/* +		 * if unable to use integrity protection, or SECINFO with +		 * integrity protection returns NFS4ERR_WRONGSEC (which is +		 * disallowed by spec, but exists in deployed servers) use +		 * the current filesystem's rpc_client and the user cred. +		 */ +		if (err == -NFS4ERR_WRONGSEC) +			err = _nfs41_proc_secinfo_no_name(server, fhandle, info, +							  flavors, false); + +		switch (err) { +		case 0: +		case -NFS4ERR_WRONGSEC: +		case -ENOTSUPP: +			goto out; +		default: +			err = nfs4_handle_exception(server, err, &exception); +		} +	} while (exception.retry); +out: +	return err; +} + +static int +nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, +		    struct nfs_fsinfo *info) +{ +	int err; +	struct page *page; +	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; +	struct nfs4_secinfo_flavors *flavors; +	struct nfs4_secinfo4 *secinfo; +	int i; + +	page = alloc_page(GFP_KERNEL); +	if (!page) { +		err = -ENOMEM; +		goto out; +	} + +	flavors = page_address(page); +	err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + +	/* +	 * Fall back on "guess and check" method if +	 * the server doesn't support SECINFO_NO_NAME +	 */ +	if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { +		err = nfs4_find_root_sec(server, fhandle, info); +		goto out_freepage; +	} +	if (err) +		goto out_freepage; + +	for (i = 0; i < flavors->num_flavors; i++) { +		secinfo = &flavors->flavors[i]; + +		switch (secinfo->flavor) { +		case RPC_AUTH_NULL: +		case RPC_AUTH_UNIX: +		case RPC_AUTH_GSS: +			flavor = rpcauth_get_pseudoflavor(secinfo->flavor, +					&secinfo->flavor_info); +			break; +		default: +			flavor = RPC_AUTH_MAXFLAVOR; +			break; +		} + +		if (!nfs_auth_info_match(&server->auth_info, flavor)) +			flavor = RPC_AUTH_MAXFLAVOR; + +		if (flavor != 
RPC_AUTH_MAXFLAVOR) { +			err = nfs4_lookup_root_sec(server, fhandle, +						   info, flavor); +			if (!err) +				break; +		} +	} + +	if (flavor == RPC_AUTH_MAXFLAVOR) +		err = -EPERM; + +out_freepage: +	put_page(page); +	if (err == -EACCES) +		return -EPERM; +out: +	return err; +} + +static int _nfs41_test_stateid(struct nfs_server *server, +		nfs4_stateid *stateid, +		struct rpc_cred *cred) +{ +	int status; +	struct nfs41_test_stateid_args args = { +		.stateid = stateid, +	}; +	struct nfs41_test_stateid_res res; +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], +		.rpc_argp = &args, +		.rpc_resp = &res, +		.rpc_cred = cred, +	}; +	struct rpc_clnt *rpc_client = server->client; + +	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, +		&rpc_client, &msg); + +	dprintk("NFS call  test_stateid %p\n", stateid); +	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); +	nfs4_set_sequence_privileged(&args.seq_args); +	status = nfs4_call_sync_sequence(rpc_client, server, &msg, +			&args.seq_args, &res.seq_res); +	if (status != NFS_OK) { +		dprintk("NFS reply test_stateid: failed, %d\n", status); +		return status; +	} +	dprintk("NFS reply test_stateid: succeeded, %d\n", -res.status); +	return -res.status; +} + +/** + * nfs41_test_stateid - perform a TEST_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to test + * @cred: credential + * + * Returns NFS_OK if the server recognizes that "stateid" is valid. + * Otherwise a negative NFS4ERR value is returned if the operation + * failed or the state ID is not currently valid. 
+ */ +static int nfs41_test_stateid(struct nfs_server *server, +		nfs4_stateid *stateid, +		struct rpc_cred *cred) +{ +	struct nfs4_exception exception = { }; +	int err; +	do { +		err = _nfs41_test_stateid(server, stateid, cred); +		if (err != -NFS4ERR_DELAY) +			break; +		nfs4_handle_exception(server, err, &exception); +	} while (exception.retry); +	return err; +} + +struct nfs_free_stateid_data { +	struct nfs_server *server; +	struct nfs41_free_stateid_args args; +	struct nfs41_free_stateid_res res; +}; + +static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) +{ +	struct nfs_free_stateid_data *data = calldata; +	nfs41_setup_sequence(nfs4_get_session(data->server), +			&data->args.seq_args, +			&data->res.seq_res, +			task); +} + +static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) +{ +	struct nfs_free_stateid_data *data = calldata; + +	nfs41_sequence_done(task, &data->res.seq_res); + +	switch (task->tk_status) { +	case -NFS4ERR_DELAY: +		if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) +			rpc_restart_call_prepare(task); +	} +} + +static void nfs41_free_stateid_release(void *calldata) +{ +	kfree(calldata); +} + +static const struct rpc_call_ops nfs41_free_stateid_ops = { +	.rpc_call_prepare = nfs41_free_stateid_prepare, +	.rpc_call_done = nfs41_free_stateid_done, +	.rpc_release = nfs41_free_stateid_release, +}; + +static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, +		nfs4_stateid *stateid, +		struct rpc_cred *cred, +		bool privileged) +{ +	struct rpc_message msg = { +		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], +		.rpc_cred = cred, +	}; +	struct rpc_task_setup task_setup = { +		.rpc_client = server->client, +		.rpc_message = &msg, +		.callback_ops = &nfs41_free_stateid_ops, +		.flags = RPC_TASK_ASYNC, +	}; +	struct nfs_free_stateid_data *data; + +	nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, +		&task_setup.rpc_client, &msg); + +	dprintk("NFS 
call  free_stateid %p\n", stateid); +	data = kmalloc(sizeof(*data), GFP_NOFS); +	if (!data) +		return ERR_PTR(-ENOMEM); +	data->server = server; +	nfs4_stateid_copy(&data->args.stateid, stateid); + +	task_setup.callback_data = data; + +	msg.rpc_argp = &data->args; +	msg.rpc_resp = &data->res; +	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); +	if (privileged) +		nfs4_set_sequence_privileged(&data->args.seq_args); + +	return rpc_run_task(&task_setup); +} + +/** + * nfs41_free_stateid - perform a FREE_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to release + * @cred: credential + * + * Returns NFS_OK if the server freed "stateid".  Otherwise a + * negative NFS4ERR value is returned. + */ +static int nfs41_free_stateid(struct nfs_server *server, +		nfs4_stateid *stateid, +		struct rpc_cred *cred) +{ +	struct rpc_task *task; +	int ret; + +	task = _nfs41_free_stateid(server, stateid, cred, true); +	if (IS_ERR(task)) +		return PTR_ERR(task); +	ret = rpc_wait_for_completion_task(task); +	if (!ret) +		ret = task->tk_status; +	rpc_put_task(task); +	return ret; +} + +static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ +	struct rpc_task *task; +	struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + +	task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); +	nfs4_free_lock_state(server, lsp); +	if (IS_ERR(task)) +		return PTR_ERR(task); +	rpc_put_task(task); +	return 0; +} + +static bool nfs41_match_stateid(const nfs4_stateid *s1, +		const nfs4_stateid *s2) +{ +	if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) +		return false; + +	if (s1->seqid == s2->seqid) +		return true; +	if (s1->seqid == 0 || s2->seqid == 0) +		return true; + +	return false; +} +  #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +static bool nfs4_match_stateid(const nfs4_stateid *s1, +		const nfs4_stateid *s2) +{ +	
return nfs4_stateid_match(s1, s2); +} + + +static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,  	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,  	.recover_open	= nfs4_open_reclaim,  	.recover_lock	= nfs4_lock_reclaim,  	.establish_clid = nfs4_init_clientid, -	.get_clid_cred	= nfs4_get_setclientid_cred, +	.detect_trunking = nfs40_discover_server_trunking,  };  #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,  	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,  	.recover_open	= nfs4_open_reclaim,  	.recover_lock	= nfs4_lock_reclaim,  	.establish_clid = nfs41_init_clientid, -	.get_clid_cred	= nfs4_get_exchange_id_cred,  	.reclaim_complete = nfs41_proc_reclaim_complete, +	.detect_trunking = nfs41_discover_server_trunking,  };  #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,  	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,  	.recover_open	= nfs4_open_expired,  	.recover_lock	= nfs4_lock_expired,  	.establish_clid = nfs4_init_clientid, -	.get_clid_cred	= nfs4_get_setclientid_cred,  };  #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,  	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE, -	.recover_open	= nfs4_open_expired, -	.recover_lock	= nfs4_lock_expired, +	.recover_open	= nfs41_open_expired, +	.recover_lock	= nfs41_lock_expired,  	.establish_clid = nfs41_init_clientid, -	.get_clid_cred	= nfs4_get_exchange_id_cred,  };  #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { +static const struct 
nfs4_state_maintenance_ops nfs40_state_renewal_ops = {  	.sched_state_renewal = nfs4_proc_async_renew,  	.get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,  	.renew_lease = nfs4_proc_renew,  };  #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { +static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {  	.sched_state_renewal = nfs41_proc_async_sequence,  	.get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,  	.renew_lease = nfs4_proc_sequence,  };  #endif +static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = { +	.get_locations = _nfs40_proc_get_locations, +	.fsid_present = _nfs40_proc_fsid_present, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { +	.get_locations = _nfs41_proc_get_locations, +	.fsid_present = _nfs41_proc_fsid_present, +}; +#endif	/* CONFIG_NFS_V4_1 */ +  static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {  	.minor_version = 0, -	.call_sync = _nfs4_call_sync, -	.validate_stateid = nfs4_validate_delegation_stateid, +	.init_caps = NFS_CAP_READDIRPLUS +		| NFS_CAP_ATOMIC_OPEN +		| NFS_CAP_CHANGE_ATTR +		| NFS_CAP_POSIX_LOCK, +	.init_client = nfs40_init_client, +	.shutdown_client = nfs40_shutdown_client, +	.match_stateid = nfs4_match_stateid, +	.find_root_sec = nfs4_find_root_sec, +	.free_lock_state = nfs4_release_lockowner, +	.call_sync_ops = &nfs40_call_sync_ops,  	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,  	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,  	.state_renewal_ops = &nfs40_state_renewal_ops, +	.mig_recovery_ops = &nfs40_mig_recovery_ops,  };  #if defined(CONFIG_NFS_V4_1)  static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {  	.minor_version = 1, -	.call_sync = _nfs4_call_sync_session, -	.validate_stateid = nfs41_validate_delegation_stateid, +	.init_caps = NFS_CAP_READDIRPLUS +		| NFS_CAP_ATOMIC_OPEN +		| NFS_CAP_CHANGE_ATTR +		| NFS_CAP_POSIX_LOCK +		| 
NFS_CAP_STATEID_NFSV41 +		| NFS_CAP_ATOMIC_OPEN_V1, +	.init_client = nfs41_init_client, +	.shutdown_client = nfs41_shutdown_client, +	.match_stateid = nfs41_match_stateid, +	.find_root_sec = nfs41_find_root_sec, +	.free_lock_state = nfs41_free_lock_state, +	.call_sync_ops = &nfs41_call_sync_ops, +	.reboot_recovery_ops = &nfs41_reboot_recovery_ops, +	.nograce_recovery_ops = &nfs41_nograce_recovery_ops, +	.state_renewal_ops = &nfs41_state_renewal_ops, +	.mig_recovery_ops = &nfs41_mig_recovery_ops, +}; +#endif + +#if defined(CONFIG_NFS_V4_2) +static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { +	.minor_version = 2, +	.init_caps = NFS_CAP_READDIRPLUS +		| NFS_CAP_ATOMIC_OPEN +		| NFS_CAP_CHANGE_ATTR +		| NFS_CAP_POSIX_LOCK +		| NFS_CAP_STATEID_NFSV41 +		| NFS_CAP_ATOMIC_OPEN_V1, +	.init_client = nfs41_init_client, +	.shutdown_client = nfs41_shutdown_client, +	.match_stateid = nfs41_match_stateid, +	.find_root_sec = nfs41_find_root_sec, +	.free_lock_state = nfs41_free_lock_state, +	.call_sync_ops = &nfs41_call_sync_ops,  	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,  	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,  	.state_renewal_ops = &nfs41_state_renewal_ops, @@ -5487,15 +8349,39 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {  #if defined(CONFIG_NFS_V4_1)  	[1] = &nfs_v4_1_minor_ops,  #endif +#if defined(CONFIG_NFS_V4_2) +	[2] = &nfs_v4_2_minor_ops, +#endif +}; + +static const struct inode_operations nfs4_dir_inode_operations = { +	.create		= nfs_create, +	.lookup		= nfs_lookup, +	.atomic_open	= nfs_atomic_open, +	.link		= nfs_link, +	.unlink		= nfs_unlink, +	.symlink	= nfs_symlink, +	.mkdir		= nfs_mkdir, +	.rmdir		= nfs_rmdir, +	.mknod		= nfs_mknod, +	.rename		= nfs_rename, +	.permission	= nfs_permission, +	.getattr	= nfs_getattr, +	.setattr	= nfs_setattr, +	.getxattr	= generic_getxattr, +	.setxattr	= generic_setxattr, +	.listxattr	= generic_listxattr, +	.removexattr	= generic_removexattr,  };  static const struct 
inode_operations nfs4_file_inode_operations = {  	.permission	= nfs_permission,  	.getattr	= nfs_getattr,  	.setattr	= nfs_setattr, -	.getxattr	= nfs4_getxattr, -	.setxattr	= nfs4_setxattr, -	.listxattr	= nfs4_listxattr, +	.getxattr	= generic_getxattr, +	.setxattr	= generic_setxattr, +	.listxattr	= generic_listxattr, +	.removexattr	= generic_removexattr,  };  const struct nfs_rpc_ops nfs_v4_clientops = { @@ -5503,19 +8389,22 @@ const struct nfs_rpc_ops nfs_v4_clientops = {  	.dentry_ops	= &nfs4_dentry_operations,  	.dir_inode_ops	= &nfs4_dir_inode_operations,  	.file_inode_ops	= &nfs4_file_inode_operations, +	.file_ops	= &nfs4_file_operations,  	.getroot	= nfs4_proc_get_root, +	.submount	= nfs4_submount, +	.try_mount	= nfs4_try_mount,  	.getattr	= nfs4_proc_getattr,  	.setattr	= nfs4_proc_setattr, -	.lookupfh	= nfs4_proc_lookupfh,  	.lookup		= nfs4_proc_lookup,  	.access		= nfs4_proc_access,  	.readlink	= nfs4_proc_readlink,  	.create		= nfs4_proc_create,  	.remove		= nfs4_proc_remove,  	.unlink_setup	= nfs4_proc_unlink_setup, +	.unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,  	.unlink_done	= nfs4_proc_unlink_done, -	.rename		= nfs4_proc_rename,  	.rename_setup	= nfs4_proc_rename_setup, +	.rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,  	.rename_done	= nfs4_proc_rename_done,  	.link		= nfs4_proc_link,  	.symlink	= nfs4_proc_symlink, @@ -5528,16 +8417,40 @@ const struct nfs_rpc_ops nfs_v4_clientops = {  	.pathconf	= nfs4_proc_pathconf,  	.set_capabilities = nfs4_server_capabilities,  	.decode_dirent	= nfs4_decode_dirent, +	.pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare,  	.read_setup	= nfs4_proc_read_setup,  	.read_done	= nfs4_read_done,  	.write_setup	= nfs4_proc_write_setup,  	.write_done	= nfs4_write_done,  	.commit_setup	= nfs4_proc_commit_setup, +	.commit_rpc_prepare = nfs4_proc_commit_rpc_prepare,  	.commit_done	= nfs4_commit_done,  	.lock		= nfs4_proc_lock,  	.clear_acl_cache = nfs4_zap_acl_attr,  	.close_context  = nfs4_close_context,  	.open_context	
= nfs4_atomic_open, +	.have_delegation = nfs4_have_delegation, +	.return_delegation = nfs4_inode_return_delegation, +	.alloc_client	= nfs4_alloc_client, +	.init_client	= nfs4_init_client, +	.free_client	= nfs4_free_client, +	.create_server	= nfs4_create_server, +	.clone_server	= nfs_clone_server, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { +	.prefix	= XATTR_NAME_NFSV4_ACL, +	.list	= nfs4_xattr_list_nfs4_acl, +	.get	= nfs4_xattr_get_nfs4_acl, +	.set	= nfs4_xattr_set_nfs4_acl, +}; + +const struct xattr_handler *nfs4_xattr_handlers[] = { +	&nfs4_xattr_nfs4_acl_handler, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +	&nfs4_xattr_nfs4_label_handler, +#endif +	NULL  };  /* diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 72b6c580af1..1720d32ffa5 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -49,7 +49,7 @@  #include "nfs4_fs.h"  #include "delegation.h" -#define NFSDBG_FACILITY	NFSDBG_PROC +#define NFSDBG_FACILITY		NFSDBG_STATE  void  nfs4_renew_state(struct work_struct *work) @@ -60,29 +60,36 @@ nfs4_renew_state(struct work_struct *work)  	struct rpc_cred *cred;  	long lease;  	unsigned long last, now; +	unsigned renew_flags = 0;  	ops = clp->cl_mvops->state_renewal_ops;  	dprintk("%s: start\n", __func__); -	/* Are there any active superblocks? */ -	if (list_empty(&clp->cl_superblocks)) + +	if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))  		goto out; +  	spin_lock(&clp->cl_lock);  	lease = clp->cl_lease_time;  	last = clp->cl_last_renewal;  	now = jiffies;  	/* Are we close to a lease timeout? 
*/ -	if (time_after(now, last + lease/3)) { +	if (time_after(now, last + lease/3)) +		renew_flags |= NFS4_RENEW_TIMEOUT; +	if (nfs_delegations_present(clp)) +		renew_flags |= NFS4_RENEW_DELEGATION_CB; + +	if (renew_flags != 0) {  		cred = ops->get_state_renewal_cred_locked(clp);  		spin_unlock(&clp->cl_lock);  		if (cred == NULL) { -			if (list_empty(&clp->cl_delegations)) { +			if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {  				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);  				goto out;  			}  			nfs_expire_all_delegations(clp);  		} else {  			/* Queue an asynchronous RENEW. */ -			ops->sched_state_renewal(clp, cred); +			ops->sched_state_renewal(clp, cred, renew_flags);  			put_rpccred(cred);  			goto out_exp;  		} @@ -110,8 +117,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)  		timeout = 5 * HZ;  	dprintk("%s: requeueing work. Lease period = %ld\n",  			__func__, (timeout + HZ - 1) / HZ); -	cancel_delayed_work(&clp->cl_renewd); -	schedule_delayed_work(&clp->cl_renewd, timeout); +	mod_delayed_work(system_wq, &clp->cl_renewd, timeout);  	set_bit(NFS_CS_RENEWD, &clp->cl_res_state);  	spin_unlock(&clp->cl_lock);  } diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c new file mode 100644 index 00000000000..e799dc3c3b1 --- /dev/null +++ b/fs/nfs/nfs4session.c @@ -0,0 +1,565 @@ +/* + * fs/nfs/nfs4session.c + * + * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com> + * + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/sunrpc/sched.h> +#include <linux/sunrpc/bc_xprt.h> +#include <linux/nfs.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs.h> +#include <linux/module.h> + +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define NFSDBG_FACILITY		NFSDBG_STATE + +static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue) +{ +	tbl->highest_used_slotid = NFS4_NO_SLOT; +	
spin_lock_init(&tbl->slot_tbl_lock); +	rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); +	init_completion(&tbl->complete); +} + +/* + * nfs4_shrink_slot_table - free retired slots from the slot table + */ +static void nfs4_shrink_slot_table(struct nfs4_slot_table  *tbl, u32 newsize) +{ +	struct nfs4_slot **p; +	if (newsize >= tbl->max_slots) +		return; + +	p = &tbl->slots; +	while (newsize--) +		p = &(*p)->next; +	while (*p) { +		struct nfs4_slot *slot = *p; + +		*p = slot->next; +		kfree(slot); +		tbl->max_slots--; +	} +} + +/** + * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete + * @tbl - controlling slot table + * + */ +void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) +{ +	if (nfs4_slot_tbl_draining(tbl)) +		complete(&tbl->complete); +} + +/* + * nfs4_free_slot - free a slot and efficiently update slot table. + * + * freeing a slot is trivially done by clearing its respective bit + * in the bitmap. + * If the freed slotid equals highest_used_slotid we want to update it + * so that the server would be able to size down the slot table if needed, + * otherwise we know that the highest_used_slotid is still in use. + * When updating highest_used_slotid there may be "holes" in the bitmap + * so we need to scan down from highest_used_slotid to 0 looking for the now + * highest slotid in use. + * If none found, highest_used_slotid is set to NFS4_NO_SLOT. 
+ * + * Must be called while holding tbl->slot_tbl_lock + */ +void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) +{ +	u32 slotid = slot->slot_nr; + +	/* clear used bit in bitmap */ +	__clear_bit(slotid, tbl->used_slots); + +	/* update highest_used_slotid when it is freed */ +	if (slotid == tbl->highest_used_slotid) { +		u32 new_max = find_last_bit(tbl->used_slots, slotid); +		if (new_max < slotid) +			tbl->highest_used_slotid = new_max; +		else { +			tbl->highest_used_slotid = NFS4_NO_SLOT; +			nfs4_slot_tbl_drain_complete(tbl); +		} +	} +	dprintk("%s: slotid %u highest_used_slotid %u\n", __func__, +		slotid, tbl->highest_used_slotid); +} + +static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table  *tbl, +		u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ +	struct nfs4_slot *slot; + +	slot = kzalloc(sizeof(*slot), gfp_mask); +	if (slot) { +		slot->table = tbl; +		slot->slot_nr = slotid; +		slot->seq_nr = seq_init; +	} +	return slot; +} + +static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table  *tbl, +		u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ +	struct nfs4_slot **p, *slot; + +	p = &tbl->slots; +	for (;;) { +		if (*p == NULL) { +			*p = nfs4_new_slot(tbl, tbl->max_slots, +					seq_init, gfp_mask); +			if (*p == NULL) +				break; +			tbl->max_slots++; +		} +		slot = *p; +		if (slot->slot_nr == slotid) +			return slot; +		p = &slot->next; +	} +	return ERR_PTR(-ENOMEM); +} + +/* + * nfs4_alloc_slot - efficiently look for a free slot + * + * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap. + * If found, we mark the slot as used, update the highest_used_slotid, + * and respectively set up the sequence operation args. + * + * Note: must be called with under the slot_tbl_lock. 
+ */ +struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) +{ +	struct nfs4_slot *ret = ERR_PTR(-EBUSY); +	u32 slotid; + +	dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", +		__func__, tbl->used_slots[0], tbl->highest_used_slotid, +		tbl->max_slotid + 1); +	slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); +	if (slotid > tbl->max_slotid) +		goto out; +	ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); +	if (IS_ERR(ret)) +		goto out; +	__set_bit(slotid, tbl->used_slots); +	if (slotid > tbl->highest_used_slotid || +			tbl->highest_used_slotid == NFS4_NO_SLOT) +		tbl->highest_used_slotid = slotid; +	ret->generation = tbl->generation; + +out: +	dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n", +		__func__, tbl->used_slots[0], tbl->highest_used_slotid, +		!IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT); +	return ret; +} + +static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, +		 u32 max_reqs, u32 ivalue) +{ +	if (max_reqs <= tbl->max_slots) +		return 0; +	if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS))) +		return 0; +	return -ENOMEM; +} + +static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, +		u32 server_highest_slotid, +		u32 ivalue) +{ +	struct nfs4_slot **p; + +	nfs4_shrink_slot_table(tbl, server_highest_slotid + 1); +	p = &tbl->slots; +	while (*p) { +		(*p)->seq_nr = ivalue; +		(*p)->interrupted = 0; +		p = &(*p)->next; +	} +	tbl->highest_used_slotid = NFS4_NO_SLOT; +	tbl->target_highest_slotid = server_highest_slotid; +	tbl->server_highest_slotid = server_highest_slotid; +	tbl->d_target_highest_slotid = 0; +	tbl->d2_target_highest_slotid = 0; +	tbl->max_slotid = server_highest_slotid; +} + +/* + * (re)Initialise a slot table + */ +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, +		u32 max_reqs, u32 ivalue) +{ +	int ret; + +	dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__, +		max_reqs, tbl->max_slots); + +	if (max_reqs > 
NFS4_MAX_SLOT_TABLE) +		max_reqs = NFS4_MAX_SLOT_TABLE; + +	ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); +	if (ret) +		goto out; + +	spin_lock(&tbl->slot_tbl_lock); +	nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); +	spin_unlock(&tbl->slot_tbl_lock); + +	dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__, +		tbl, tbl->slots, tbl->max_slots); +out: +	dprintk("<-- %s: return %d\n", __func__, ret); +	return ret; +} + +/* + * nfs4_release_slot_table - release all slot table entries + */ +static void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +{ +	nfs4_shrink_slot_table(tbl, 0); +} + +/** + * nfs4_shutdown_slot_table - release resources attached to a slot table + * @tbl: slot table to shut down + * + */ +void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl) +{ +	nfs4_release_slot_table(tbl); +	rpc_destroy_wait_queue(&tbl->slot_tbl_waitq); +} + +/** + * nfs4_setup_slot_table - prepare a stand-alone slot table for use + * @tbl: slot table to set up + * @max_reqs: maximum number of requests allowed + * @queue: name to give RPC wait queue + * + * Returns zero on success, or a negative errno. 
+ */ +int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs, +		const char *queue) +{ +	nfs4_init_slot_table(tbl, queue); +	return nfs4_realloc_slot_table(tbl, max_reqs, 0); +} + +static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) +{ +	struct nfs4_sequence_args *args = task->tk_msg.rpc_argp; +	struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; +	struct nfs4_slot *slot = pslot; +	struct nfs4_slot_table *tbl = slot->table; + +	if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) +		return false; +	slot->generation = tbl->generation; +	args->sa_slot = slot; +	res->sr_timestamp = jiffies; +	res->sr_slot = slot; +	res->sr_status_flags = 0; +	res->sr_status = 1; +	return true; +} + +static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, +		struct nfs4_slot *slot) +{ +	if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot)) +		return true; +	return false; +} + +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, +		struct nfs4_slot *slot) +{ +	if (slot->slot_nr > tbl->max_slotid) +		return false; +	return __nfs41_wake_and_assign_slot(tbl, slot); +} + +static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl) +{ +	struct nfs4_slot *slot = nfs4_alloc_slot(tbl); +	if (!IS_ERR(slot)) { +		bool ret = __nfs41_wake_and_assign_slot(tbl, slot); +		if (ret) +			return ret; +		nfs4_free_slot(tbl, slot); +	} +	return false; +} + +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl) +{ +	for (;;) { +		if (!nfs41_try_wake_next_slot_table_entry(tbl)) +			break; +	} +} + +#if defined(CONFIG_NFS_V4_1) + +static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl, +		u32 target_highest_slotid) +{ +	u32 max_slotid; + +	max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid); +	if (max_slotid > tbl->server_highest_slotid) +		max_slotid = tbl->server_highest_slotid; +	if (max_slotid > tbl->target_highest_slotid) +		max_slotid = tbl->target_highest_slotid; +	
tbl->max_slotid = max_slotid; +	nfs41_wake_slot_table(tbl); +} + +/* Update the client's idea of target_highest_slotid */ +static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, +		u32 target_highest_slotid) +{ +	if (tbl->target_highest_slotid == target_highest_slotid) +		return; +	tbl->target_highest_slotid = target_highest_slotid; +	tbl->generation++; +} + +void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, +		u32 target_highest_slotid) +{ +	spin_lock(&tbl->slot_tbl_lock); +	nfs41_set_target_slotid_locked(tbl, target_highest_slotid); +	tbl->d_target_highest_slotid = 0; +	tbl->d2_target_highest_slotid = 0; +	nfs41_set_max_slotid_locked(tbl, target_highest_slotid); +	spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, +		u32 highest_slotid) +{ +	if (tbl->server_highest_slotid == highest_slotid) +		return; +	if (tbl->highest_used_slotid > highest_slotid) +		return; +	/* Deallocate slots */ +	nfs4_shrink_slot_table(tbl, highest_slotid + 1); +	tbl->server_highest_slotid = highest_slotid; +} + +static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2) +{ +	s1 -= s2; +	if (s1 == 0) +		return 0; +	if (s1 < 0) +		return (s1 - 1) >> 1; +	return (s1 + 1) >> 1; +} + +static int nfs41_sign_s32(s32 s1) +{ +	if (s1 > 0) +		return 1; +	if (s1 < 0) +		return -1; +	return 0; +} + +static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2) +{ +	if (!s1 || !s2) +		return true; +	return nfs41_sign_s32(s1) == nfs41_sign_s32(s2); +} + +/* Try to eliminate outliers by checking for sharp changes in the + * derivatives and second derivatives + */ +static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl, +		u32 new_target) +{ +	s32 d_target, d2_target; +	bool ret = true; + +	d_target = nfs41_derivative_target_slotid(new_target, +			tbl->target_highest_slotid); +	d2_target = nfs41_derivative_target_slotid(d_target, +			tbl->d_target_highest_slotid); +	/* Is first derivative same sign? 
*/ +	if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid)) +		ret = false; +	/* Is second derivative same sign? */ +	if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid)) +		ret = false; +	tbl->d_target_highest_slotid = d_target; +	tbl->d2_target_highest_slotid = d2_target; +	return ret; +} + +void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, +		struct nfs4_slot *slot, +		struct nfs4_sequence_res *res) +{ +	spin_lock(&tbl->slot_tbl_lock); +	if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid)) +		nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); +	if (tbl->generation == slot->generation) +		nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); +	nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid); +	spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_release_session_slot_tables(struct nfs4_session *session) +{ +	nfs4_release_slot_table(&session->fc_slot_table); +	nfs4_release_slot_table(&session->bc_slot_table); +} + +/* + * Initialize or reset the forechannel and backchannel tables + */ +int nfs4_setup_session_slot_tables(struct nfs4_session *ses) +{ +	struct nfs4_slot_table *tbl; +	int status; + +	dprintk("--> %s\n", __func__); +	/* Fore channel */ +	tbl = &ses->fc_slot_table; +	tbl->session = ses; +	status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); +	if (status) /* -ENOMEM */ +		return status; +	/* Back channel */ +	tbl = &ses->bc_slot_table; +	tbl->session = ses; +	status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); +	if (status && tbl->slots == NULL) +		/* Fore and back channel share a connection so get +		 * both slot tables or neither */ +		nfs4_release_session_slot_tables(ses); +	return status; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ +	struct nfs4_session *session; + +	session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); +	if (!session) +		return NULL; + +	
nfs4_init_slot_table(&session->fc_slot_table, "ForeChannel Slot table"); +	nfs4_init_slot_table(&session->bc_slot_table, "BackChannel Slot table"); +	session->session_state = 1<<NFS4_SESSION_INITING; + +	session->clp = clp; +	return session; +} + +static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +{ +	nfs4_shutdown_slot_table(&session->fc_slot_table); +	nfs4_shutdown_slot_table(&session->bc_slot_table); +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ +	struct rpc_xprt *xprt; +	struct rpc_cred *cred; + +	cred = nfs4_get_clid_cred(session->clp); +	nfs4_proc_destroy_session(session, cred); +	if (cred) +		put_rpccred(cred); + +	rcu_read_lock(); +	xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); +	rcu_read_unlock(); +	dprintk("%s Destroy backchannel for xprt %p\n", +		__func__, xprt); +	xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); +	nfs4_destroy_session_slot_tables(session); +	kfree(session); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. 
+ */ +static int nfs41_check_session_ready(struct nfs_client *clp) +{ +	int ret; +	 +	if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { +		ret = nfs4_client_recover_expired_lease(clp); +		if (ret) +			return ret; +	} +	if (clp->cl_cons_state < NFS_CS_READY) +		return -EPROTONOSUPPORT; +	smp_rmb(); +	return 0; +} + +int nfs4_init_session(struct nfs_client *clp) +{ +	if (!nfs4_has_session(clp)) +		return 0; + +	clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state); +	return nfs41_check_session_ready(clp); +} + +int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) +{ +	struct nfs4_session *session = clp->cl_session; +	int ret; + +	spin_lock(&clp->cl_lock); +	if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { +		/* +		 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the +		 * DS lease to be equal to the MDS lease. +		 */ +		clp->cl_lease_time = lease_time; +		clp->cl_last_renewal = jiffies; +	} +	spin_unlock(&clp->cl_lock); + +	ret = nfs41_check_session_ready(clp); +	if (ret) +		return ret; +	/* Test for the DS role */ +	if (!is_ds_client(clp)) +		return -ENODEV; +	return 0; +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + +#endif	/* defined(CONFIG_NFS_V4_1) */ diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h new file mode 100644 index 00000000000..b34ada9bc6a --- /dev/null +++ b/fs/nfs/nfs4session.h @@ -0,0 +1,153 @@ +/* + * fs/nfs/nfs4session.h + * + * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com> + * + */ +#ifndef __LINUX_FS_NFS_NFS4SESSION_H +#define __LINUX_FS_NFS_NFS4SESSION_H + +/* maximum number of slots to use */ +#define NFS4_DEF_SLOT_TABLE_SIZE (64U) +#define NFS4_MAX_SLOT_TABLE (1024U) +#define NFS4_NO_SLOT ((u32)-1) + +#if IS_ENABLED(CONFIG_NFS_V4) + +/* Sessions slot seqid */ +struct nfs4_slot { +	struct nfs4_slot_table	*table; +	struct nfs4_slot	*next; +	unsigned long		generation; +	u32			slot_nr; +	u32		 	seq_nr; +	unsigned int		interrupted : 1; +}; + +/* Sessions */ +enum 
nfs4_slot_tbl_state { +	NFS4_SLOT_TBL_DRAINING, +}; + +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +struct nfs4_slot_table { +	struct nfs4_session *session;		/* Parent session */ +	struct nfs4_slot *slots;		/* seqid per slot */ +	unsigned long   used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ +	spinlock_t	slot_tbl_lock; +	struct rpc_wait_queue	slot_tbl_waitq;	/* allocators may wait here */ +	u32		max_slots;		/* # slots in table */ +	u32		max_slotid;		/* Max allowed slotid value */ +	u32		highest_used_slotid;	/* sent to server on each SEQ. +						 * op for dynamic resizing */ +	u32		target_highest_slotid;	/* Server max_slot target */ +	u32		server_highest_slotid;	/* Server highest slotid */ +	s32		d_target_highest_slotid; /* Derivative */ +	s32		d2_target_highest_slotid; /* 2nd derivative */ +	unsigned long	generation;		/* Generation counter for +						   target_highest_slotid */ +	struct completion complete; +	unsigned long	slot_tbl_state; +}; + +/* + * Session related parameters + */ +struct nfs4_session { +	struct nfs4_sessionid		sess_id; +	u32				flags; +	unsigned long			session_state; +	u32				hash_alg; +	u32				ssv_len; + +	/* The fore and back channel */ +	struct nfs4_channel_attrs	fc_attrs; +	struct nfs4_slot_table		fc_slot_table; +	struct nfs4_channel_attrs	bc_attrs; +	struct nfs4_slot_table		bc_slot_table; +	struct nfs_client		*clp; +}; + +enum nfs4_session_state { +	NFS4_SESSION_INITING, +}; + +extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, +		unsigned int max_reqs, const char *queue); +extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); +extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); +extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); +extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, +		struct nfs4_slot *slot); +void nfs41_wake_slot_table(struct 
nfs4_slot_table *tbl); + +static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl) +{ +	return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); +} + +#if defined(CONFIG_NFS_V4_1) +extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, +		u32 target_highest_slotid); +extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, +		struct nfs4_slot *slot, +		struct nfs4_sequence_res *res); + +extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses); + +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern void nfs4_destroy_session(struct nfs4_session *session); +extern int nfs4_init_session(struct nfs_client *clp); +extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ +	if (clp->cl_session) +		return 1; +	return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ +	if (nfs4_has_session(clp)) +		return (clp->cl_session->flags & SESSION4_PERSIST); +	return 0; +} + +#ifdef CONFIG_CRC32 +/* + * nfs_session_id_hash - calculate the crc32 hash for the session id + * @session - pointer to session + */ +#define nfs_session_id_hash(sess_id) \ +	(~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) +#else +#define nfs_session_id_hash(session) (0) +#endif +#else /* defined(CONFIG_NFS_V4_1) */ + +static inline int nfs4_init_session(struct nfs_client *clp) +{ +	return 0; +} + +/* + * Determine if sessions are in use. 
+ */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ +	return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ +	return 0; +} + +#endif /* defined(CONFIG_NFS_V4_1) */ +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ +#endif /* __LINUX_FS_NFS_NFS4SESSION_H */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f575a312673..848f6853c59 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,41 +49,102 @@  #include <linux/ratelimit.h>  #include <linux/workqueue.h>  #include <linux/bitops.h> +#include <linux/jiffies.h> + +#include <linux/sunrpc/clnt.h>  #include "nfs4_fs.h"  #include "callback.h"  #include "delegation.h"  #include "internal.h" +#include "nfs4session.h"  #include "pnfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY		NFSDBG_STATE  #define OPENOWNER_POOL_SIZE	8  const nfs4_stateid zero_stateid; - -static LIST_HEAD(nfs4_clientid_list); +static DEFINE_MUTEX(nfs_clid_init_mutex);  int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)  { -	struct nfs4_setclientid_res clid; +	struct nfs4_setclientid_res clid = { +		.clientid = clp->cl_clientid, +		.confirm = clp->cl_confirm, +	};  	unsigned short port;  	int status; +	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); -	port = nfs_callback_tcpport; +	if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) +		goto do_confirm; +	port = nn->nfs_callback_tcpport;  	if (clp->cl_addr.ss_family == AF_INET6) -		port = nfs_callback_tcpport6; +		port = nn->nfs_callback_tcpport6;  	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);  	if (status != 0)  		goto out; +	clp->cl_clientid = clid.clientid; +	clp->cl_confirm = clid.confirm; +	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm:  	status = nfs4_proc_setclientid_confirm(clp, &clid, cred);  	if (status != 0)  		goto out; -	clp->cl_clientid = clid.clientid; +	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);  	nfs4_schedule_state_renewal(clp);  
out:  	return status;  } +/** + * nfs40_discover_server_trunking - Detect server IP address trunking (mv0) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs40_discover_server_trunking(struct nfs_client *clp, +				   struct nfs_client **result, +				   struct rpc_cred *cred) +{ +	struct nfs4_setclientid_res clid = { +		.clientid = clp->cl_clientid, +		.confirm = clp->cl_confirm, +	}; +	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); +	unsigned short port; +	int status; + +	port = nn->nfs_callback_tcpport; +	if (clp->cl_addr.ss_family == AF_INET6) +		port = nn->nfs_callback_tcpport6; + +	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); +	if (status != 0) +		goto out; +	clp->cl_clientid = clid.clientid; +	clp->cl_confirm = clid.confirm; + +	status = nfs40_walk_client_list(clp, result, cred); +	if (status == 0) { +		/* Sustain the lease, even if it's empty.  If the clientid4 +		 * goes stale it's of no use for trunking discovery. 
*/ +		nfs4_schedule_state_renewal(*result); +	} +out: +	return status; +} +  struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)  {  	struct rpc_cred *cred = NULL; @@ -93,26 +154,30 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)  	return cred;  } -static void nfs4_clear_machine_cred(struct nfs_client *clp) +static void nfs4_root_machine_cred(struct nfs_client *clp)  { -	struct rpc_cred *cred; +	struct rpc_cred *cred, *new; +	new = rpc_lookup_machine_cred(NULL);  	spin_lock(&clp->cl_lock);  	cred = clp->cl_machine_cred; -	clp->cl_machine_cred = NULL; +	clp->cl_machine_cred = new;  	spin_unlock(&clp->cl_lock);  	if (cred != NULL)  		put_rpccred(cred);  } -struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +static struct rpc_cred * +nfs4_get_renew_cred_server_locked(struct nfs_server *server)  { +	struct rpc_cred *cred = NULL;  	struct nfs4_state_owner *sp;  	struct rb_node *pos; -	struct rpc_cred *cred = NULL; -	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { -		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); +	for (pos = rb_first(&server->state_owners); +	     pos != NULL; +	     pos = rb_next(pos)) { +		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);  		if (list_empty(&sp->so_states))  			continue;  		cred = get_rpccred(sp->so_cred); @@ -121,6 +186,88 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)  	return cred;  } +/** + * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + * Caller must hold clp->cl_lock. 
+ */ +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +{ +	struct rpc_cred *cred = NULL; +	struct nfs_server *server; + +	/* Use machine credentials if available */ +	cred = nfs4_get_machine_cred_locked(clp); +	if (cred != NULL) +		goto out; + +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		cred = nfs4_get_renew_cred_server_locked(server); +		if (cred != NULL) +			break; +	} +	rcu_read_unlock(); + +out: +	return cred; +} + +static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl) +{ +	if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { +		spin_lock(&tbl->slot_tbl_lock); +		nfs41_wake_slot_table(tbl); +		spin_unlock(&tbl->slot_tbl_lock); +	} +} + +static void nfs4_end_drain_session(struct nfs_client *clp) +{ +	struct nfs4_session *ses = clp->cl_session; + +	if (clp->cl_slot_tbl) { +		nfs4_end_drain_slot_table(clp->cl_slot_tbl); +		return; +	} + +	if (ses != NULL) { +		nfs4_end_drain_slot_table(&ses->bc_slot_table); +		nfs4_end_drain_slot_table(&ses->fc_slot_table); +	} +} + +static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) +{ +	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); +	spin_lock(&tbl->slot_tbl_lock); +	if (tbl->highest_used_slotid != NFS4_NO_SLOT) { +		reinit_completion(&tbl->complete); +		spin_unlock(&tbl->slot_tbl_lock); +		return wait_for_completion_interruptible(&tbl->complete); +	} +	spin_unlock(&tbl->slot_tbl_lock); +	return 0; +} + +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ +	struct nfs4_session *ses = clp->cl_session; +	int ret = 0; + +	if (clp->cl_slot_tbl) +		return nfs4_drain_slot_tbl(clp->cl_slot_tbl); + +	/* back channel */ +	ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); +	if (ret) +		return ret; +	/* fore channel */ +	return nfs4_drain_slot_tbl(&ses->fc_slot_table); +} +  #if defined(CONFIG_NFS_V4_1)  static int nfs41_setup_state_renewal(struct nfs_client *clp) @@ -128,6 +275,11 @@ static int 
nfs41_setup_state_renewal(struct nfs_client *clp)  	int status;  	struct nfs_fsinfo fsinfo; +	if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { +		nfs4_schedule_state_renewal(clp); +		return 0; +	} +  	status = nfs4_proc_get_lease_time(clp, &fsinfo);  	if (status == 0) {  		/* Update lease time and schedule renewal */ @@ -142,222 +294,163 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)  	return status;  } -static void nfs4_end_drain_session(struct nfs_client *clp) -{ -	struct nfs4_session *ses = clp->cl_session; -	int max_slots; - -	if (ses == NULL) -		return; -	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { -		spin_lock(&ses->fc_slot_table.slot_tbl_lock); -		max_slots = ses->fc_slot_table.max_slots; -		while (max_slots--) { -			struct rpc_task *task; - -			task = rpc_wake_up_next(&ses->fc_slot_table. -						slot_tbl_waitq); -			if (!task) -				break; -			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); -		} -		spin_unlock(&ses->fc_slot_table.slot_tbl_lock); -	} -} - -static int nfs4_begin_drain_session(struct nfs_client *clp) +static void nfs41_finish_session_reset(struct nfs_client *clp)  { -	struct nfs4_session *ses = clp->cl_session; -	struct nfs4_slot_table *tbl = &ses->fc_slot_table; - -	spin_lock(&tbl->slot_tbl_lock); -	set_bit(NFS4_SESSION_DRAINING, &ses->session_state); -	if (tbl->highest_used_slotid != -1) { -		INIT_COMPLETION(ses->complete); -		spin_unlock(&tbl->slot_tbl_lock); -		return wait_for_completion_interruptible(&ses->complete); -	} -	spin_unlock(&tbl->slot_tbl_lock); -	return 0; +	clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +	clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); +	/* create_session negotiated new slot table */ +	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); +	nfs41_setup_state_renewal(clp);  }  int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)  {  	int status; +	if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) +		goto do_confirm;  	
nfs4_begin_drain_session(clp);  	status = nfs4_proc_exchange_id(clp, cred);  	if (status != 0)  		goto out; -	status = nfs4_proc_create_session(clp); +	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: +	status = nfs4_proc_create_session(clp, cred);  	if (status != 0)  		goto out; -	nfs41_setup_state_renewal(clp); +	nfs41_finish_session_reset(clp);  	nfs_mark_client_ready(clp, NFS_CS_READY);  out:  	return status;  } -struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) +/** + * nfs41_discover_server_trunking - Detect server IP address trunking (mv1) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status. + * If NFS4_OK is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs41_discover_server_trunking(struct nfs_client *clp, +				   struct nfs_client **result, +				   struct rpc_cred *cred)  { -	struct rpc_cred *cred; +	int status; -	spin_lock(&clp->cl_lock); -	cred = nfs4_get_machine_cred_locked(clp); -	spin_unlock(&clp->cl_lock); -	return cred; +	status = nfs4_proc_exchange_id(clp, cred); +	if (status != NFS4_OK) +		return status; +	set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + +	return nfs41_walk_client_list(clp, result, cred);  }  #endif /* CONFIG_NFS_V4_1 */ -struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +/** + * nfs4_get_clid_cred - Acquire credential for a setclientid operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. 
+ */ +struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp)  { -	struct nfs4_state_owner *sp; -	struct rb_node *pos;  	struct rpc_cred *cred;  	spin_lock(&clp->cl_lock);  	cred = nfs4_get_machine_cred_locked(clp); -	if (cred != NULL) -		goto out; -	pos = rb_first(&clp->cl_state_owners); -	if (pos != NULL) { -		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); -		cred = get_rpccred(sp->so_cred); -	} -out:  	spin_unlock(&clp->cl_lock);  	return cred;  } -static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, -		__u64 minval, int maxbits) -{ -	struct rb_node **p, *parent; -	struct nfs_unique_id *pos; -	__u64 mask = ~0ULL; - -	if (maxbits < 64) -		mask = (1ULL << maxbits) - 1ULL; - -	/* Ensure distribution is more or less flat */ -	get_random_bytes(&new->id, sizeof(new->id)); -	new->id &= mask; -	if (new->id < minval) -		new->id += minval; -retry: -	p = &root->rb_node; -	parent = NULL; - -	while (*p != NULL) { -		parent = *p; -		pos = rb_entry(parent, struct nfs_unique_id, rb_node); - -		if (new->id < pos->id) -			p = &(*p)->rb_left; -		else if (new->id > pos->id) -			p = &(*p)->rb_right; -		else -			goto id_exists; -	} -	rb_link_node(&new->rb_node, parent, p); -	rb_insert_color(&new->rb_node, root); -	return; -id_exists: -	for (;;) { -		new->id++; -		if (new->id < minval || (new->id & mask) != new->id) { -			new->id = minval; -			break; -		} -		parent = rb_next(parent); -		if (parent == NULL) -			break; -		pos = rb_entry(parent, struct nfs_unique_id, rb_node); -		if (new->id < pos->id) -			break; -	} -	goto retry; -} - -static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) -{ -	rb_erase(&id->rb_node, root); -} -  static struct nfs4_state_owner * -nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) +nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)  { -	struct nfs_client *clp = server->nfs_client; -	struct rb_node **p = &clp->cl_state_owners.rb_node, +	
struct rb_node **p = &server->state_owners.rb_node,  		       *parent = NULL; -	struct nfs4_state_owner *sp, *res = NULL; +	struct nfs4_state_owner *sp;  	while (*p != NULL) {  		parent = *p; -		sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); +		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); -		if (server < sp->so_server) { -			p = &parent->rb_left; -			continue; -		} -		if (server > sp->so_server) { -			p = &parent->rb_right; -			continue; -		}  		if (cred < sp->so_cred)  			p = &parent->rb_left;  		else if (cred > sp->so_cred)  			p = &parent->rb_right;  		else { +			if (!list_empty(&sp->so_lru)) +				list_del_init(&sp->so_lru);  			atomic_inc(&sp->so_count); -			res = sp; -			break; +			return sp;  		}  	} -	return res; +	return NULL;  }  static struct nfs4_state_owner * -nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) +nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)  { -	struct rb_node **p = &clp->cl_state_owners.rb_node, +	struct nfs_server *server = new->so_server; +	struct rb_node **p = &server->state_owners.rb_node,  		       *parent = NULL;  	struct nfs4_state_owner *sp; +	int err;  	while (*p != NULL) {  		parent = *p; -		sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); +		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); -		if (new->so_server < sp->so_server) { -			p = &parent->rb_left; -			continue; -		} -		if (new->so_server > sp->so_server) { -			p = &parent->rb_right; -			continue; -		}  		if (new->so_cred < sp->so_cred)  			p = &parent->rb_left;  		else if (new->so_cred > sp->so_cred)  			p = &parent->rb_right;  		else { +			if (!list_empty(&sp->so_lru)) +				list_del_init(&sp->so_lru);  			atomic_inc(&sp->so_count);  			return sp;  		}  	} -	nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); -	rb_link_node(&new->so_client_node, parent, p); -	rb_insert_color(&new->so_client_node, &clp->cl_state_owners); +	err = 
ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); +	if (err) +		return ERR_PTR(err); +	rb_link_node(&new->so_server_node, parent, p); +	rb_insert_color(&new->so_server_node, &server->state_owners);  	return new;  }  static void -nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) +nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) +{ +	struct nfs_server *server = sp->so_server; + +	if (!RB_EMPTY_NODE(&sp->so_server_node)) +		rb_erase(&sp->so_server_node, &server->state_owners); +	ida_remove(&server->openowner_id, sp->so_seqid.owner_id); +} + +static void +nfs4_init_seqid_counter(struct nfs_seqid_counter *sc)  { -	if (!RB_EMPTY_NODE(&sp->so_client_node)) -		rb_erase(&sp->so_client_node, &clp->cl_state_owners); -	nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); +	sc->create_time = ktime_get(); +	sc->flags = 0; +	sc->counter = 0; +	spin_lock_init(&sc->lock); +	INIT_LIST_HEAD(&sc->list); +	rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue"); +} + +static void +nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc) +{ +	rpc_destroy_wait_queue(&sc->wait);  }  /* @@ -366,75 +459,162 @@ nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp)   *   */  static struct nfs4_state_owner * -nfs4_alloc_state_owner(void) +nfs4_alloc_state_owner(struct nfs_server *server, +		struct rpc_cred *cred, +		gfp_t gfp_flags)  {  	struct nfs4_state_owner *sp; -	sp = kzalloc(sizeof(*sp),GFP_NOFS); +	sp = kzalloc(sizeof(*sp), gfp_flags);  	if (!sp)  		return NULL; +	sp->so_server = server; +	sp->so_cred = get_rpccred(cred);  	spin_lock_init(&sp->so_lock);  	INIT_LIST_HEAD(&sp->so_states); -	rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); -	sp->so_seqid.sequence = &sp->so_sequence; -	spin_lock_init(&sp->so_sequence.lock); -	INIT_LIST_HEAD(&sp->so_sequence.list); +	nfs4_init_seqid_counter(&sp->so_seqid);  	atomic_set(&sp->so_count, 1); +	INIT_LIST_HEAD(&sp->so_lru); +	
seqcount_init(&sp->so_reclaim_seqcount); +	mutex_init(&sp->so_delegreturn_mutex);  	return sp;  }  static void  nfs4_drop_state_owner(struct nfs4_state_owner *sp)  { -	if (!RB_EMPTY_NODE(&sp->so_client_node)) { -		struct nfs_client *clp = sp->so_server->nfs_client; +	struct rb_node *rb_node = &sp->so_server_node; + +	if (!RB_EMPTY_NODE(rb_node)) { +		struct nfs_server *server = sp->so_server; +		struct nfs_client *clp = server->nfs_client;  		spin_lock(&clp->cl_lock); -		rb_erase(&sp->so_client_node, &clp->cl_state_owners); -		RB_CLEAR_NODE(&sp->so_client_node); +		if (!RB_EMPTY_NODE(rb_node)) { +			rb_erase(rb_node, &server->state_owners); +			RB_CLEAR_NODE(rb_node); +		}  		spin_unlock(&clp->cl_lock);  	}  } -struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) +static void nfs4_free_state_owner(struct nfs4_state_owner *sp) +{ +	nfs4_destroy_seqid_counter(&sp->so_seqid); +	put_rpccred(sp->so_cred); +	kfree(sp); +} + +static void nfs4_gc_state_owners(struct nfs_server *server) +{ +	struct nfs_client *clp = server->nfs_client; +	struct nfs4_state_owner *sp, *tmp; +	unsigned long time_min, time_max; +	LIST_HEAD(doomed); + +	spin_lock(&clp->cl_lock); +	time_max = jiffies; +	time_min = (long)time_max - (long)clp->cl_lease_time; +	list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { +		/* NB: LRU is sorted so that oldest is at the head */ +		if (time_in_range(sp->so_expires, time_min, time_max)) +			break; +		list_move(&sp->so_lru, &doomed); +		nfs4_remove_state_owner_locked(sp); +	} +	spin_unlock(&clp->cl_lock); + +	list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { +		list_del(&sp->so_lru); +		nfs4_free_state_owner(sp); +	} +} + +/** + * nfs4_get_state_owner - Look up a state owner given a credential + * @server: nfs_server to search + * @cred: RPC credential to match + * + * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. 
+ */ +struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, +					      struct rpc_cred *cred, +					      gfp_t gfp_flags)  {  	struct nfs_client *clp = server->nfs_client;  	struct nfs4_state_owner *sp, *new;  	spin_lock(&clp->cl_lock); -	sp = nfs4_find_state_owner(server, cred); +	sp = nfs4_find_state_owner_locked(server, cred);  	spin_unlock(&clp->cl_lock);  	if (sp != NULL) -		return sp; -	new = nfs4_alloc_state_owner(); +		goto out; +	new = nfs4_alloc_state_owner(server, cred, gfp_flags);  	if (new == NULL) -		return NULL; -	new->so_server = server; -	new->so_cred = cred; -	spin_lock(&clp->cl_lock); -	sp = nfs4_insert_state_owner(clp, new); -	spin_unlock(&clp->cl_lock); -	if (sp == new) -		get_rpccred(cred); -	else { -		rpc_destroy_wait_queue(&new->so_sequence.wait); -		kfree(new); -	} +		goto out; +	do { +		if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) +			break; +		spin_lock(&clp->cl_lock); +		sp = nfs4_insert_state_owner_locked(new); +		spin_unlock(&clp->cl_lock); +	} while (sp == ERR_PTR(-EAGAIN)); +	if (sp != new) +		nfs4_free_state_owner(new); +out: +	nfs4_gc_state_owners(server);  	return sp;  } +/** + * nfs4_put_state_owner - Release a nfs4_state_owner + * @sp: state owner data to release + * + * Note that we keep released state owners on an LRU + * list. + * This caches valid state owners so that they can be + * reused, to avoid the OPEN_CONFIRM on minor version 0. + * It also pins the uniquifier of dropped state owners for + * a while, to ensure that those state owner names are + * never reused. 
+ */  void nfs4_put_state_owner(struct nfs4_state_owner *sp)  { -	struct nfs_client *clp = sp->so_server->nfs_client; -	struct rpc_cred *cred = sp->so_cred; +	struct nfs_server *server = sp->so_server; +	struct nfs_client *clp = server->nfs_client;  	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))  		return; -	nfs4_remove_state_owner(clp, sp); + +	sp->so_expires = jiffies; +	list_add_tail(&sp->so_lru, &server->state_owners_lru);  	spin_unlock(&clp->cl_lock); -	rpc_destroy_wait_queue(&sp->so_sequence.wait); -	put_rpccred(cred); -	kfree(sp); +} + +/** + * nfs4_purge_state_owners - Release all cached state owners + * @server: nfs_server with cached state owners to release + * + * Called at umount time.  Remaining state owners will be on + * the LRU with ref count of zero. + */ +void nfs4_purge_state_owners(struct nfs_server *server) +{ +	struct nfs_client *clp = server->nfs_client; +	struct nfs4_state_owner *sp, *tmp; +	LIST_HEAD(doomed); + +	spin_lock(&clp->cl_lock); +	list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { +		list_move(&sp->so_lru, &doomed); +		nfs4_remove_state_owner_locked(sp); +	} +	spin_unlock(&clp->cl_lock); + +	list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { +		list_del(&sp->so_lru); +		nfs4_free_state_owner(sp); +	}  }  static struct nfs4_state * @@ -476,6 +656,8 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)  	list_for_each_entry(state, &nfsi->open_states, inode_states) {  		if (state->owner != owner)  			continue; +		if (!nfs4_valid_open_stateid(state)) +			continue;  		if (atomic_inc_not_zero(&state->count))  			return state;  	} @@ -508,7 +690,8 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)  		state->owner = owner;  		atomic_inc(&owner->so_count);  		list_add(&state->inode_states, &nfsi->open_states); -		state->inode = igrab(inode); +		ihold(inode); +		state->inode = inode;  		spin_unlock(&inode->i_lock);  		/* Note: The reclaim code dictates that 
we add stateless  		 * and read-only stateids to the end of the list */ @@ -544,7 +727,7 @@ void nfs4_put_open_state(struct nfs4_state *state)  /*   * Close the current file.   */ -static void __nfs4_close(struct path *path, struct nfs4_state *state, +static void __nfs4_close(struct nfs4_state *state,  		fmode_t fmode, gfp_t gfp_mask, int wait)  {  	struct nfs4_state_owner *owner = state->owner; @@ -586,17 +769,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state,  		nfs4_put_open_state(state);  		nfs4_put_state_owner(owner);  	} else -		nfs4_do_close(path, state, gfp_mask, wait); +		nfs4_do_close(state, gfp_mask, wait);  } -void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)  { -	__nfs4_close(path, state, fmode, GFP_NOFS, 0); +	__nfs4_close(state, fmode, GFP_NOFS, 0);  } -void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) +void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)  { -	__nfs4_close(path, state, fmode, GFP_KERNEL, 1); +	__nfs4_close(state, fmode, GFP_KERNEL, 1);  }  /* @@ -633,15 +816,12 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p  static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)  {  	struct nfs4_lock_state *lsp; -	struct nfs_client *clp = state->owner->so_server->nfs_client; +	struct nfs_server *server = state->owner->so_server;  	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);  	if (lsp == NULL)  		return NULL; -	rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); -	spin_lock_init(&lsp->ls_sequence.lock); -	INIT_LIST_HEAD(&lsp->ls_sequence.list); -	lsp->ls_seqid.sequence = &lsp->ls_sequence; +	nfs4_init_seqid_counter(&lsp->ls_seqid);  	atomic_set(&lsp->ls_count, 1);  	lsp->ls_state = state;  	lsp->ls_owner.lo_type = type; @@ -653,24 +833,22 @@ static struct 
nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f  		lsp->ls_owner.lo_u.posix_owner = fl_owner;  		break;  	default: -		kfree(lsp); -		return NULL; +		goto out_free;  	} -	spin_lock(&clp->cl_lock); -	nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); -	spin_unlock(&clp->cl_lock); +	lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); +	if (lsp->ls_seqid.owner_id < 0) +		goto out_free;  	INIT_LIST_HEAD(&lsp->ls_locks);  	return lsp; +out_free: +	kfree(lsp); +	return NULL;  } -static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) +void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)  { -	struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; - -	spin_lock(&clp->cl_lock); -	nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); -	spin_unlock(&clp->cl_lock); -	rpc_destroy_wait_queue(&lsp->ls_sequence.wait); +	ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id); +	nfs4_destroy_seqid_counter(&lsp->ls_seqid);  	kfree(lsp);  } @@ -702,7 +880,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_  	}  	spin_unlock(&state->state_lock);  	if (new != NULL) -		nfs4_free_lock_state(new); +		nfs4_free_lock_state(state->owner->so_server, new);  	return lsp;  } @@ -712,6 +890,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_   */  void nfs4_put_lock_state(struct nfs4_lock_state *lsp)  { +	struct nfs_server *server;  	struct nfs4_state *state;  	if (lsp == NULL) @@ -723,9 +902,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)  	if (list_empty(&state->lock_states))  		clear_bit(LK_STATE_IN_USE, &state->flags);  	spin_unlock(&state->state_lock); -	if (lsp->ls_flags & NFS_LOCK_INITIALIZED) -		nfs4_release_lockowner(lsp); -	nfs4_free_lock_state(lsp); +	server = state->owner->so_server; +	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { +		struct nfs_client *clp = server->nfs_client; + 
+		clp->cl_mvops->free_lock_state(server, lsp); +	} else +		nfs4_free_lock_state(server, lsp);  }  static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) @@ -755,7 +938,8 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)  	if (fl->fl_flags & FL_POSIX)  		lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);  	else if (fl->fl_flags & FL_FLOCK) -		lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); +		lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid, +				NFS4_FLOCK_LOCK_TYPE);  	else  		return -EINVAL;  	if (lsp == NULL) @@ -765,28 +949,80 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)  	return 0;  } -/* - * Byte-range lock aware utility to initialize the stateid of read/write - * requests. - */ -void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) +static int nfs4_copy_lock_stateid(nfs4_stateid *dst, +		struct nfs4_state *state, +		const struct nfs_lockowner *lockowner)  {  	struct nfs4_lock_state *lsp; -	int seq; +	fl_owner_t fl_owner; +	pid_t fl_pid; +	int ret = -ENOENT; + + +	if (lockowner == NULL) +		goto out; -	do { -		seq = read_seqbegin(&state->seqlock); -		memcpy(dst, &state->stateid, sizeof(*dst)); -	} while (read_seqretry(&state->seqlock, seq));  	if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) -		return; +		goto out; +	fl_owner = lockowner->l_owner; +	fl_pid = lockowner->l_pid;  	spin_lock(&state->state_lock);  	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); -	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) -		memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); +	if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) +		ret = -EIO; +	else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { +		nfs4_stateid_copy(dst, &lsp->ls_stateid); +		ret = 0; +	}  	spin_unlock(&state->state_lock);  	nfs4_put_lock_state(lsp); +out: +	return ret; +} + 
+static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +{ +	const nfs4_stateid *src; +	int seq; + +	do { +		src = &zero_stateid; +		seq = read_seqbegin(&state->seqlock); +		if (test_bit(NFS_OPEN_STATE, &state->flags)) +			src = &state->open_stateid; +		nfs4_stateid_copy(dst, src); +	} while (read_seqretry(&state->seqlock, seq)); +} + +/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. + */ +int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, +		fmode_t fmode, const struct nfs_lockowner *lockowner) +{ +	int ret = nfs4_copy_lock_stateid(dst, state, lockowner); +	if (ret == -EIO) +		/* A lost lock - don't even consider delegations */ +		goto out; +	/* returns true if delegation stateid found and copied */ +	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { +		ret = 0; +		goto out; +	} +	if (ret != -ENOENT) +		/* nfs4_copy_delegation_stateid() didn't over-write +		 * dst, so it still has the lock stateid which we now +		 * choose to use. 
+		 */ +		goto out; +	nfs4_copy_open_stateid(dst, state); +	ret = 0; +out: +	if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) +		dst->seqid = 0; +	return ret;  }  struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) @@ -797,20 +1033,28 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m  	if (new != NULL) {  		new->sequence = counter;  		INIT_LIST_HEAD(&new->list); +		new->task = NULL;  	}  	return new;  }  void nfs_release_seqid(struct nfs_seqid *seqid)  { -	if (!list_empty(&seqid->list)) { -		struct rpc_sequence *sequence = seqid->sequence->sequence; +	struct nfs_seqid_counter *sequence; + +	if (list_empty(&seqid->list)) +		return; +	sequence = seqid->sequence; +	spin_lock(&sequence->lock); +	list_del_init(&seqid->list); +	if (!list_empty(&sequence->list)) { +		struct nfs_seqid *next; -		spin_lock(&sequence->lock); -		list_del_init(&seqid->list); -		spin_unlock(&sequence->lock); -		rpc_wake_up(&sequence->wait); +		next = list_first_entry(&sequence->list, +				struct nfs_seqid, list); +		rpc_wake_up_queued_task(&sequence->wait, next->task);  	} +	spin_unlock(&sequence->lock);  }  void nfs_free_seqid(struct nfs_seqid *seqid) @@ -822,18 +1066,17 @@ void nfs_free_seqid(struct nfs_seqid *seqid)  /*   * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or   * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error()   */  static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)  { -	BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid);  	switch (status) {  		case 0:  			break;  		case -NFS4ERR_BAD_SEQID:  			if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)  				return; -			printk(KERN_WARNING "NFS: v4 server returned a bad" +			pr_warn_ratelimited("NFS: v4 server returned a bad"  					" sequence-id error on an"  					" unconfirmed sequence %p!\n",  			
		seqid->sequence); @@ -868,7 +1111,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)  /*   * Increment the seqid if the LOCK/LOCKU succeeded, or   * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error()   */  void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)  { @@ -877,10 +1120,11 @@ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)  int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)  { -	struct rpc_sequence *sequence = seqid->sequence->sequence; +	struct nfs_seqid_counter *sequence = seqid->sequence;  	int status = 0;  	spin_lock(&sequence->lock); +	seqid->task = task;  	if (list_empty(&seqid->list))  		list_add_tail(&seqid->list, &sequence->list);  	if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) @@ -896,9 +1140,9 @@ static int nfs4_run_state_manager(void *);  static void nfs4_clear_state_manager_bit(struct nfs_client *clp)  { -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);  	rpc_wake_up(&clp->cl_rpcwaitq);  } @@ -909,34 +1153,156 @@ static void nfs4_clear_state_manager_bit(struct nfs_client *clp)  void nfs4_schedule_state_manager(struct nfs_client *clp)  {  	struct task_struct *task; +	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];  	if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)  		return;  	__module_get(THIS_MODULE);  	atomic_inc(&clp->cl_count); -	task = kthread_run(nfs4_run_state_manager, clp, "%s-manager", -				rpc_peeraddr2str(clp->cl_rpcclient, -							RPC_DISPLAY_ADDR)); -	if (!IS_ERR(task)) -		return; -	nfs4_clear_state_manager_bit(clp); -	nfs_put_client(clp); -	module_put(THIS_MODULE); + +	/* The rcu_read_lock() is not strictly necessary, as the state +	 * manager is 
the only thread that ever changes the rpc_xprt +	 * after it's initialized.  At this point, we're single threaded. */ +	rcu_read_lock(); +	snprintf(buf, sizeof(buf), "%s-manager", +			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); +	rcu_read_unlock(); +	task = kthread_run(nfs4_run_state_manager, clp, "%s", buf); +	if (IS_ERR(task)) { +		printk(KERN_ERR "%s: kthread_run: %ld\n", +			__func__, PTR_ERR(task)); +		nfs4_clear_state_manager_bit(clp); +		nfs_put_client(clp); +		module_put(THIS_MODULE); +	}  }  /* - * Schedule a state recovery attempt + * Schedule a lease recovery attempt   */ -void nfs4_schedule_state_recovery(struct nfs_client *clp) +void nfs4_schedule_lease_recovery(struct nfs_client *clp)  {  	if (!clp)  		return;  	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))  		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); +	dprintk("%s: scheduling lease recovery for server %s\n", __func__, +			clp->cl_hostname);  	nfs4_schedule_state_manager(clp);  } +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); -int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +/** + * nfs4_schedule_migration_recovery - trigger migration recovery + * + * @server: FSID that is migrating + * + * Returns zero if recovery has started, otherwise a negative NFS4ERR + * value is returned. 
+ */ +int nfs4_schedule_migration_recovery(const struct nfs_server *server) +{ +	struct nfs_client *clp = server->nfs_client; + +	if (server->fh_expire_type != NFS4_FH_PERSISTENT) { +		pr_err("NFS: volatile file handles not supported (server %s)\n", +				clp->cl_hostname); +		return -NFS4ERR_IO; +	} + +	if (test_bit(NFS_MIG_FAILED, &server->mig_status)) +		return -NFS4ERR_IO; + +	dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n", +			__func__, +			(unsigned long long)server->fsid.major, +			(unsigned long long)server->fsid.minor, +			clp->cl_hostname); + +	set_bit(NFS_MIG_IN_TRANSITION, +			&((struct nfs_server *)server)->mig_status); +	set_bit(NFS4CLNT_MOVED, &clp->cl_state); + +	nfs4_schedule_state_manager(clp); +	return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery); + +/** + * nfs4_schedule_lease_moved_recovery - start lease-moved recovery + * + * @clp: server to check for moved leases + * + */ +void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp) +{ +	dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n", +		__func__, clp->cl_clientid, clp->cl_hostname); + +	set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state); +	nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery); + +int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ +	int res; + +	might_sleep(); + +	atomic_inc(&clp->cl_count); +	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, +			nfs_wait_bit_killable, TASK_KILLABLE); +	if (res) +		goto out; +	if (clp->cl_cons_state < 0) +		res = clp->cl_cons_state; +out: +	nfs_put_client(clp); +	return res; +} + +int nfs4_client_recover_expired_lease(struct nfs_client *clp) +{ +	unsigned int loop; +	int ret; + +	for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { +		ret = nfs4_wait_clnt_recover(clp); +		if (ret != 0) +			break; +		if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && +		    !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) +			break; +		
nfs4_schedule_state_manager(clp); +		ret = -EIO; +	} +	return ret; +} + +/* + * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * + * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a + * resend of the SETCLIENTID and hence re-establish the + * callback channel. Then return all existing delegations. + */ +static void nfs40_handle_cb_pathdown(struct nfs_client *clp) +{ +	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); +	nfs_expire_all_delegations(clp); +	dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__, +			clp->cl_hostname); +} + +void nfs4_schedule_path_down_recovery(struct nfs_client *clp) +{ +	nfs40_handle_cb_pathdown(clp); +	nfs4_schedule_state_manager(clp); +} + +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)  {  	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -959,6 +1325,68 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s  	return 1;  } +int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ +	struct nfs_client *clp = server->nfs_client; + +	if (!nfs4_valid_open_stateid(state)) +		return -EBADF; +	nfs4_state_mark_reclaim_nograce(clp, state); +	dprintk("%s: scheduling stateid recovery for server %s\n", __func__, +			clp->cl_hostname); +	nfs4_schedule_state_manager(clp); +	return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); + +void nfs_inode_find_state_and_recover(struct inode *inode, +		const nfs4_stateid *stateid) +{ +	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; +	struct nfs_inode *nfsi = NFS_I(inode); +	struct nfs_open_context *ctx; +	struct nfs4_state *state; +	bool found = false; + +	spin_lock(&inode->i_lock); +	list_for_each_entry(ctx, &nfsi->open_files, list) { +		state = ctx->state; +		if (state == NULL) +			continue; +		if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) +			continue; +		if 
(!nfs4_stateid_match(&state->stateid, stateid)) +			continue; +		nfs4_state_mark_reclaim_nograce(clp, state); +		found = true; +	} +	spin_unlock(&inode->i_lock); +	if (found) +		nfs4_schedule_state_manager(clp); +} + +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) +{ +	struct inode *inode = state->inode; +	struct nfs_inode *nfsi = NFS_I(inode); +	struct nfs_open_context *ctx; + +	spin_lock(&inode->i_lock); +	list_for_each_entry(ctx, &nfsi->open_files, list) { +		if (ctx->state != state) +			continue; +		set_bit(NFS_CONTEXT_BAD, &ctx->flags); +	} +	spin_unlock(&inode->i_lock); +} + +static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) +{ +	set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); +	nfs4_state_mark_open_context_bad(state); +} + +  static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)  {  	struct inode *inode = state->inode; @@ -972,13 +1400,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_  	/* Guard against delegation returns and new lock/unlock calls */  	down_write(&nfsi->rwsem);  	/* Protect inode->i_flock using the BKL */ -	lock_flocks(); +	spin_lock(&inode->i_lock);  	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {  		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))  			continue;  		if (nfs_file_open_context(fl->fl_file)->state != state)  			continue; -		unlock_flocks(); +		spin_unlock(&inode->i_lock);  		status = ops->recover_lock(state, fl);  		switch (status) {  			case 0: @@ -996,8 +1424,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_  			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:  				goto out;  			default: -				printk(KERN_ERR "%s: unhandled error %d. 
Zeroing state\n", -						__func__, status); +				printk(KERN_ERR "NFS: %s: unhandled error %d\n", +					 __func__, status);  			case -ENOMEM:  			case -NFS4ERR_DENIED:  			case -NFS4ERR_RECLAIM_BAD: @@ -1005,9 +1433,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_  				/* kill_proc(fl->fl_pid, SIGLOST, 1); */  				status = 0;  		} -		lock_flocks(); +		spin_lock(&inode->i_lock);  	} -	unlock_flocks(); +	spin_unlock(&inode->i_lock);  out:  	up_write(&nfsi->rwsem);  	return status; @@ -1027,11 +1455,14 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs  	 * recovering after a network partition or a reboot from a  	 * server that doesn't support a grace period.  	 */ -restart:  	spin_lock(&sp->so_lock); +	raw_write_seqcount_begin(&sp->so_reclaim_seqcount); +restart:  	list_for_each_entry(state, &sp->so_states, open_states) {  		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))  			continue; +		if (!nfs4_valid_open_stateid(state)) +			continue;  		if (state->state == 0)  			continue;  		atomic_inc(&state->count); @@ -1040,39 +1471,33 @@ restart:  		if (status >= 0) {  			status = nfs4_reclaim_locks(state, ops);  			if (status >= 0) { -				list_for_each_entry(lock, &state->lock_states, ls_locks) { -					if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) -						printk("%s: Lock reclaim failed!\n", -							__func__); +				if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { +					spin_lock(&state->state_lock); +					list_for_each_entry(lock, &state->lock_states, ls_locks) { +						if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) +							pr_warn_ratelimited("NFS: " +									    "%s: Lock reclaim " +									    "failed!\n", __func__); +					} +					spin_unlock(&state->state_lock);  				}  				nfs4_put_open_state(state); +				spin_lock(&sp->so_lock);  				goto restart;  			}  		}  		switch (status) {  			default: -				printk(KERN_ERR "%s: unhandled error %d. 
Zeroing state\n", -						__func__, status); +				printk(KERN_ERR "NFS: %s: unhandled error %d\n", +					__func__, status);  			case -ENOENT:  			case -ENOMEM:  			case -ESTALE: -				/* -				 * Open state on this file cannot be recovered -				 * All we can do is revert to using the zero stateid. -				 */ -				memset(state->stateid.data, 0, -					sizeof(state->stateid.data)); -				/* Mark the file as being 'closed' */ -				state->state = 0; -				break; -			case -EKEYEXPIRED: -				/* -				 * User RPCSEC_GSS context has expired. -				 * We cannot recover this stateid now, so -				 * skip it and allow recovery thread to -				 * proceed. -				 */ +				/* Open state on this file cannot be recovered */ +				nfs4_state_mark_recovery_failed(state, status);  				break; +			case -EAGAIN: +				ssleep(1);  			case -NFS4ERR_ADMIN_REVOKED:  			case -NFS4ERR_STALE_STATEID:  			case -NFS4ERR_BAD_STATEID: @@ -1091,12 +1516,17 @@ restart:  				goto out_err;  		}  		nfs4_put_open_state(state); +		spin_lock(&sp->so_lock);  		goto restart;  	} +	raw_write_seqcount_end(&sp->so_reclaim_seqcount);  	spin_unlock(&sp->so_lock);  	return 0;  out_err:  	nfs4_put_open_state(state); +	spin_lock(&sp->so_lock); +	raw_write_seqcount_end(&sp->so_reclaim_seqcount); +	spin_unlock(&sp->so_lock);  	return status;  } @@ -1108,21 +1538,27 @@ static void nfs4_clear_open_state(struct nfs4_state *state)  	clear_bit(NFS_O_RDONLY_STATE, &state->flags);  	clear_bit(NFS_O_WRONLY_STATE, &state->flags);  	clear_bit(NFS_O_RDWR_STATE, &state->flags); +	spin_lock(&state->state_lock);  	list_for_each_entry(lock, &state->lock_states, ls_locks) {  		lock->ls_seqid.flags = 0; -		lock->ls_flags &= ~NFS_LOCK_INITIALIZED; +		clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags);  	} +	spin_unlock(&state->state_lock);  } -static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +static void nfs4_reset_seqids(struct nfs_server *server, +	int 
(*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))  { +	struct nfs_client *clp = server->nfs_client;  	struct nfs4_state_owner *sp;  	struct rb_node *pos;  	struct nfs4_state *state; -	/* Reset all sequence ids to zero */ -	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { -		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); +	spin_lock(&clp->cl_lock); +	for (pos = rb_first(&server->state_owners); +	     pos != NULL; +	     pos = rb_next(pos)) { +		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);  		sp->so_seqid.flags = 0;  		spin_lock(&sp->so_lock);  		list_for_each_entry(state, &sp->so_states, open_states) { @@ -1131,6 +1567,18 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_re  		}  		spin_unlock(&sp->so_lock);  	} +	spin_unlock(&clp->cl_lock); +} + +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, +	int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ +	struct nfs_server *server; + +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) +		nfs4_reset_seqids(server, mark_reclaim); +	rcu_read_unlock();  }  static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) @@ -1141,32 +1589,49 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)  }  static void nfs4_reclaim_complete(struct nfs_client *clp, -				 const struct nfs4_state_recovery_ops *ops) +				 const struct nfs4_state_recovery_ops *ops, +				 struct rpc_cred *cred)  {  	/* Notify the server we're done reclaiming our state */  	if (ops->reclaim_complete) -		(void)ops->reclaim_complete(clp); +		(void)ops->reclaim_complete(clp, cred);  } -static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +static void nfs4_clear_reclaim_server(struct nfs_server *server)  { +	struct nfs_client *clp = server->nfs_client;  	struct nfs4_state_owner *sp;  	struct rb_node *pos;  	struct nfs4_state *state; -	if 
(!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) -		return 0; - -	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { -		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); +	spin_lock(&clp->cl_lock); +	for (pos = rb_first(&server->state_owners); +	     pos != NULL; +	     pos = rb_next(pos)) { +		sp = rb_entry(pos, struct nfs4_state_owner, so_server_node);  		spin_lock(&sp->so_lock);  		list_for_each_entry(state, &sp->so_states, open_states) { -			if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) +			if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, +						&state->flags))  				continue;  			nfs4_state_mark_reclaim_nograce(clp, state);  		}  		spin_unlock(&sp->so_lock);  	} +	spin_unlock(&clp->cl_lock); +} + +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +{ +	struct nfs_server *server; + +	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) +		return 0; + +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) +		nfs4_clear_reclaim_server(server); +	rcu_read_unlock();  	nfs_delegation_reap_unclaimed(clp);  	return 1; @@ -1174,9 +1639,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)  static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)  { +	const struct nfs4_state_recovery_ops *ops; +	struct rpc_cred *cred; +  	if (!nfs4_state_clear_reclaim_reboot(clp))  		return; -	nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); +	ops = clp->cl_mvops->reboot_recovery_ops; +	cred = nfs4_get_clid_cred(clp); +	nfs4_reclaim_complete(clp, ops, cred); +	put_rpccred(cred);  }  static void nfs_delegation_clear_all(struct nfs_client *clp) @@ -1191,25 +1662,18 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)  	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);  } -static void nfs4_warn_keyexpired(const char *s) -{ -	printk_ratelimited(KERN_WARNING "Error: state 
manager" -			" encountered RPCSEC_GSS session" -			" expired against NFSv4 server %s.\n", -			s); -} -  static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)  {  	switch (error) { +		case 0: +			break;  		case -NFS4ERR_CB_PATH_DOWN: -			nfs_handle_cb_pathdown(clp); -			return 0; +			nfs40_handle_cb_pathdown(clp); +			break;  		case -NFS4ERR_NO_GRACE:  			nfs4_state_end_reclaim_reboot(clp); -			return 0; +			break;  		case -NFS4ERR_STALE_CLIENTID: -		case -NFS4ERR_LEASE_MOVED:  			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);  			nfs4_state_clear_reclaim_reboot(clp);  			nfs4_state_start_reclaim_reboot(clp); @@ -1222,43 +1686,61 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)  		case -NFS4ERR_BADSLOT:  		case -NFS4ERR_BAD_HIGH_SLOT:  		case -NFS4ERR_DEADSESSION: -		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:  		case -NFS4ERR_SEQ_FALSE_RETRY:  		case -NFS4ERR_SEQ_MISORDERED:  			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);  			/* Zero session reset errors */ -			return 0; -		case -EKEYEXPIRED: -			/* Nothing we can do */ -			nfs4_warn_keyexpired(clp->cl_hostname); -			return 0; +			break; +		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: +			set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); +			break; +		default: +			dprintk("%s: failed to handle error %d for server %s\n", +					__func__, error, clp->cl_hostname); +			return error;  	} -	return error; +	dprintk("%s: handled error %d for server %s\n", __func__, error, +			clp->cl_hostname); +	return 0;  }  static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)  { +	struct nfs4_state_owner *sp; +	struct nfs_server *server;  	struct rb_node *pos;  	int status = 0;  restart: -	spin_lock(&clp->cl_lock); -	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { -		struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); -		if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) 
-			continue; -		atomic_inc(&sp->so_count); -		spin_unlock(&clp->cl_lock); -		status = nfs4_reclaim_open_state(sp, ops); -		if (status < 0) { -			set_bit(ops->owner_flag_bit, &sp->so_flags); +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		nfs4_purge_state_owners(server); +		spin_lock(&clp->cl_lock); +		for (pos = rb_first(&server->state_owners); +		     pos != NULL; +		     pos = rb_next(pos)) { +			sp = rb_entry(pos, +				struct nfs4_state_owner, so_server_node); +			if (!test_and_clear_bit(ops->owner_flag_bit, +							&sp->so_flags)) +				continue; +			atomic_inc(&sp->so_count); +			spin_unlock(&clp->cl_lock); +			rcu_read_unlock(); + +			status = nfs4_reclaim_open_state(sp, ops); +			if (status < 0) { +				set_bit(ops->owner_flag_bit, &sp->so_flags); +				nfs4_put_state_owner(sp); +				return nfs4_recovery_handle_error(clp, status); +			} +  			nfs4_put_state_owner(sp); -			return nfs4_recovery_handle_error(clp, status); +			goto restart;  		} -		nfs4_put_state_owner(sp); -		goto restart; +		spin_unlock(&clp->cl_lock);  	} -	spin_unlock(&clp->cl_lock); +	rcu_read_unlock();  	return status;  } @@ -1267,7 +1749,7 @@ static int nfs4_check_lease(struct nfs_client *clp)  	struct rpc_cred *cred;  	const struct nfs4_state_maintenance_ops *ops =  		clp->cl_mvops->state_renewal_ops; -	int status = -NFS4ERR_EXPIRED; +	int status;  	/* Is the client already known to have an expired lease? 
*/  	if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) @@ -1276,51 +1758,424 @@ static int nfs4_check_lease(struct nfs_client *clp)  	cred = ops->get_state_renewal_cred_locked(clp);  	spin_unlock(&clp->cl_lock);  	if (cred == NULL) { -		cred = nfs4_get_setclientid_cred(clp); +		cred = nfs4_get_clid_cred(clp); +		status = -ENOKEY;  		if (cred == NULL)  			goto out;  	}  	status = ops->renew_lease(clp, cred);  	put_rpccred(cred); +	if (status == -ETIMEDOUT) { +		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); +		return 0; +	}  out:  	return nfs4_recovery_handle_error(clp, status);  } -static int nfs4_reclaim_lease(struct nfs_client *clp) +/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors + * and for recoverable errors on EXCHANGE_ID for v4.1 + */ +static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) +{ +	switch (status) { +	case -NFS4ERR_SEQ_MISORDERED: +		if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) +			return -ESERVERFAULT; +		/* Lease confirmation error: retry after purging the lease */ +		ssleep(1); +		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +		break; +	case -NFS4ERR_STALE_CLIENTID: +		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +		nfs4_state_clear_reclaim_reboot(clp); +		nfs4_state_start_reclaim_reboot(clp); +		break; +	case -NFS4ERR_CLID_INUSE: +		pr_err("NFS: Server %s reports our clientid is in use\n", +			clp->cl_hostname); +		nfs_mark_client_ready(clp, -EPERM); +		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +		return -EPERM; +	case -EACCES: +	case -NFS4ERR_DELAY: +	case -ETIMEDOUT: +	case -EAGAIN: +		ssleep(1); +		break; + +	case -NFS4ERR_MINOR_VERS_MISMATCH: +		if (clp->cl_cons_state == NFS_CS_SESSION_INITING) +			nfs_mark_client_ready(clp, -EPROTONOSUPPORT); +		dprintk("%s: exit with error %d for server %s\n", +				__func__, -EPROTONOSUPPORT, clp->cl_hostname); +		return -EPROTONOSUPPORT; +	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery +				 * in 
nfs4_exchange_id */ +	default: +		dprintk("%s: exit with error %d for server %s\n", __func__, +				status, clp->cl_hostname); +		return status; +	} +	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); +	dprintk("%s: handled error %d for server %s\n", __func__, status, +			clp->cl_hostname); +	return 0; +} + +static int nfs4_establish_lease(struct nfs_client *clp)  {  	struct rpc_cred *cred;  	const struct nfs4_state_recovery_ops *ops =  		clp->cl_mvops->reboot_recovery_ops; -	int status = -ENOENT; +	int status; -	cred = ops->get_clid_cred(clp); -	if (cred != NULL) { -		status = ops->establish_clid(clp, cred); -		put_rpccred(cred); -		/* Handle case where the user hasn't set up machine creds */ -		if (status == -EACCES && cred == clp->cl_machine_cred) { -			nfs4_clear_machine_cred(clp); -			status = -EAGAIN; +	cred = nfs4_get_clid_cred(clp); +	if (cred == NULL) +		return -ENOENT; +	status = ops->establish_clid(clp, cred); +	put_rpccred(cred); +	if (status != 0) +		return status; +	pnfs_destroy_all_layouts(clp); +	return 0; +} + +/* + * Returns zero or a negative errno.  NFS4ERR values are converted + * to local errno values. 
+ */ +static int nfs4_reclaim_lease(struct nfs_client *clp) +{ +	int status; + +	status = nfs4_establish_lease(clp); +	if (status < 0) +		return nfs4_handle_reclaim_lease_error(clp, status); +	if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state)) +		nfs4_state_start_reclaim_nograce(clp); +	if (!test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) +		set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); +	clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); +	clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); +	return 0; +} + +static int nfs4_purge_lease(struct nfs_client *clp) +{ +	int status; + +	status = nfs4_establish_lease(clp); +	if (status < 0) +		return nfs4_handle_reclaim_lease_error(clp, status); +	clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); +	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); +	nfs4_state_start_reclaim_nograce(clp); +	return 0; +} + +/* + * Try remote migration of one FSID from a source server to a + * destination server.  The source server provides a list of + * potential destinations. + * + * Returns zero or a negative NFS4ERR status code. 
+ */ +static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) +{ +	struct nfs_client *clp = server->nfs_client; +	struct nfs4_fs_locations *locations = NULL; +	struct inode *inode; +	struct page *page; +	int status, result; + +	dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__, +			(unsigned long long)server->fsid.major, +			(unsigned long long)server->fsid.minor, +			clp->cl_hostname); + +	result = 0; +	page = alloc_page(GFP_KERNEL); +	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); +	if (page == NULL || locations == NULL) { +		dprintk("<-- %s: no memory\n", __func__); +		goto out; +	} + +	inode = server->super->s_root->d_inode; +	result = nfs4_proc_get_locations(inode, locations, page, cred); +	if (result) { +		dprintk("<-- %s: failed to retrieve fs_locations: %d\n", +			__func__, result); +		goto out; +	} + +	result = -NFS4ERR_NXIO; +	if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { +		dprintk("<-- %s: No fs_locations data, migration skipped\n", +			__func__); +		goto out; +	} + +	nfs4_begin_drain_session(clp); + +	status = nfs4_replace_transport(server, locations); +	if (status != 0) { +		dprintk("<-- %s: failed to replace transport: %d\n", +			__func__, status); +		goto out; +	} + +	result = 0; +	dprintk("<-- %s: migration succeeded\n", __func__); + +out: +	if (page != NULL) +		__free_page(page); +	kfree(locations); +	if (result) { +		pr_err("NFS: migration recovery failed (server %s)\n", +				clp->cl_hostname); +		set_bit(NFS_MIG_FAILED, &server->mig_status); +	} +	return result; +} + +/* + * Returns zero or a negative NFS4ERR status code. 
+ */ +static int nfs4_handle_migration(struct nfs_client *clp) +{ +	const struct nfs4_state_maintenance_ops *ops = +				clp->cl_mvops->state_renewal_ops; +	struct nfs_server *server; +	struct rpc_cred *cred; + +	dprintk("%s: migration reported on \"%s\"\n", __func__, +			clp->cl_hostname); + +	spin_lock(&clp->cl_lock); +	cred = ops->get_state_renewal_cred_locked(clp); +	spin_unlock(&clp->cl_lock); +	if (cred == NULL) +		return -NFS4ERR_NOENT; + +	clp->cl_mig_gen++; +restart: +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		int status; + +		if (server->mig_gen == clp->cl_mig_gen) +			continue; +		server->mig_gen = clp->cl_mig_gen; + +		if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION, +						&server->mig_status)) +			continue; + +		rcu_read_unlock(); +		status = nfs4_try_migration(server, cred); +		if (status < 0) { +			put_rpccred(cred); +			return status;  		} -		if (status == -NFS4ERR_MINOR_VERS_MISMATCH) -			status = -EPROTONOSUPPORT; +		goto restart; +	} +	rcu_read_unlock(); +	put_rpccred(cred); +	return 0; +} + +/* + * Test each nfs_server on the clp's cl_superblocks list to see + * if it's moved to another server.  Stop when the server no longer + * returns NFS4ERR_LEASE_MOVED. 
+ */ +static int nfs4_handle_lease_moved(struct nfs_client *clp) +{ +	const struct nfs4_state_maintenance_ops *ops = +				clp->cl_mvops->state_renewal_ops; +	struct nfs_server *server; +	struct rpc_cred *cred; + +	dprintk("%s: lease moved reported on \"%s\"\n", __func__, +			clp->cl_hostname); + +	spin_lock(&clp->cl_lock); +	cred = ops->get_state_renewal_cred_locked(clp); +	spin_unlock(&clp->cl_lock); +	if (cred == NULL) +		return -NFS4ERR_NOENT; + +	clp->cl_mig_gen++; +restart: +	rcu_read_lock(); +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		struct inode *inode; +		int status; + +		if (server->mig_gen == clp->cl_mig_gen) +			continue; +		server->mig_gen = clp->cl_mig_gen; + +		rcu_read_unlock(); + +		inode = server->super->s_root->d_inode; +		status = nfs4_proc_fsid_present(inode, cred); +		if (status != -NFS4ERR_MOVED) +			goto restart;	/* wasn't this one */ +		if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED) +			goto restart;	/* there are more */ +		goto out;  	} +	rcu_read_unlock(); + +out: +	put_rpccred(cred); +	return 0; +} + +/** + * nfs4_discover_server_trunking - Detect server IP address trunking + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * + * Returns zero or a negative errno.  If zero is returned, + * an nfs_client pointer is planted in "result". + * + * Note: since we are invoked in process context, and + * not from inside the state manager, we cannot use + * nfs4_handle_reclaim_lease_error(). 
+ */ +int nfs4_discover_server_trunking(struct nfs_client *clp, +				  struct nfs_client **result) +{ +	const struct nfs4_state_recovery_ops *ops = +				clp->cl_mvops->reboot_recovery_ops; +	struct rpc_clnt *clnt; +	struct rpc_cred *cred; +	int i, status; + +	dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); + +	clnt = clp->cl_rpcclient; +	i = 0; + +	mutex_lock(&nfs_clid_init_mutex); +again: +	status  = -ENOENT; +	cred = nfs4_get_clid_cred(clp); +	if (cred == NULL) +		goto out_unlock; + +	status = ops->detect_trunking(clp, result, cred); +	put_rpccred(cred); +	switch (status) { +	case 0: +		break; +	case -ETIMEDOUT: +		if (clnt->cl_softrtry) +			break; +	case -NFS4ERR_DELAY: +	case -EAGAIN: +		ssleep(1); +	case -NFS4ERR_STALE_CLIENTID: +		dprintk("NFS: %s after status %d, retrying\n", +			__func__, status); +		goto again; +	case -EACCES: +		if (i++ == 0) { +			nfs4_root_machine_cred(clp); +			goto again; +		} +		if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) +			break; +	case -NFS4ERR_CLID_INUSE: +	case -NFS4ERR_WRONGSEC: +		/* No point in retrying if we already used RPC_AUTH_UNIX */ +		if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) { +			status = -EPERM; +			break; +		} +		clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX); +		if (IS_ERR(clnt)) { +			status = PTR_ERR(clnt); +			break; +		} +		/* Note: this is safe because we haven't yet marked the +		 * client as ready, so we are the only user of +		 * clp->cl_rpcclient +		 */ +		clnt = xchg(&clp->cl_rpcclient, clnt); +		rpc_shutdown_client(clnt); +		clnt = clp->cl_rpcclient; +		goto again; + +	case -NFS4ERR_MINOR_VERS_MISMATCH: +		status = -EPROTONOSUPPORT; +		break; + +	case -EKEYEXPIRED: +	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery +				 * in nfs4_exchange_id */ +		status = -EKEYEXPIRED; +		break; +	default: +		pr_warn("NFS: %s unhandled error %d. 
Exiting with error EIO\n", +				__func__, status); +		status = -EIO; +	} + +out_unlock: +	mutex_unlock(&nfs_clid_init_mutex); +	dprintk("NFS: %s: status = %d\n", __func__, status);  	return status;  }  #ifdef CONFIG_NFS_V4_1 -void nfs41_handle_recall_slot(struct nfs_client *clp) +void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) +{ +	struct nfs_client *clp = session->clp; + +	switch (err) { +	default: +		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); +		break; +	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: +		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); +	} +	nfs4_schedule_lease_recovery(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); + +static void nfs41_ping_server(struct nfs_client *clp) +{ +	/* Use CHECK_LEASE to ping the server with a SEQUENCE */ +	set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); +	nfs4_schedule_state_manager(clp); +} + +void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)  { -	set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); -	nfs4_schedule_state_recovery(clp); +	nfs41_ping_server(clp); +} + +void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) +{ +	nfs41_ping_server(clp);  }  static void nfs4_reset_all_state(struct nfs_client *clp)  {  	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { -		clp->cl_boot_time = CURRENT_TIME; +		set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); +		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);  		nfs4_state_start_reclaim_nograce(clp); -		nfs4_schedule_state_recovery(clp); +		dprintk("%s: scheduling reset of all state for server %s!\n", +				__func__, clp->cl_hostname); +		nfs4_schedule_state_manager(clp);  	}  } @@ -1328,168 +2183,173 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)  {  	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {  		nfs4_state_start_reclaim_reboot(clp); -		nfs4_schedule_state_recovery(clp); +		dprintk("%s: server %s rebooted!\n", __func__, +				
clp->cl_hostname); +		nfs4_schedule_state_manager(clp);  	}  }  static void nfs41_handle_state_revoked(struct nfs_client *clp)  { -	/* Temporary */  	nfs4_reset_all_state(clp); +	dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);  }  static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)  {  	/* This will need to handle layouts too */  	nfs_expire_all_delegations(clp); +	dprintk("%s: Recallable state revoked on server %s!\n", __func__, +			clp->cl_hostname);  } -static void nfs41_handle_cb_path_down(struct nfs_client *clp) +static void nfs41_handle_backchannel_fault(struct nfs_client *clp)  {  	nfs_expire_all_delegations(clp);  	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) -		nfs4_schedule_state_recovery(clp); +		nfs4_schedule_state_manager(clp); +	dprintk("%s: server %s declared a backchannel fault\n", __func__, +			clp->cl_hostname); +} + +static void nfs41_handle_cb_path_down(struct nfs_client *clp) +{ +	if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, +		&clp->cl_state) == 0) +		nfs4_schedule_state_manager(clp);  }  void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)  {  	if (!flags)  		return; -	else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + +	dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n", +		__func__, clp->cl_hostname, clp->cl_clientid, flags); + +	if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)  		nfs41_handle_server_reboot(clp); -	else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | +	if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |  			    SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | -			    SEQ4_STATUS_ADMIN_STATE_REVOKED | -			    SEQ4_STATUS_LEASE_MOVED)) +			    SEQ4_STATUS_ADMIN_STATE_REVOKED))  		nfs41_handle_state_revoked(clp); -	else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) +	if (flags & SEQ4_STATUS_LEASE_MOVED) +		nfs4_schedule_lease_moved_recovery(clp); +	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)  		
nfs41_handle_recallable_state_revoked(clp); +	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) +		nfs41_handle_backchannel_fault(clp);  	else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | -			    SEQ4_STATUS_BACKCHANNEL_FAULT | -			    SEQ4_STATUS_CB_PATH_DOWN_SESSION)) +				SEQ4_STATUS_CB_PATH_DOWN_SESSION))  		nfs41_handle_cb_path_down(clp);  }  static int nfs4_reset_session(struct nfs_client *clp)  { +	struct rpc_cred *cred;  	int status; +	if (!nfs4_has_session(clp)) +		return 0;  	nfs4_begin_drain_session(clp); -	status = nfs4_proc_destroy_session(clp->cl_session); -	if (status && status != -NFS4ERR_BADSESSION && -	    status != -NFS4ERR_DEADSESSION) { +	cred = nfs4_get_clid_cred(clp); +	status = nfs4_proc_destroy_session(clp->cl_session, cred); +	switch (status) { +	case 0: +	case -NFS4ERR_BADSESSION: +	case -NFS4ERR_DEADSESSION: +		break; +	case -NFS4ERR_BACK_CHAN_BUSY: +	case -NFS4ERR_DELAY: +		set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); +		status = 0; +		ssleep(1); +		goto out; +	default:  		status = nfs4_recovery_handle_error(clp, status);  		goto out;  	}  	memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); -	status = nfs4_proc_create_session(clp); +	status = nfs4_proc_create_session(clp, cred);  	if (status) { -		status = nfs4_recovery_handle_error(clp, status); +		dprintk("%s: session reset failed with status %d for server %s!\n", +			__func__, status, clp->cl_hostname); +		status = nfs4_handle_reclaim_lease_error(clp, status);  		goto out;  	} -	/* create_session negotiated new slot table */ -	clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - -	 /* Let the state manager reestablish state */ -	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) -		nfs41_setup_state_renewal(clp); +	nfs41_finish_session_reset(clp); +	dprintk("%s: session reset was successful for server %s!\n", +			__func__, clp->cl_hostname);  out: +	if (cred) +		put_rpccred(cred);  	return status;  } -static int nfs4_recall_slot(struct nfs_client *clp) +static int 
nfs4_bind_conn_to_session(struct nfs_client *clp)  { -	struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table; -	struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs; -	struct nfs4_slot *new, *old; -	int i; +	struct rpc_cred *cred; +	int ret; +	if (!nfs4_has_session(clp)) +		return 0;  	nfs4_begin_drain_session(clp); -	new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), -		      GFP_NOFS); -        if (!new) -		return -ENOMEM; - -	spin_lock(&fc_tbl->slot_tbl_lock); -	for (i = 0; i < fc_tbl->target_max_slots; i++) -		new[i].seq_nr = fc_tbl->slots[i].seq_nr; -	old = fc_tbl->slots; -	fc_tbl->slots = new; -	fc_tbl->max_slots = fc_tbl->target_max_slots; -	fc_tbl->target_max_slots = 0; -	fc_attrs->max_reqs = fc_tbl->max_slots; -	spin_unlock(&fc_tbl->slot_tbl_lock); - -	kfree(old); -	nfs4_end_drain_session(clp); +	cred = nfs4_get_clid_cred(clp); +	ret = nfs4_proc_bind_conn_to_session(clp, cred); +	if (cred) +		put_rpccred(cred); +	clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); +	switch (ret) { +	case 0: +		dprintk("%s: bind_conn_to_session was successful for server %s!\n", +			__func__, clp->cl_hostname); +		break; +	case -NFS4ERR_DELAY: +		ssleep(1); +		set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); +		break; +	default: +		return nfs4_recovery_handle_error(clp, ret); +	}  	return 0;  } -  #else /* CONFIG_NFS_V4_1 */  static int nfs4_reset_session(struct nfs_client *clp) { return 0; } -static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } -static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } -#endif /* CONFIG_NFS_V4_1 */ -/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors - * on EXCHANGE_ID for v4.1 - */ -static void nfs4_set_lease_expired(struct nfs_client *clp, int status) +static int nfs4_bind_conn_to_session(struct nfs_client *clp)  { -	if (nfs4_has_session(clp)) { -		switch (status) { -		case -NFS4ERR_DELAY: -		case -NFS4ERR_CLID_INUSE: -		case -EAGAIN: -	
		break; - -		case -EKEYEXPIRED: -			nfs4_warn_keyexpired(clp->cl_hostname); -		case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery -					 * in nfs4_exchange_id */ -		default: -			return; -		} -	} -	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); +	return 0;  } +#endif /* CONFIG_NFS_V4_1 */  static void nfs4_state_manager(struct nfs_client *clp)  {  	int status = 0; +	const char *section = "", *section_sep = "";  	/* Ensure exclusive access to NFSv4 state */ -	for(;;) { -		if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { -			/* We're going to have to re-establish a clientid */ -			status = nfs4_reclaim_lease(clp); -			if (status) { -				nfs4_set_lease_expired(clp, status); -				if (test_bit(NFS4CLNT_LEASE_EXPIRED, -							&clp->cl_state)) -					continue; -				if (clp->cl_cons_state == -							NFS_CS_SESSION_INITING) -					nfs_mark_client_ready(clp, status); +	do { +		if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { +			section = "purge state"; +			status = nfs4_purge_lease(clp); +			if (status < 0)  				goto out_error; -			} -			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); -			set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); -			pnfs_destroy_all_layouts(clp); +			continue;  		} -		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { -			status = nfs4_check_lease(clp); -			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) -				continue; -			if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN) +		if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { +			section = "lease expired"; +			/* We're going to have to re-establish a clientid */ +			status = nfs4_reclaim_lease(clp); +			if (status < 0)  				goto out_error; +			continue;  		}  		/* Initialize or reset the session */ -		if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) -		   && nfs4_has_session(clp)) { +		if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { +			section = "reset session";  			status = nfs4_reset_session(clp);  			if 
(test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))  				continue; @@ -1497,8 +2357,40 @@ static void nfs4_state_manager(struct nfs_client *clp)  				goto out_error;  		} +		/* Send BIND_CONN_TO_SESSION */ +		if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, +				&clp->cl_state)) { +			section = "bind conn to session"; +			status = nfs4_bind_conn_to_session(clp); +			if (status < 0) +				goto out_error; +			continue; +		} + +		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { +			section = "check lease"; +			status = nfs4_check_lease(clp); +			if (status < 0) +				goto out_error; +		} + +		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { +			section = "migration"; +			status = nfs4_handle_migration(clp); +			if (status < 0) +				goto out_error; +		} + +		if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) { +			section = "lease moved"; +			status = nfs4_handle_lease_moved(clp); +			if (status < 0) +				goto out_error; +		} +  		/* First recover reboot state... */  		if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { +			section = "reclaim reboot";  			status = nfs4_do_reclaim(clp,  				clp->cl_mvops->reboot_recovery_ops);  			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || @@ -1513,6 +2405,7 @@ static void nfs4_state_manager(struct nfs_client *clp)  		/* Now recover expired state... 
*/  		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { +			section = "reclaim nograce";  			status = nfs4_do_reclaim(clp,  				clp->cl_mvops->nograce_recovery_ops);  			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || @@ -1528,15 +2421,6 @@ static void nfs4_state_manager(struct nfs_client *clp)  			nfs_client_return_marked_delegations(clp);  			continue;  		} -		/* Recall session slots */ -		if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state) -		   && nfs4_has_session(clp)) { -			status = nfs4_recall_slot(clp); -			if (status < 0) -				goto out_error; -			continue; -		} -  		nfs4_clear_state_manager_bit(clp);  		/* Did we race with an attempt to give us more work? */ @@ -1544,11 +2428,15 @@ static void nfs4_state_manager(struct nfs_client *clp)  			break;  		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)  			break; -	} +	} while (atomic_read(&clp->cl_count) > 1);  	return;  out_error: -	printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" -			" with error %d\n", clp->cl_hostname, -status); +	if (strlen(section)) +		section_sep = ": "; +	pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" +			" with error %d\n", section_sep, section, +			clp->cl_hostname, -status); +	ssleep(1);  	nfs4_end_drain_session(clp);  	nfs4_clear_state_manager_bit(clp);  } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c new file mode 100644 index 00000000000..6f340f02f2b --- /dev/null +++ b/fs/nfs/nfs4super.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2012 Bryan Schumaker <bjschuma@netapp.com> + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs4_mount.h> +#include <linux/nfs_fs.h> +#include "delegation.h" +#include "internal.h" +#include "nfs4_fs.h" +#include "dns_resolve.h" +#include "pnfs.h" +#include "nfs.h" + +#define NFSDBG_FACILITY		NFSDBG_VFS + +static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc); +static void 
nfs4_evict_inode(struct inode *inode); +static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, +	int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, +	int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, +	int flags, const char *dev_name, void *raw_data); + +static struct file_system_type nfs4_remote_fs_type = { +	.owner		= THIS_MODULE, +	.name		= "nfs4", +	.mount		= nfs4_remote_mount, +	.kill_sb	= nfs_kill_super, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_referral_fs_type = { +	.owner		= THIS_MODULE, +	.name		= "nfs4", +	.mount		= nfs4_remote_referral_mount, +	.kill_sb	= nfs_kill_super, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs4_referral_fs_type = { +	.owner		= THIS_MODULE, +	.name		= "nfs4", +	.mount		= nfs4_referral_mount, +	.kill_sb	= nfs_kill_super, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +static const struct super_operations nfs4_sops = { +	.alloc_inode	= nfs_alloc_inode, +	.destroy_inode	= nfs_destroy_inode, +	.write_inode	= nfs4_write_inode, +	.drop_inode	= nfs_drop_inode, +	.put_super	= nfs_put_super, +	.statfs		= nfs_statfs, +	.evict_inode	= nfs4_evict_inode, +	.umount_begin	= nfs_umount_begin, +	.show_options	= nfs_show_options, +	.show_devname	= nfs_show_devname, +	.show_path	= nfs_show_path, +	.show_stats	= nfs_show_stats, +	.remount_fs	= nfs_remount, +}; + +struct nfs_subversion nfs_v4 = { +	.owner = THIS_MODULE, +	.nfs_fs   = &nfs4_fs_type, +	.rpc_vers = &nfs_version4, +	.rpc_ops  = &nfs_v4_clientops, +	.sops     = &nfs4_sops, +	.xattr    = nfs4_xattr_handlers, +}; + +static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) +{ +	int ret = nfs_write_inode(inode, wbc); + +	if (ret == 0) +		ret = 
pnfs_layoutcommit_inode(inode, +				wbc->sync_mode == WB_SYNC_ALL); +	return ret; +} + +/* + * Clean out any remaining NFSv4 state that might be left over due + * to open() calls that passed nfs_atomic_lookup, but failed to call + * nfs_open(). + */ +static void nfs4_evict_inode(struct inode *inode) +{ +	truncate_inode_pages_final(&inode->i_data); +	clear_inode(inode); +	pnfs_return_layout(inode); +	pnfs_destroy_layout(NFS_I(inode)); +	/* If we are holding a delegation, return it! */ +	nfs_inode_return_delegation_noreclaim(inode); +	/* First call standard NFS clear_inode() code */ +	nfs_clear_inode(inode); +} + +/* + * Get the superblock for the NFS4 root partition + */ +static struct dentry * +nfs4_remote_mount(struct file_system_type *fs_type, int flags, +		  const char *dev_name, void *info) +{ +	struct nfs_mount_info *mount_info = info; +	struct nfs_server *server; +	struct dentry *mntroot = ERR_PTR(-ENOMEM); + +	mount_info->set_security = nfs_set_sb_security; + +	/* Get a volume representation */ +	server = nfs4_create_server(mount_info, &nfs_v4); +	if (IS_ERR(server)) { +		mntroot = ERR_CAST(server); +		goto out; +	} + +	mntroot = nfs_fs_mount_common(server, flags, dev_name, mount_info, &nfs_v4); + +out: +	return mntroot; +} + +static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, +		int flags, void *data, const char *hostname) +{ +	struct vfsmount *root_mnt; +	char *root_devname; +	size_t len; + +	len = strlen(hostname) + 5; +	root_devname = kmalloc(len, GFP_KERNEL); +	if (root_devname == NULL) +		return ERR_PTR(-ENOMEM); +	/* Does hostname needs to be enclosed in brackets? 
*/ +	if (strchr(hostname, ':')) +		snprintf(root_devname, len, "[%s]:/", hostname); +	else +		snprintf(root_devname, len, "%s:/", hostname); +	root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); +	kfree(root_devname); +	return root_mnt; +} + +struct nfs_referral_count { +	struct list_head list; +	const struct task_struct *task; +	unsigned int referral_count; +}; + +static LIST_HEAD(nfs_referral_count_list); +static DEFINE_SPINLOCK(nfs_referral_count_list_lock); + +static struct nfs_referral_count *nfs_find_referral_count(void) +{ +	struct nfs_referral_count *p; + +	list_for_each_entry(p, &nfs_referral_count_list, list) { +		if (p->task == current) +			return p; +	} +	return NULL; +} + +#define NFS_MAX_NESTED_REFERRALS 2 + +static int nfs_referral_loop_protect(void) +{ +	struct nfs_referral_count *p, *new; +	int ret = -ENOMEM; + +	new = kmalloc(sizeof(*new), GFP_KERNEL); +	if (!new) +		goto out; +	new->task = current; +	new->referral_count = 1; + +	ret = 0; +	spin_lock(&nfs_referral_count_list_lock); +	p = nfs_find_referral_count(); +	if (p != NULL) { +		if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) +			ret = -ELOOP; +		else +			p->referral_count++; +	} else { +		list_add(&new->list, &nfs_referral_count_list); +		new = NULL; +	} +	spin_unlock(&nfs_referral_count_list_lock); +	kfree(new); +out: +	return ret; +} + +static void nfs_referral_loop_unprotect(void) +{ +	struct nfs_referral_count *p; + +	spin_lock(&nfs_referral_count_list_lock); +	p = nfs_find_referral_count(); +	p->referral_count--; +	if (p->referral_count == 0) +		list_del(&p->list); +	else +		p = NULL; +	spin_unlock(&nfs_referral_count_list_lock); +	kfree(p); +} + +static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, +		const char *export_path) +{ +	struct dentry *dentry; +	int err; + +	if (IS_ERR(root_mnt)) +		return ERR_CAST(root_mnt); + +	err = nfs_referral_loop_protect(); +	if (err) { +		mntput(root_mnt); +		return ERR_PTR(err); +	} + +	dentry = 
mount_subtree(root_mnt, export_path); +	nfs_referral_loop_unprotect(); + +	return dentry; +} + +struct dentry *nfs4_try_mount(int flags, const char *dev_name, +			      struct nfs_mount_info *mount_info, +			      struct nfs_subversion *nfs_mod) +{ +	char *export_path; +	struct vfsmount *root_mnt; +	struct dentry *res; +	struct nfs_parsed_mount_data *data = mount_info->parsed; + +	dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + +	export_path = data->nfs_server.export_path; +	data->nfs_server.export_path = "/"; +	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, +			data->nfs_server.hostname); +	data->nfs_server.export_path = export_path; + +	res = nfs_follow_remote_path(root_mnt, export_path); + +	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", +		 PTR_ERR_OR_ZERO(res), +		 IS_ERR(res) ? " [error]" : ""); +	return res; +} + +static struct dentry * +nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, +			   const char *dev_name, void *raw_data) +{ +	struct nfs_mount_info mount_info = { +		.fill_super = nfs_fill_super, +		.set_security = nfs_clone_sb_security, +		.cloned = raw_data, +	}; +	struct nfs_server *server; +	struct dentry *mntroot = ERR_PTR(-ENOMEM); + +	dprintk("--> nfs4_referral_get_sb()\n"); + +	mount_info.mntfh = nfs_alloc_fhandle(); +	if (mount_info.cloned == NULL || mount_info.mntfh == NULL) +		goto out; + +	/* create a new volume representation */ +	server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh); +	if (IS_ERR(server)) { +		mntroot = ERR_CAST(server); +		goto out; +	} + +	mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, &nfs_v4); +out: +	nfs_free_fhandle(mount_info.mntfh); +	return mntroot; +} + +/* + * Create an NFS4 server record on referral traversal + */ +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, +		int flags, const char *dev_name, void *raw_data) +{ +	struct nfs_clone_mount *data = raw_data; +	char *export_path; +	struct vfsmount 
*root_mnt; +	struct dentry *res; + +	dprintk("--> nfs4_referral_mount()\n"); + +	export_path = data->mnt_path; +	data->mnt_path = "/"; + +	root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, +			flags, data, data->hostname); +	data->mnt_path = export_path; + +	res = nfs_follow_remote_path(root_mnt, export_path); +	dprintk("<-- nfs4_referral_mount() = %d%s\n", +		PTR_ERR_OR_ZERO(res), +		IS_ERR(res) ? " [error]" : ""); +	return res; +} + + +static int __init init_nfs_v4(void) +{ +	int err; + +	err = nfs_dns_resolver_init(); +	if (err) +		goto out; + +	err = nfs_idmap_init(); +	if (err) +		goto out1; + +	err = nfs4_register_sysctl(); +	if (err) +		goto out2; + +	register_nfs_version(&nfs_v4); +	return 0; +out2: +	nfs_idmap_quit(); +out1: +	nfs_dns_resolver_destroy(); +out: +	return err; +} + +static void __exit exit_nfs_v4(void) +{ +	unregister_nfs_version(&nfs_v4); +	nfs4_unregister_sysctl(); +	nfs_idmap_quit(); +	nfs_dns_resolver_destroy(); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v4); +module_exit(exit_nfs_v4); diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c new file mode 100644 index 00000000000..b6ebe7e445f --- /dev/null +++ b/fs/nfs/nfs4sysctl.c @@ -0,0 +1,69 @@ +/* + * linux/fs/nfs/nfs4sysctl.c + * + * Sysctl interface to NFS v4 parameters + * + * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/sysctl.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs_fs.h> + +#include "nfs4_fs.h" +#include "callback.h" + +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +static struct ctl_table_header *nfs4_callback_sysctl_table; + +static struct ctl_table nfs4_cb_sysctls[] = { +	{ +		.procname = "nfs_callback_tcpport", +		.data = &nfs_callback_set_tcpport, +		.maxlen = sizeof(int), +		.mode = 0644, +		.proc_handler = proc_dointvec_minmax, +		.extra1 = (int *)&nfs_set_port_min, +		.extra2 = (int *)&nfs_set_port_max, +	}, +	{ +		.procname = "idmap_cache_timeout", +		.data = 
&nfs_idmap_cache_timeout, +		.maxlen = sizeof(int), +		.mode = 0644, +		.proc_handler = proc_dointvec_jiffies, +	}, +	{ } +}; + +static struct ctl_table nfs4_cb_sysctl_dir[] = { +	{ +		.procname = "nfs", +		.mode = 0555, +		.child = nfs4_cb_sysctls, +	}, +	{ } +}; + +static struct ctl_table nfs4_cb_sysctl_root[] = { +	{ +		.procname = "fs", +		.mode = 0555, +		.child = nfs4_cb_sysctl_dir, +	}, +	{ } +}; + +int nfs4_register_sysctl(void) +{ +	nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root); +	if (nfs4_callback_sysctl_table == NULL) +		return -ENOMEM; +	return 0; +} + +void nfs4_unregister_sysctl(void) +{ +	unregister_sysctl_table(nfs4_callback_sysctl_table); +	nfs4_callback_sysctl_table = NULL; +} diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c new file mode 100644 index 00000000000..d774335cc8b --- /dev/null +++ b/fs/nfs/nfs4trace.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define CREATE_TRACE_POINTS +#include "nfs4trace.h" + +#ifdef CONFIG_NFS_V4_1 +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds); +#endif diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h new file mode 100644 index 00000000000..0a744f3a86f --- /dev/null +++ b/fs/nfs/nfs4trace.h @@ -0,0 +1,1148 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs4 + +#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS4_H + +#include <linux/tracepoint.h> + +#define show_nfsv4_errors(error) \ +	__print_symbolic(error, \ +		{ NFS4_OK, "OK" }, \ +		/* Mapped by nfs4_stat_to_errno() */ \ +		{ -EPERM, "EPERM" }, \ +		{ -ENOENT, "ENOENT" }, \ +		{ -EIO, "EIO" }, \ +		{ -ENXIO, "ENXIO" }, \ +		{ -EACCES, "EACCES" }, \ +	
	{ -EEXIST, "EEXIST" }, \ +		{ -EXDEV, "EXDEV" }, \ +		{ -ENOTDIR, "ENOTDIR" }, \ +		{ -EISDIR, "EISDIR" }, \ +		{ -EFBIG, "EFBIG" }, \ +		{ -ENOSPC, "ENOSPC" }, \ +		{ -EROFS, "EROFS" }, \ +		{ -EMLINK, "EMLINK" }, \ +		{ -ENAMETOOLONG, "ENAMETOOLONG" }, \ +		{ -ENOTEMPTY, "ENOTEMPTY" }, \ +		{ -EDQUOT, "EDQUOT" }, \ +		{ -ESTALE, "ESTALE" }, \ +		{ -EBADHANDLE, "EBADHANDLE" }, \ +		{ -EBADCOOKIE, "EBADCOOKIE" }, \ +		{ -ENOTSUPP, "ENOTSUPP" }, \ +		{ -ETOOSMALL, "ETOOSMALL" }, \ +		{ -EREMOTEIO, "EREMOTEIO" }, \ +		{ -EBADTYPE, "EBADTYPE" }, \ +		{ -EAGAIN, "EAGAIN" }, \ +		{ -ELOOP, "ELOOP" }, \ +		{ -EOPNOTSUPP, "EOPNOTSUPP" }, \ +		{ -EDEADLK, "EDEADLK" }, \ +		/* RPC errors */ \ +		{ -ENOMEM, "ENOMEM" }, \ +		{ -EKEYEXPIRED, "EKEYEXPIRED" }, \ +		{ -ETIMEDOUT, "ETIMEDOUT" }, \ +		{ -ERESTARTSYS, "ERESTARTSYS" }, \ +		{ -ECONNREFUSED, "ECONNREFUSED" }, \ +		{ -ECONNRESET, "ECONNRESET" }, \ +		{ -ENETUNREACH, "ENETUNREACH" }, \ +		{ -EHOSTUNREACH, "EHOSTUNREACH" }, \ +		{ -EHOSTDOWN, "EHOSTDOWN" }, \ +		{ -EPIPE, "EPIPE" }, \ +		{ -EPFNOSUPPORT, "EPFNOSUPPORT" }, \ +		{ -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \ +		/* NFSv4 native errors */ \ +		{ -NFS4ERR_ACCESS, "ACCESS" }, \ +		{ -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \ +		{ -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \ +		{ -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \ +		{ -NFS4ERR_BADCHAR, "BADCHAR" }, \ +		{ -NFS4ERR_BADHANDLE, "BADHANDLE" }, \ +		{ -NFS4ERR_BADIOMODE, "BADIOMODE" }, \ +		{ -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \ +		{ -NFS4ERR_BADLABEL, "BADLABEL" }, \ +		{ -NFS4ERR_BADNAME, "BADNAME" }, \ +		{ -NFS4ERR_BADOWNER, "BADOWNER" }, \ +		{ -NFS4ERR_BADSESSION, "BADSESSION" }, \ +		{ -NFS4ERR_BADSLOT, "BADSLOT" }, \ +		{ -NFS4ERR_BADTYPE, "BADTYPE" }, \ +		{ -NFS4ERR_BADXDR, "BADXDR" }, \ +		{ -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \ +		{ -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \ +		{ -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \ +		{ -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \ +		{ -NFS4ERR_BAD_SESSION_DIGEST, 
"BAD_SESSION_DIGEST" }, \ +		{ -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \ +		{ -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ +		{ -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \ +		{ -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \ +		{ -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \ +		{ -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \ +			"CONN_NOT_BOUND_TO_SESSION" }, \ +		{ -NFS4ERR_DEADLOCK, "DEADLOCK" }, \ +		{ -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \ +		{ -NFS4ERR_DELAY, "DELAY" }, \ +		{ -NFS4ERR_DELEG_ALREADY_WANTED, \ +			"DELEG_ALREADY_WANTED" }, \ +		{ -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \ +		{ -NFS4ERR_DENIED, "DENIED" }, \ +		{ -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \ +		{ -NFS4ERR_DQUOT, "DQUOT" }, \ +		{ -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \ +		{ -NFS4ERR_EXIST, "EXIST" }, \ +		{ -NFS4ERR_EXPIRED, "EXPIRED" }, \ +		{ -NFS4ERR_FBIG, "FBIG" }, \ +		{ -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \ +		{ -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \ +		{ -NFS4ERR_GRACE, "GRACE" }, \ +		{ -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \ +		{ -NFS4ERR_INVAL, "INVAL" }, \ +		{ -NFS4ERR_IO, "IO" }, \ +		{ -NFS4ERR_ISDIR, "ISDIR" }, \ +		{ -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \ +		{ -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \ +		{ -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \ +		{ -NFS4ERR_LOCKED, "LOCKED" }, \ +		{ -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \ +		{ -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \ +		{ -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \ +		{ -NFS4ERR_MLINK, "MLINK" }, \ +		{ -NFS4ERR_MOVED, "MOVED" }, \ +		{ -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \ +		{ -NFS4ERR_NOENT, "NOENT" }, \ +		{ -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \ +		{ -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \ +		{ -NFS4ERR_NOSPC, "NOSPC" }, \ +		{ -NFS4ERR_NOTDIR, "NOTDIR" }, \ +		{ -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \ +		{ -NFS4ERR_NOTSUPP, "NOTSUPP" }, \ +		{ -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \ +		{ -NFS4ERR_NOT_SAME, "NOT_SAME" }, \ +		{ -NFS4ERR_NO_GRACE, 
"NO_GRACE" }, \ +		{ -NFS4ERR_NXIO, "NXIO" }, \ +		{ -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \ +		{ -NFS4ERR_OPENMODE, "OPENMODE" }, \ +		{ -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \ +		{ -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \ +		{ -NFS4ERR_PERM, "PERM" }, \ +		{ -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \ +		{ -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \ +		{ -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \ +		{ -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \ +		{ -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \ +		{ -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \ +		{ -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \ +		{ -NFS4ERR_REP_TOO_BIG_TO_CACHE, \ +			"REP_TOO_BIG_TO_CACHE" }, \ +		{ -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \ +		{ -NFS4ERR_RESOURCE, "RESOURCE" }, \ +		{ -NFS4ERR_RESTOREFH, "RESTOREFH" }, \ +		{ -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \ +		{ -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \ +		{ -NFS4ERR_ROFS, "ROFS" }, \ +		{ -NFS4ERR_SAME, "SAME" }, \ +		{ -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \ +		{ -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \ +		{ -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \ +		{ -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \ +		{ -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \ +		{ -NFS4ERR_STALE, "STALE" }, \ +		{ -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \ +		{ -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \ +		{ -NFS4ERR_SYMLINK, "SYMLINK" }, \ +		{ -NFS4ERR_TOOSMALL, "TOOSMALL" }, \ +		{ -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \ +		{ -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \ +		{ -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \ +		{ -NFS4ERR_WRONGSEC, "WRONGSEC" }, \ +		{ -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \ +		{ -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \ +		{ -NFS4ERR_XDEV, "XDEV" }) + +#define show_open_flags(flags) \ +	__print_flags(flags, "|", \ +		{ O_CREAT, "O_CREAT" }, \ +		{ O_EXCL, "O_EXCL" }, \ +		{ O_TRUNC, "O_TRUNC" }, \ +		{ O_DIRECT, "O_DIRECT" }) + +#define show_fmode_flags(mode) \ +	
__print_flags(mode, "|", \ +		{ ((__force unsigned long)FMODE_READ), "READ" }, \ +		{ ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ +		{ ((__force unsigned long)FMODE_EXEC), "EXEC" }) + +#define show_nfs_fattr_flags(valid) \ +	__print_flags((unsigned long)valid, "|", \ +		{ NFS_ATTR_FATTR_TYPE, "TYPE" }, \ +		{ NFS_ATTR_FATTR_MODE, "MODE" }, \ +		{ NFS_ATTR_FATTR_NLINK, "NLINK" }, \ +		{ NFS_ATTR_FATTR_OWNER, "OWNER" }, \ +		{ NFS_ATTR_FATTR_GROUP, "GROUP" }, \ +		{ NFS_ATTR_FATTR_RDEV, "RDEV" }, \ +		{ NFS_ATTR_FATTR_SIZE, "SIZE" }, \ +		{ NFS_ATTR_FATTR_FSID, "FSID" }, \ +		{ NFS_ATTR_FATTR_FILEID, "FILEID" }, \ +		{ NFS_ATTR_FATTR_ATIME, "ATIME" }, \ +		{ NFS_ATTR_FATTR_MTIME, "MTIME" }, \ +		{ NFS_ATTR_FATTR_CTIME, "CTIME" }, \ +		{ NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \ +		{ NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \ +		{ NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }) + +DECLARE_EVENT_CLASS(nfs4_clientid_event, +		TP_PROTO( +			const struct nfs_client *clp, +			int error +		), + +		TP_ARGS(clp, error), + +		TP_STRUCT__entry( +			__string(dstaddr, +				rpc_peeraddr2str(clp->cl_rpcclient, +					RPC_DISPLAY_ADDR)) +			__field(int, error) +		), + +		TP_fast_assign( +			__entry->error = error; +			__assign_str(dstaddr, +				rpc_peeraddr2str(clp->cl_rpcclient, +						RPC_DISPLAY_ADDR)); +		), + +		TP_printk( +			"error=%d (%s) dstaddr=%s", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			__get_str(dstaddr) +		) +); +#define DEFINE_NFS4_CLIENTID_EVENT(name) \ +	DEFINE_EVENT(nfs4_clientid_event, name,	 \ +			TP_PROTO( \ +				const struct nfs_client *clp, \ +				int error \ +			), \ +			TP_ARGS(clp, error)) +DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session); 
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session);	/* each line instantiates one tracepoint named after its argument */
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
+DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
+/*
+ * nfs4_setup_sequence - tracepoint taking a session and the SEQUENCE
+ * arguments.
+ *
+ * Records a hash of the session id plus, from args->sa_slot, the slot
+ * number, the slot's sequence number and the highest used slot id of the
+ * table that slot belongs to.
+ *
+ * NOTE(review): args->sa_slot and sa_slot->table are dereferenced without
+ * a NULL check; presumably the event is only fired once a slot has been
+ * assigned - confirm against the call site.
+ */
+TRACE_EVENT(nfs4_setup_sequence,
+		TP_PROTO(
+			const struct nfs4_session *session,
+			const struct nfs4_sequence_args *args
+		),
+		TP_ARGS(session, args),
+
+		TP_STRUCT__entry(
+			__field(unsigned int, session)
+			__field(unsigned int, slot_nr)
+			__field(unsigned int, seq_nr)
+			__field(unsigned int, highest_used_slotid)
+		),
+
+		TP_fast_assign(
+			const struct nfs4_slot *sa_slot = args->sa_slot;
+			__entry->session = nfs_session_id_hash(&session->sess_id);
+			__entry->slot_nr = sa_slot->slot_nr;
+			__entry->seq_nr = sa_slot->seq_nr;
+			__entry->highest_used_slotid =
+					sa_slot->table->highest_used_slotid;
+		),
+		TP_printk(
+			"session=0x%08x slot_nr=%u seq_nr=%u "
+			"highest_used_slotid=%u",
+			__entry->session,
+			__entry->slot_nr,
+			__entry->seq_nr,
+			__entry->highest_used_slotid
+		)
+);
+/*
+ * Format a SEQ4_STATUS_* bitmask for TP_printk(): __print_flags() is given
+ * the value, the "|" delimiter and the table of flag/name pairs below.
+ */
+#define show_nfs4_sequence_status_flags(status) \
+	__print_flags((unsigned long)status, "|", \
+		{ SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \
+		{ SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \
+			"CB_GSS_CONTEXTS_EXPIRING" }, \
+		{ SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \
+			"CB_GSS_CONTEXTS_EXPIRED" }, \
+		{ SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \
+			"EXPIRED_ALL_STATE_REVOKED" }, \
+		{ SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \
+			"EXPIRED_SOME_STATE_REVOKED" }, \
+		{ SEQ4_STATUS_ADMIN_STATE_REVOKED, \
+			"ADMIN_STATE_REVOKED" }, \
+		{ SEQ4_STATUS_RECALLABLE_STATE_REVOKED,	 \
+			"RECALLABLE_STATE_REVOKED" }, \
+		{ SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \
+		{ SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \
+			"RESTART_RECLAIM_NEEDED" }, \
+		{ SEQ4_STATUS_CB_PATH_DOWN_SESSION, \
+			"CB_PATH_DOWN_SESSION" }, \
+		{ SEQ4_STATUS_BACKCHANNEL_FAULT, \
+			"BACKCHANNEL_FAULT" })
+
+TRACE_EVENT(nfs4_sequence_done, +		TP_PROTO( +			const struct nfs4_session *session, +			const struct nfs4_sequence_res *res +		), +		TP_ARGS(session, res), + +		TP_STRUCT__entry( +			__field(unsigned int, session) +			__field(unsigned int, slot_nr) +			__field(unsigned int, seq_nr) +			__field(unsigned int, highest_slotid) +			__field(unsigned int, target_highest_slotid) +			__field(unsigned int, status_flags) +			__field(int, error) +		), + +		TP_fast_assign( +			const struct nfs4_slot *sr_slot = res->sr_slot; +			__entry->session = nfs_session_id_hash(&session->sess_id); +			__entry->slot_nr = sr_slot->slot_nr; +			__entry->seq_nr = sr_slot->seq_nr; +			__entry->highest_slotid = res->sr_highest_slotid; +			__entry->target_highest_slotid = +					res->sr_target_highest_slotid; +			__entry->error = res->sr_status; +		), +		TP_printk( +			"error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " +			"highest_slotid=%u target_highest_slotid=%u " +			"status_flags=%u (%s)", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			__entry->session, +			__entry->slot_nr, +			__entry->seq_nr, +			__entry->highest_slotid, +			__entry->target_highest_slotid, +			__entry->status_flags, +			show_nfs4_sequence_status_flags(__entry->status_flags) +		) +); + +struct cb_sequenceargs; +struct cb_sequenceres; + +TRACE_EVENT(nfs4_cb_sequence, +		TP_PROTO( +			const struct cb_sequenceargs *args, +			const struct cb_sequenceres *res, +			__be32 status +		), +		TP_ARGS(args, res, status), + +		TP_STRUCT__entry( +			__field(unsigned int, session) +			__field(unsigned int, slot_nr) +			__field(unsigned int, seq_nr) +			__field(unsigned int, highest_slotid) +			__field(unsigned int, cachethis) +			__field(int, error) +		), + +		TP_fast_assign( +			__entry->session = nfs_session_id_hash(&args->csa_sessionid); +			__entry->slot_nr = args->csa_slotid; +			__entry->seq_nr = args->csa_sequenceid; +			__entry->highest_slotid = args->csa_highestslotid; +			__entry->cachethis = 
args->csa_cachethis; +			__entry->error = -be32_to_cpu(status); +		), + +		TP_printk( +			"error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " +			"highest_slotid=%u", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			__entry->session, +			__entry->slot_nr, +			__entry->seq_nr, +			__entry->highest_slotid +		) +); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_open_event, +		TP_PROTO( +			const struct nfs_open_context *ctx, +			int flags, +			int error +		), + +		TP_ARGS(ctx, flags, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(unsigned int, flags) +			__field(unsigned int, fmode) +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(u64, dir) +			__string(name, ctx->dentry->d_name.name) +		), + +		TP_fast_assign( +			const struct nfs4_state *state = ctx->state; +			const struct inode *inode = NULL; + +			__entry->error = error; +			__entry->flags = flags; +			__entry->fmode = (__force unsigned int)ctx->mode; +			__entry->dev = ctx->dentry->d_sb->s_dev; +			if (!IS_ERR(state)) +				inode = state->inode; +			if (inode != NULL) { +				__entry->fileid = NFS_FILEID(inode); +				__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +			} else { +				__entry->fileid = 0; +				__entry->fhandle = 0; +			} +			__entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode); +			__assign_str(name, ctx->dentry->d_name.name); +		), + +		TP_printk( +			"error=%d (%s) flags=%d (%s) fmode=%s " +			"fileid=%02x:%02x:%llu fhandle=0x%08x " +			"name=%02x:%02x:%llu/%s", +			 __entry->error, +			 show_nfsv4_errors(__entry->error), +			 __entry->flags, +			 show_open_flags(__entry->flags), +			 show_fmode_flags(__entry->fmode), +			 MAJOR(__entry->dev), MINOR(__entry->dev), +			 (unsigned long long)__entry->fileid, +			 __entry->fhandle, +			 MAJOR(__entry->dev), MINOR(__entry->dev), +			 (unsigned long long)__entry->dir, +			 __get_str(name) +		) +); + +#define DEFINE_NFS4_OPEN_EVENT(name) \ +	
DEFINE_EVENT(nfs4_open_event, name, \ +			TP_PROTO( \ +				const struct nfs_open_context *ctx, \ +				int flags, \ +				int error \ +			), \ +			TP_ARGS(ctx, flags, error)) +DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim); +DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired); +DEFINE_NFS4_OPEN_EVENT(nfs4_open_file); + +TRACE_EVENT(nfs4_close, +		TP_PROTO( +			const struct nfs4_state *state, +			const struct nfs_closeargs *args, +			const struct nfs_closeres *res, +			int error +		), + +		TP_ARGS(state, args, res, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(unsigned int, fmode) +			__field(int, error) +		), + +		TP_fast_assign( +			const struct inode *inode = state->inode; + +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +			__entry->fmode = (__force unsigned int)state->state; +			__entry->error = error; +		), + +		TP_printk( +			"error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " +			"fhandle=0x%08x", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			__entry->fmode ?  
show_fmode_flags(__entry->fmode) : +					  "closed", +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle +		) +); + +#define show_lock_cmd(type) \ +	__print_symbolic((int)type, \ +		{ F_GETLK, "GETLK" }, \ +		{ F_SETLK, "SETLK" }, \ +		{ F_SETLKW, "SETLKW" }) +#define show_lock_type(type) \ +	__print_symbolic((int)type, \ +		{ F_RDLCK, "RDLCK" }, \ +		{ F_WRLCK, "WRLCK" }, \ +		{ F_UNLCK, "UNLCK" }) + +DECLARE_EVENT_CLASS(nfs4_lock_event, +		TP_PROTO( +			const struct file_lock *request, +			const struct nfs4_state *state, +			int cmd, +			int error +		), + +		TP_ARGS(request, state, cmd, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(int, cmd) +			__field(char, type) +			__field(loff_t, start) +			__field(loff_t, end) +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +		), + +		TP_fast_assign( +			const struct inode *inode = state->inode; + +			__entry->error = error; +			__entry->cmd = cmd; +			__entry->type = request->fl_type; +			__entry->start = request->fl_start; +			__entry->end = request->fl_end; +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +		), + +		TP_printk( +			"error=%d (%s) cmd=%s:%s range=%lld:%lld " +			"fileid=%02x:%02x:%llu fhandle=0x%08x", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			show_lock_cmd(__entry->cmd), +			show_lock_type(__entry->type), +			(long long)__entry->start, +			(long long)__entry->end, +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle +		) +); + +#define DEFINE_NFS4_LOCK_EVENT(name) \ +	DEFINE_EVENT(nfs4_lock_event, name, \ +			TP_PROTO( \ +				const struct file_lock *request, \ +				const struct nfs4_state *state, \ +				int cmd, \ +				int error \ +			), \ +			TP_ARGS(request, state, cmd, error)) +DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock); 
+DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock); +DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim); +DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired); +DEFINE_NFS4_LOCK_EVENT(nfs4_unlock); + +DECLARE_EVENT_CLASS(nfs4_set_delegation_event, +		TP_PROTO( +			const struct inode *inode, +			fmode_t fmode +		), + +		TP_ARGS(inode, fmode), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(unsigned int, fmode) +		), + +		TP_fast_assign( +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +			__entry->fmode = (__force unsigned int)fmode; +		), + +		TP_printk( +			"fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x", +			show_fmode_flags(__entry->fmode), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle +		) +); +#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \ +	DEFINE_EVENT(nfs4_set_delegation_event, name, \ +			TP_PROTO( \ +				const struct inode *inode, \ +				fmode_t fmode \ +			), \ +			TP_ARGS(inode, fmode)) +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation); +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation); + +TRACE_EVENT(nfs4_delegreturn_exit, +		TP_PROTO( +			const struct nfs4_delegreturnargs *args, +			const struct nfs4_delegreturnres *res, +			int error +		), + +		TP_ARGS(args, res, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(int, error) +		), + +		TP_fast_assign( +			__entry->dev = res->server->s_dev; +			__entry->fhandle = nfs_fhandle_hash(args->fhandle); +			__entry->error = error; +		), + +		TP_printk( +			"error=%d (%s) dev=%02x:%02x fhandle=0x%08x", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			__entry->fhandle +		) +); + +#ifdef CONFIG_NFS_V4_1 +DECLARE_EVENT_CLASS(nfs4_test_stateid_event, +		TP_PROTO( +			const struct nfs4_state *state, +			const struct 
nfs4_lock_state *lsp, +			int error +		), + +		TP_ARGS(state, lsp, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +		), + +		TP_fast_assign( +			const struct inode *inode = state->inode; + +			__entry->error = error; +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +		), + +		TP_printk( +			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle +		) +); + +#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \ +	DEFINE_EVENT(nfs4_test_stateid_event, name, \ +			TP_PROTO( \ +				const struct nfs4_state *state, \ +				const struct nfs4_lock_state *lsp, \ +				int error \ +			), \ +			TP_ARGS(state, lsp, error)) +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid); +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid); +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_lookup_event, +		TP_PROTO( +			const struct inode *dir, +			const struct qstr *name, +			int error +		), + +		TP_ARGS(dir, name, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(int, error) +			__field(u64, dir) +			__string(name, name->name) +		), + +		TP_fast_assign( +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->error = error; +			__assign_str(name, name->name); +		), + +		TP_printk( +			"error=%d (%s) name=%02x:%02x:%llu/%s", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +#define DEFINE_NFS4_LOOKUP_EVENT(name) \ +	DEFINE_EVENT(nfs4_lookup_event, name, \ +			TP_PROTO( \ +				const struct inode *dir, \ +				const struct qstr 
*name, \ +				int error \ +			), \ +			TP_ARGS(dir, name, error)) + +DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo); + +TRACE_EVENT(nfs4_rename, +		TP_PROTO( +			const struct inode *olddir, +			const struct qstr *oldname, +			const struct inode *newdir, +			const struct qstr *newname, +			int error +		), + +		TP_ARGS(olddir, oldname, newdir, newname, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(int, error) +			__field(u64, olddir) +			__string(oldname, oldname->name) +			__field(u64, newdir) +			__string(newname, newname->name) +		), + +		TP_fast_assign( +			__entry->dev = olddir->i_sb->s_dev; +			__entry->olddir = NFS_FILEID(olddir); +			__entry->newdir = NFS_FILEID(newdir); +			__entry->error = error; +			__assign_str(oldname, oldname->name); +			__assign_str(newname, newname->name); +		), + +		TP_printk( +			"error=%d (%s) oldname=%02x:%02x:%llu/%s " +			"newname=%02x:%02x:%llu/%s", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->olddir, +			__get_str(oldname), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->newdir, +			__get_str(newname) +		) +); + +DECLARE_EVENT_CLASS(nfs4_inode_event, +		TP_PROTO( +			const struct inode *inode, +			int error +		), + +		TP_ARGS(inode, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(int, error) +		), + +		TP_fast_assign( +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +			__entry->error = error; +		), + +		TP_printk( +			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", +			__entry->error, +		
	show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle +		) +); + +#define DEFINE_NFS4_INODE_EVENT(name) \ +	DEFINE_EVENT(nfs4_inode_event, name, \ +			TP_PROTO( \ +				const struct inode *inode, \ +				int error \ +			), \ +			TP_ARGS(inode, error)) + +DEFINE_NFS4_INODE_EVENT(nfs4_setattr); +DEFINE_NFS4_INODE_EVENT(nfs4_access); +DEFINE_NFS4_INODE_EVENT(nfs4_readlink); +DEFINE_NFS4_INODE_EVENT(nfs4_readdir); +DEFINE_NFS4_INODE_EVENT(nfs4_get_acl); +DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); +DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ +DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); +DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn); + +DECLARE_EVENT_CLASS(nfs4_getattr_event, +		TP_PROTO( +			const struct nfs_server *server, +			const struct nfs_fh *fhandle, +			const struct nfs_fattr *fattr, +			int error +		), + +		TP_ARGS(server, fhandle, fattr, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(unsigned int, valid) +			__field(int, error) +		), + +		TP_fast_assign( +			__entry->dev = server->s_dev; +			__entry->valid = fattr->valid; +			__entry->fhandle = nfs_fhandle_hash(fhandle); +			__entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? 
fattr->fileid : 0; +			__entry->error = error; +		), + +		TP_printk( +			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " +			"valid=%s", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle, +			show_nfs_fattr_flags(__entry->valid) +		) +); + +#define DEFINE_NFS4_GETATTR_EVENT(name) \ +	DEFINE_EVENT(nfs4_getattr_event, name, \ +			TP_PROTO( \ +				const struct nfs_server *server, \ +				const struct nfs_fh *fhandle, \ +				const struct nfs_fattr *fattr, \ +				int error \ +			), \ +			TP_ARGS(server, fhandle, fattr, error)) +DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr); +DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root); +DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo); + +DECLARE_EVENT_CLASS(nfs4_idmap_event, +		TP_PROTO( +			const char *name, +			int len, +			u32 id, +			int error +		), + +		TP_ARGS(name, len, id, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(u32, id) +			__dynamic_array(char, name, len > 0 ? len + 1 : 1) +		), + +		TP_fast_assign( +			if (len < 0) +				len = 0; +			__entry->error = error < 0 ? 
error : 0; +			__entry->id = id; +			memcpy(__get_dynamic_array(name), name, len); +			((char *)__get_dynamic_array(name))[len] = 0; +		), + +		TP_printk( +			"error=%d id=%u name=%s", +			__entry->error, +			__entry->id, +			__get_str(name) +		) +); +#define DEFINE_NFS4_IDMAP_EVENT(name) \ +	DEFINE_EVENT(nfs4_idmap_event, name, \ +			TP_PROTO( \ +				const char *name, \ +				int len, \ +				u32 id, \ +				int error \ +			), \ +			TP_ARGS(name, len, id, error)) +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); + +DECLARE_EVENT_CLASS(nfs4_read_event, +		TP_PROTO( +			const struct nfs_pgio_data *data, +			int error +		), + +		TP_ARGS(data, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(loff_t, offset) +			__field(size_t, count) +			__field(int, error) +		), + +		TP_fast_assign( +			const struct inode *inode = data->header->inode; +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +			__entry->offset = data->args.offset; +			__entry->count = data->args.count; +			__entry->error = error; +		), + +		TP_printk( +			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " +			"offset=%lld count=%zu", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle, +			(long long)__entry->offset, +			__entry->count +		) +); +#define DEFINE_NFS4_READ_EVENT(name) \ +	DEFINE_EVENT(nfs4_read_event, name, \ +			TP_PROTO( \ +				const struct nfs_pgio_data *data, \ +				int error \ +			), \ +			TP_ARGS(data, error)) +DEFINE_NFS4_READ_EVENT(nfs4_read); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_write_event, +		
TP_PROTO( +			const struct nfs_pgio_data *data, +			int error +		), + +		TP_ARGS(data, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(loff_t, offset) +			__field(size_t, count) +			__field(int, error) +		), + +		TP_fast_assign( +			const struct inode *inode = data->header->inode; +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +			__entry->offset = data->args.offset; +			__entry->count = data->args.count; +			__entry->error = error; +		), + +		TP_printk( +			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " +			"offset=%lld count=%zu", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle, +			(long long)__entry->offset, +			__entry->count +		) +); + +#define DEFINE_NFS4_WRITE_EVENT(name) \ +	DEFINE_EVENT(nfs4_write_event, name, \ +			TP_PROTO( \ +				const struct nfs_pgio_data *data, \ +				int error \ +			), \ +			TP_ARGS(data, error)) +DEFINE_NFS4_WRITE_EVENT(nfs4_write); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_commit_event, +		TP_PROTO( +			const struct nfs_commit_data *data, +			int error +		), + +		TP_ARGS(data, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(loff_t, offset) +			__field(size_t, count) +			__field(int, error) +		), + +		TP_fast_assign( +			const struct inode *inode = data->inode; +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +			__entry->offset = data->args.offset; +			__entry->count = data->args.count; +			__entry->error = error; +		), + +		TP_printk( +			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " +			"offset=%lld 
count=%zu", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle, +			(long long)__entry->offset, +			__entry->count +		) +); +#define DEFINE_NFS4_COMMIT_EVENT(name) \ +	DEFINE_EVENT(nfs4_commit_event, name, \ +			TP_PROTO( \ +				const struct nfs_commit_data *data, \ +				int error \ +			), \ +			TP_ARGS(data, error)) +DEFINE_NFS4_COMMIT_EVENT(nfs4_commit); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds); + +#define show_pnfs_iomode(iomode) \ +	__print_symbolic(iomode, \ +		{ IOMODE_READ, "READ" }, \ +		{ IOMODE_RW, "RW" }, \ +		{ IOMODE_ANY, "ANY" }) + +TRACE_EVENT(nfs4_layoutget, +		TP_PROTO( +			const struct nfs_open_context *ctx, +			const struct pnfs_layout_range *args, +			const struct pnfs_layout_range *res, +			int error +		), + +		TP_ARGS(ctx, args, res, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(u32, iomode) +			__field(u64, offset) +			__field(u64, count) +			__field(int, error) +		), + +		TP_fast_assign( +			const struct inode *inode = ctx->dentry->d_inode; +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); +			__entry->iomode = args->iomode; +			__entry->offset = args->offset; +			__entry->count = args->length; +			__entry->error = error; +		), + +		TP_printk( +			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " +			"iomode=%s offset=%llu count=%llu", +			__entry->error, +			show_nfsv4_errors(__entry->error), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle, +			show_pnfs_iomode(__entry->iomode), +			(unsigned long long)__entry->offset, +			(unsigned long long)__entry->count +		) +); + +DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); +DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); + +#endif /* 
CONFIG_NFS_V4_1 */ + +#endif /* _TRACE_NFS4_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE nfs4trace +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f313c4cce7e..939ae606cfa 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -44,15 +44,21 @@  #include <linux/pagemap.h>  #include <linux/proc_fs.h>  #include <linux/kdev_t.h> +#include <linux/module.h> +#include <linux/utsname.h>  #include <linux/sunrpc/clnt.h>  #include <linux/sunrpc/msg_prot.h> +#include <linux/sunrpc/gss_api.h>  #include <linux/nfs.h>  #include <linux/nfs4.h>  #include <linux/nfs_fs.h>  #include <linux/nfs_idmap.h> +  #include "nfs4_fs.h"  #include "internal.h" +#include "nfs4session.h"  #include "pnfs.h" +#include "netns.h"  #define NFSDBG_FACILITY		NFSDBG_XDR @@ -71,8 +77,8 @@ static int nfs4_stat_to_errno(int);  /* lock,open owner id:   * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT  >> 2)   */ -#define open_owner_id_maxsz	(1 + 4) -#define lock_owner_id_maxsz	(1 + 4) +#define open_owner_id_maxsz	(1 + 2 + 1 + 1 + 2) +#define lock_owner_id_maxsz	(1 + 1 + 4)  #define decode_lockowner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))  #define compound_encode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))  #define compound_decode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2)) @@ -90,15 +96,25 @@ static int nfs4_stat_to_errno(int);  #define encode_getfh_maxsz      (op_encode_hdr_maxsz)  #define decode_getfh_maxsz      (op_decode_hdr_maxsz + 1 + \  				((3+NFS4_FHSIZE) >> 2)) -#define nfs4_fattr_bitmap_maxsz 3 +#define nfs4_fattr_bitmap_maxsz 4  #define encode_getattr_maxsz    (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)  #define nfs4_name_maxsz		(1 + ((3 + NFS4_MAXNAMLEN) >> 2))  #define nfs4_path_maxsz		(1 + ((3 + NFS4_MAXPATHLEN) >> 2))  #define nfs4_owner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))  #define nfs4_group_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#ifdef 
CONFIG_NFS_V4_SECURITY_LABEL +/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */ +#define	nfs4_label_maxsz	(4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) +#else +#define	nfs4_label_maxsz	0 +#endif +/* We support only one layout type per file system */ +#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)  /* This is based on getfattr, which uses the most attributes: */  #define nfs4_fattr_value_maxsz	(1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ -				3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz)) +				3 + 3 + 3 + nfs4_owner_maxsz + \ +				nfs4_group_maxsz + nfs4_label_maxsz + \ +				 decode_mdsthreshold_maxsz))  #define nfs4_fattr_maxsz	(nfs4_fattr_bitmap_maxsz + \  				nfs4_fattr_value_maxsz)  #define decode_getattr_maxsz    (op_decode_hdr_maxsz + nfs4_fattr_maxsz) @@ -106,13 +122,18 @@ static int nfs4_stat_to_errno(int);  				 1 + 2 + 1 + \  				nfs4_owner_maxsz + \  				nfs4_group_maxsz + \ +				nfs4_label_maxsz + \  				4 + 4)  #define encode_savefh_maxsz     (op_encode_hdr_maxsz)  #define decode_savefh_maxsz     (op_decode_hdr_maxsz)  #define encode_restorefh_maxsz  (op_encode_hdr_maxsz)  #define decode_restorefh_maxsz  (op_decode_hdr_maxsz)  #define encode_fsinfo_maxsz	(encode_getattr_maxsz) -#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + 11) +/* The 5 accounts for the PNFS attributes, and assumes that at most three + * layout types will be returned. 
+ */ +#define decode_fsinfo_maxsz	(op_decode_hdr_maxsz + \ +				 nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)  #define encode_renew_maxsz	(op_encode_hdr_maxsz + 3)  #define decode_renew_maxsz	(op_decode_hdr_maxsz)  #define encode_setclientid_maxsz \ @@ -179,7 +200,8 @@ static int nfs4_stat_to_errno(int);  				 encode_stateid_maxsz + 3)  #define decode_read_maxsz	(op_decode_hdr_maxsz + 2)  #define encode_readdir_maxsz	(op_encode_hdr_maxsz + \ -				 2 + encode_verifier_maxsz + 5) +				 2 + encode_verifier_maxsz + 5 + \ +				nfs4_label_maxsz)  #define decode_readdir_maxsz	(op_decode_hdr_maxsz + \  				 decode_verifier_maxsz)  #define encode_readlink_maxsz	(op_encode_hdr_maxsz) @@ -253,9 +275,13 @@ static int nfs4_stat_to_errno(int);  				(encode_getattr_maxsz)  #define decode_fs_locations_maxsz \  				(0) +#define encode_secinfo_maxsz	(op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_secinfo_maxsz	(op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))  #if defined(CONFIG_NFS_V4_1)  #define NFS4_MAX_MACHINE_NAME_LEN (64) +#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \ +			 sizeof(utsname()->version) + sizeof(utsname()->machine) + 8)  #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \  				encode_verifier_maxsz + \ @@ -263,21 +289,34 @@ static int nfs4_stat_to_errno(int);  				XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \  				1 /* flags */ + \  				1 /* spa_how */ + \ -				0 /* SP4_NONE (for now) */ + \ -				1 /* zero implemetation id array */) +				/* max is SP4_MACH_CRED (for now) */ + \ +				1 + NFS4_OP_MAP_NUM_WORDS + \ +				1 + NFS4_OP_MAP_NUM_WORDS + \ +				1 /* implementation id array of size 1 */ + \ +				1 /* nii_domain */ + \ +				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ +				1 /* nii_name */ + \ +				XDR_QUADLEN(IMPL_NAME_LIMIT) + \ +				3 /* nii_date */)  #define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \  				2 /* eir_clientid */ + \  				1 /* eir_sequenceid */ + \  				1 /* eir_flags */ + \  				
1 /* spr_how */ + \ -				0 /* SP4_NONE (for now) */ + \ +				  /* max is SP4_MACH_CRED (for now) */ + \ +				1 + NFS4_OP_MAP_NUM_WORDS + \ +				1 + NFS4_OP_MAP_NUM_WORDS + \  				2 /* eir_server_owner.so_minor_id */ + \  				/* eir_server_owner.so_major_id<> */ \  				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \  				/* eir_server_scope<> */ \  				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \  				1 /* eir_server_impl_id array length */ + \ -				0 /* ignored eir_server_impl_id contents */) +				1 /* nii_domain */ + \ +				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ +				1 /* nii_name */ + \ +				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ +				3 /* nii_date */)  #define encode_channel_attrs_maxsz  (6 + 1 /* ca_rdma_ird.len (0) */)  #define decode_channel_attrs_maxsz  (6 + \  				     1 /* ca_rdma_ird.len */ + \ @@ -303,14 +342,37 @@ static int nfs4_stat_to_errno(int);  				     1 /* csr_flags */ + \  				     decode_channel_attrs_maxsz + \  				     decode_channel_attrs_maxsz) +#define encode_bind_conn_to_session_maxsz  (op_encode_hdr_maxsz + \ +				     /* bctsa_sessid */ \ +				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ +				     1 /* bctsa_dir */ + \ +				     1 /* bctsa_use_conn_in_rdma_mode */) +#define decode_bind_conn_to_session_maxsz  (op_decode_hdr_maxsz +	\ +				     /* bctsr_sessid */ \ +				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ +				     1 /* bctsr_dir */ + \ +				     1 /* bctsr_use_conn_in_rdma_mode */)  #define encode_destroy_session_maxsz    (op_encode_hdr_maxsz + 4)  #define decode_destroy_session_maxsz    (op_decode_hdr_maxsz) +#define encode_destroy_clientid_maxsz   (op_encode_hdr_maxsz + 2) +#define decode_destroy_clientid_maxsz   (op_decode_hdr_maxsz)  #define encode_sequence_maxsz	(op_encode_hdr_maxsz + \  				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4)  #define decode_sequence_maxsz	(op_decode_hdr_maxsz + \  				XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)  #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)  #define decode_reclaim_complete_maxsz	
(op_decode_hdr_maxsz + 4) +#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ +				encode_verifier_maxsz) +#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ +				2 /* nfs_cookie4 gdlr_cookie */ + \ +				decode_verifier_maxsz \ +				  /* verifier4 gdlr_verifier */ + \ +				1 /* gdlr_deviceid_list count */ + \ +				XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ +					    NFS4_DEVICEID4_SIZE) \ +				  /* gdlr_deviceid_list */ + \ +				1 /* bool gdlr_eof */)  #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \  				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))  #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ @@ -324,6 +386,30 @@ static int nfs4_stat_to_errno(int);  #define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \  				decode_stateid_maxsz + \  				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz +          \ +				2 /* offset */ + \ +				2 /* length */ + \ +				1 /* reclaim */ + \ +				encode_stateid_maxsz + \ +				1 /* new offset (true) */ + \ +				2 /* last byte written */ + \ +				1 /* nt_timechanged (false) */ + \ +				1 /* layoutupdate4 layout type */ + \ +				1 /* NULL filelayout layoutupdate4 payload */) +#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ +				encode_stateid_maxsz + \ +				1 /* FIXME: opaque lrf_body always empty at the moment */) +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ +				1 + decode_stateid_maxsz) +#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) +#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz +#define encode_test_stateid_maxsz	(op_encode_hdr_maxsz + 2 + \ +					 XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_test_stateid_maxsz	(op_decode_hdr_maxsz + 2 + 1) +#define encode_free_stateid_maxsz	(op_encode_hdr_maxsz + 1 + \ +					 XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_free_stateid_maxsz	(op_decode_hdr_maxsz)  #else /* CONFIG_NFS_V4_1 */  
#define encode_sequence_maxsz	0  #define decode_sequence_maxsz	0 @@ -368,30 +454,24 @@ static int nfs4_stat_to_errno(int);  #define NFS4_enc_commit_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ -				encode_commit_maxsz + \ -				encode_getattr_maxsz) +				encode_commit_maxsz)  #define NFS4_dec_commit_sz	(compound_decode_hdr_maxsz + \  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \ -				decode_commit_maxsz + \ -				decode_getattr_maxsz) +				decode_commit_maxsz)  #define NFS4_enc_open_sz        (compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ -				encode_savefh_maxsz + \  				encode_open_maxsz + \ +				encode_access_maxsz + \  				encode_getfh_maxsz + \ -				encode_getattr_maxsz + \ -				encode_restorefh_maxsz + \  				encode_getattr_maxsz)  #define NFS4_dec_open_sz        (compound_decode_hdr_maxsz + \  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \ -				decode_savefh_maxsz + \  				decode_open_maxsz + \ +				decode_access_maxsz + \  				decode_getfh_maxsz + \ -				decode_getattr_maxsz + \ -				decode_restorefh_maxsz + \  				decode_getattr_maxsz)  #define NFS4_enc_open_confirm_sz \  				(compound_encode_hdr_maxsz + \ @@ -405,11 +485,13 @@ static int nfs4_stat_to_errno(int);  					encode_sequence_maxsz + \  					encode_putfh_maxsz + \  					encode_open_maxsz + \ +					encode_access_maxsz + \  					encode_getattr_maxsz)  #define NFS4_dec_open_noattr_sz	(compound_decode_hdr_maxsz + \  					decode_sequence_maxsz + \  					decode_putfh_maxsz + \  					decode_open_maxsz + \ +					decode_access_maxsz + \  					decode_getattr_maxsz)  #define NFS4_enc_open_downgrade_sz \  				(compound_encode_hdr_maxsz + \ @@ -461,14 +543,10 @@ static int nfs4_stat_to_errno(int);  				decode_setclientid_maxsz)  #define NFS4_enc_setclientid_confirm_sz \  				(compound_encode_hdr_maxsz + \ -				encode_setclientid_confirm_maxsz + \ -				encode_putrootfh_maxsz + \ -				
encode_fsinfo_maxsz) +				encode_setclientid_confirm_maxsz)  #define NFS4_dec_setclientid_confirm_sz \  				(compound_decode_hdr_maxsz + \ -				decode_setclientid_confirm_maxsz + \ -				decode_putrootfh_maxsz + \ -				decode_fsinfo_maxsz) +				decode_setclientid_confirm_maxsz)  #define NFS4_enc_lock_sz        (compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ @@ -512,11 +590,13 @@ static int nfs4_stat_to_errno(int);  #define NFS4_enc_getattr_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ -				encode_getattr_maxsz) +				encode_getattr_maxsz + \ +				encode_renew_maxsz)  #define NFS4_dec_getattr_sz	(compound_decode_hdr_maxsz + \  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \ -				decode_getattr_maxsz) +				decode_getattr_maxsz + \ +				decode_renew_maxsz)  #define NFS4_enc_lookup_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ @@ -542,47 +622,37 @@ static int nfs4_stat_to_errno(int);  #define NFS4_enc_remove_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ -				encode_remove_maxsz + \ -				encode_getattr_maxsz) +				encode_remove_maxsz)  #define NFS4_dec_remove_sz	(compound_decode_hdr_maxsz + \  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \ -				decode_remove_maxsz + \ -				decode_getattr_maxsz) +				decode_remove_maxsz)  #define NFS4_enc_rename_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \  				encode_savefh_maxsz + \  				encode_putfh_maxsz + \ -				encode_rename_maxsz + \ -				encode_getattr_maxsz + \ -				encode_restorefh_maxsz + \ -				encode_getattr_maxsz) +				encode_rename_maxsz)  #define NFS4_dec_rename_sz	(compound_decode_hdr_maxsz + \  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \  				decode_savefh_maxsz + \  				decode_putfh_maxsz + \ -				decode_rename_maxsz + \ -				decode_getattr_maxsz + \ -				
decode_restorefh_maxsz + \ -				decode_getattr_maxsz) +				decode_rename_maxsz)  #define NFS4_enc_link_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \  				encode_savefh_maxsz + \  				encode_putfh_maxsz + \  				encode_link_maxsz + \ -				decode_getattr_maxsz + \  				encode_restorefh_maxsz + \ -				decode_getattr_maxsz) +				encode_getattr_maxsz)  #define NFS4_dec_link_sz	(compound_decode_hdr_maxsz + \  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \  				decode_savefh_maxsz + \  				decode_putfh_maxsz + \  				decode_link_maxsz + \ -				decode_getattr_maxsz + \  				decode_restorefh_maxsz + \  				decode_getattr_maxsz)  #define NFS4_enc_symlink_sz	(compound_encode_hdr_maxsz + \ @@ -600,20 +670,14 @@ static int nfs4_stat_to_errno(int);  #define NFS4_enc_create_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \  				encode_putfh_maxsz + \ -				encode_savefh_maxsz + \  				encode_create_maxsz + \  				encode_getfh_maxsz + \ -				encode_getattr_maxsz + \ -				encode_restorefh_maxsz + \  				encode_getattr_maxsz)  #define NFS4_dec_create_sz	(compound_decode_hdr_maxsz + \  				decode_sequence_maxsz + \  				decode_putfh_maxsz + \ -				decode_savefh_maxsz + \  				decode_create_maxsz + \  				decode_getfh_maxsz + \ -				decode_getattr_maxsz + \ -				decode_restorefh_maxsz + \  				decode_getattr_maxsz)  #define NFS4_enc_pathconf_sz	(compound_encode_hdr_maxsz + \  				encode_sequence_maxsz + \ @@ -669,14 +733,42 @@ static int nfs4_stat_to_errno(int);  				 encode_sequence_maxsz + \  				 encode_putfh_maxsz + \  				 encode_lookup_maxsz + \ -				 encode_fs_locations_maxsz) +				 encode_fs_locations_maxsz + \ +				 encode_renew_maxsz)  #define NFS4_dec_fs_locations_sz \  				(compound_decode_hdr_maxsz + \  				 decode_sequence_maxsz + \  				 decode_putfh_maxsz + \  				 decode_lookup_maxsz + \ -				 decode_fs_locations_maxsz) +				 decode_fs_locations_maxsz + \ +				 decode_renew_maxsz) +#define 
NFS4_enc_secinfo_sz 	(compound_encode_hdr_maxsz + \ +				encode_sequence_maxsz + \ +				encode_putfh_maxsz + \ +				encode_secinfo_maxsz) +#define NFS4_dec_secinfo_sz	(compound_decode_hdr_maxsz + \ +				decode_sequence_maxsz + \ +				decode_putfh_maxsz + \ +				decode_secinfo_maxsz) +#define NFS4_enc_fsid_present_sz \ +				(compound_encode_hdr_maxsz + \ +				 encode_sequence_maxsz + \ +				 encode_putfh_maxsz + \ +				 encode_getfh_maxsz + \ +				 encode_renew_maxsz) +#define NFS4_dec_fsid_present_sz \ +				(compound_decode_hdr_maxsz + \ +				 decode_sequence_maxsz + \ +				 decode_putfh_maxsz + \ +				 decode_getfh_maxsz + \ +				 decode_renew_maxsz)  #if defined(CONFIG_NFS_V4_1) +#define NFS4_enc_bind_conn_to_session_sz \ +				(compound_encode_hdr_maxsz + \ +				 encode_bind_conn_to_session_maxsz) +#define NFS4_dec_bind_conn_to_session_sz \ +				(compound_decode_hdr_maxsz + \ +				 decode_bind_conn_to_session_maxsz)  #define NFS4_enc_exchange_id_sz \  				(compound_encode_hdr_maxsz + \  				 encode_exchange_id_maxsz) @@ -693,6 +785,10 @@ static int nfs4_stat_to_errno(int);  					 encode_destroy_session_maxsz)  #define NFS4_dec_destroy_session_sz	(compound_decode_hdr_maxsz + \  					 decode_destroy_session_maxsz) +#define NFS4_enc_destroy_clientid_sz	(compound_encode_hdr_maxsz + \ +					 encode_destroy_clientid_maxsz) +#define NFS4_dec_destroy_clientid_sz	(compound_decode_hdr_maxsz + \ +					 decode_destroy_clientid_maxsz)  #define NFS4_enc_sequence_sz \  				(compound_decode_hdr_maxsz + \  				 encode_sequence_maxsz) @@ -713,6 +809,14 @@ static int nfs4_stat_to_errno(int);  #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \  					 decode_sequence_maxsz + \  					 decode_reclaim_complete_maxsz) +#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ +				encode_sequence_maxsz + \ +				encode_putfh_maxsz + \ +				encode_getdevicelist_maxsz) +#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ +				decode_sequence_maxsz + \ +		
		decode_putfh_maxsz + \ +				decode_getdevicelist_maxsz)  #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz +    \  				encode_sequence_maxsz +\  				encode_getdeviceinfo_maxsz) @@ -727,6 +831,44 @@ static int nfs4_stat_to_errno(int);  				decode_sequence_maxsz + \  				decode_putfh_maxsz +        \  				decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ +				encode_sequence_maxsz +\ +				encode_putfh_maxsz + \ +				encode_layoutcommit_maxsz + \ +				encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ +				decode_sequence_maxsz + \ +				decode_putfh_maxsz + \ +				decode_layoutcommit_maxsz + \ +				decode_getattr_maxsz) +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ +				encode_sequence_maxsz + \ +				encode_putfh_maxsz + \ +				encode_layoutreturn_maxsz) +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ +				decode_sequence_maxsz + \ +				decode_putfh_maxsz + \ +				decode_layoutreturn_maxsz) +#define NFS4_enc_secinfo_no_name_sz	(compound_encode_hdr_maxsz + \ +					encode_sequence_maxsz + \ +					encode_putrootfh_maxsz +\ +					encode_secinfo_no_name_maxsz) +#define NFS4_dec_secinfo_no_name_sz	(compound_decode_hdr_maxsz + \ +					decode_sequence_maxsz + \ +					decode_putrootfh_maxsz + \ +					decode_secinfo_no_name_maxsz) +#define NFS4_enc_test_stateid_sz	(compound_encode_hdr_maxsz + \ +					 encode_sequence_maxsz + \ +					 encode_test_stateid_maxsz) +#define NFS4_dec_test_stateid_sz	(compound_decode_hdr_maxsz + \ +					 decode_sequence_maxsz + \ +					 decode_test_stateid_maxsz) +#define NFS4_enc_free_stateid_sz	(compound_encode_hdr_maxsz + \ +					 encode_sequence_maxsz + \ +					 encode_free_stateid_maxsz) +#define NFS4_dec_free_stateid_sz	(compound_decode_hdr_maxsz + \ +					 decode_sequence_maxsz + \ +					 decode_free_stateid_maxsz)  const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +  				      compound_encode_hdr_maxsz + @@ 
-740,6 +882,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +  				     decode_sequence_maxsz +  				     decode_putfh_maxsz) *  				    XDR_UNIT); + +const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH + +					   compound_decode_hdr_maxsz + +					   decode_sequence_maxsz) * +					  XDR_UNIT); +EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);  #endif /* CONFIG_NFS_V4_1 */  static const umode_t nfs_type2fmt[] = { @@ -772,15 +920,44 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)  	return p;  } +static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) +{ +	__be32 *p; + +	p = xdr_reserve_space(xdr, len); +	xdr_encode_opaque_fixed(p, buf, len); +} +  static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)  {  	__be32 *p; -	p = xdr_reserve_space(xdr, 4 + len); -	BUG_ON(p == NULL); +	p = reserve_space(xdr, 4 + len);  	xdr_encode_opaque(p, str, len);  } +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ +	__be32 *p; + +	p = reserve_space(xdr, 4); +	*p = cpu_to_be32(n); +} + +static void encode_uint64(struct xdr_stream *xdr, u64 n) +{ +	__be32 *p; + +	p = reserve_space(xdr, 8); +	xdr_encode_hyper(p, n); +} + +static void encode_nfs4_seqid(struct xdr_stream *xdr, +		const struct nfs_seqid *seqid) +{ +	encode_uint32(xdr, seqid->sequence->counter); +} +  static void encode_compound_hdr(struct xdr_stream *xdr,  				struct rpc_rqst *req,  				struct compound_hdr *hdr) @@ -793,194 +970,189 @@ static void encode_compound_hdr(struct xdr_stream *xdr,  	 * but this is not required as a MUST for the server to do so. 
*/  	hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; -	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); -	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); -	p = reserve_space(xdr, 4 + hdr->taglen + 8); -	p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); +	WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); +	encode_string(xdr, hdr->taglen, hdr->tag); +	p = reserve_space(xdr, 8);  	*p++ = cpu_to_be32(hdr->minorversion);  	hdr->nops_p = p;  	*p = cpu_to_be32(hdr->nops);  } +static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op, +		uint32_t replen, +		struct compound_hdr *hdr) +{ +	encode_uint32(xdr, op); +	hdr->nops++; +	hdr->replen += replen; +} +  static void encode_nops(struct compound_hdr *hdr)  { -	BUG_ON(hdr->nops > NFS4_MAX_OPS); +	WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS);  	*hdr->nops_p = htonl(hdr->nops);  } -static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)  { -	__be32 *p; +	encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); +} -	p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); -	BUG_ON(p == NULL); -	xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); +static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ +	encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);  } -static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) +static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, +				const struct nfs4_label *label, +				const struct nfs_server *server)  {  	char owner_name[IDMAP_NAMESZ];  	char owner_group[IDMAP_NAMESZ];  	int owner_namelen = 0;  	int owner_grouplen = 0;  	__be32 *p; -	__be32 *q; -	int len; -	uint32_t bmval0 = 0; -	uint32_t bmval1 = 0; +	unsigned i; +	uint32_t len = 0; +	uint32_t bmval_len; +	uint32_t bmval[3] = { 0 };  	/*  	 * We reserve enough space to write the entire attribute 
buffer at once.  	 * In the worst-case, this would be -	 *   12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) -	 *          = 36 bytes, plus any contribution from variable-length fields +	 * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) +	 * = 40 bytes, plus any contribution from variable-length fields  	 *            such as owner/group.  	 */ -	len = 16; - -	/* Sigh */ -	if (iap->ia_valid & ATTR_SIZE) +	if (iap->ia_valid & ATTR_SIZE) { +		bmval[0] |= FATTR4_WORD0_SIZE;  		len += 8; -	if (iap->ia_valid & ATTR_MODE) +	} +	if (iap->ia_valid & ATTR_MODE) { +		bmval[1] |= FATTR4_WORD1_MODE;  		len += 4; +	}  	if (iap->ia_valid & ATTR_UID) { -		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); +		owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);  		if (owner_namelen < 0) {  			dprintk("nfs: couldn't resolve uid %d to string\n", -					iap->ia_uid); +					from_kuid(&init_user_ns, iap->ia_uid));  			/* XXX */  			strcpy(owner_name, "nobody");  			owner_namelen = sizeof("nobody") - 1;  			/* goto out; */  		} +		bmval[1] |= FATTR4_WORD1_OWNER;  		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);  	}  	if (iap->ia_valid & ATTR_GID) { -		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); +		owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);  		if (owner_grouplen < 0) {  			dprintk("nfs: couldn't resolve gid %d to string\n", -					iap->ia_gid); +					from_kgid(&init_user_ns, iap->ia_gid));  			strcpy(owner_group, "nobody");  			owner_grouplen = sizeof("nobody") - 1;  			/* goto out; */  		} +		bmval[1] |= FATTR4_WORD1_OWNER_GROUP;  		len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);  	} -	if (iap->ia_valid & ATTR_ATIME_SET) +	if (iap->ia_valid & ATTR_ATIME_SET) { +		bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;  		len += 16; -	else if (iap->ia_valid & ATTR_ATIME) +	} else if (iap->ia_valid 
& ATTR_ATIME) { +		bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET;  		len += 4; -	if (iap->ia_valid & ATTR_MTIME_SET) +	} +	if (iap->ia_valid & ATTR_MTIME_SET) { +		bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;  		len += 16; -	else if (iap->ia_valid & ATTR_MTIME) +	} else if (iap->ia_valid & ATTR_MTIME) { +		bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;  		len += 4; -	p = reserve_space(xdr, len); +	} +	if (label) { +		len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); +		bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; +	} -	/* -	 * We write the bitmap length now, but leave the bitmap and the attribute -	 * buffer length to be backfilled at the end of this routine. -	 */ -	*p++ = cpu_to_be32(2); -	q = p; -	p += 3; +	if (bmval[2] != 0) +		bmval_len = 3; +	else if (bmval[1] != 0) +		bmval_len = 2; +	else +		bmval_len = 1; -	if (iap->ia_valid & ATTR_SIZE) { -		bmval0 |= FATTR4_WORD0_SIZE; +	p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len); + +	*p++ = cpu_to_be32(bmval_len); +	for (i = 0; i < bmval_len; i++) +		*p++ = cpu_to_be32(bmval[i]); +	*p++ = cpu_to_be32(len); + +	if (bmval[0] & FATTR4_WORD0_SIZE)  		p = xdr_encode_hyper(p, iap->ia_size); -	} -	if (iap->ia_valid & ATTR_MODE) { -		bmval1 |= FATTR4_WORD1_MODE; +	if (bmval[1] & FATTR4_WORD1_MODE)  		*p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); -	} -	if (iap->ia_valid & ATTR_UID) { -		bmval1 |= FATTR4_WORD1_OWNER; +	if (bmval[1] & FATTR4_WORD1_OWNER)  		p = xdr_encode_opaque(p, owner_name, owner_namelen); -	} -	if (iap->ia_valid & ATTR_GID) { -		bmval1 |= FATTR4_WORD1_OWNER_GROUP; +	if (bmval[1] & FATTR4_WORD1_OWNER_GROUP)  		p = xdr_encode_opaque(p, owner_group, owner_grouplen); +	if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { +		if (iap->ia_valid & ATTR_ATIME_SET) { +			*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); +			p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); +			*p++ = cpu_to_be32(iap->ia_atime.tv_nsec); +		} else +			*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);  	} -	if (iap->ia_valid & ATTR_ATIME_SET) { -		bmval1 |= 
FATTR4_WORD1_TIME_ACCESS_SET; -		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); -		*p++ = cpu_to_be32(0); -		*p++ = cpu_to_be32(iap->ia_atime.tv_sec); -		*p++ = cpu_to_be32(iap->ia_atime.tv_nsec); -	} -	else if (iap->ia_valid & ATTR_ATIME) { -		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; -		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); -	} -	if (iap->ia_valid & ATTR_MTIME_SET) { -		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; -		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); -		*p++ = cpu_to_be32(0); -		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec); -		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); -	} -	else if (iap->ia_valid & ATTR_MTIME) { -		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; -		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); +	if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { +		if (iap->ia_valid & ATTR_MTIME_SET) { +			*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); +			p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); +			*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); +		} else +			*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);  	} - -	/* -	 * Now we backfill the bitmap and the attribute buffer length. 
-	 */ -	if (len != ((char *)p - (char *)q) + 4) { -		printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", -				len, ((char *)p - (char *)q) + 4); -		BUG(); +	if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { +		*p++ = cpu_to_be32(label->lfs); +		*p++ = cpu_to_be32(label->pi); +		*p++ = cpu_to_be32(label->len); +		p = xdr_encode_opaque_fixed(p, label->label, label->len);  	} -	len = (char *)p - (char *)q - 12; -	*q++ = htonl(bmval0); -	*q++ = htonl(bmval1); -	*q = htonl(len);  /* out: */  }  static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_ACCESS); -	*p = cpu_to_be32(access); -	hdr->nops++; -	hdr->replen += decode_access_maxsz; +	encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr); +	encode_uint32(xdr, access);  }  static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_CLOSE); -	*p++ = cpu_to_be32(arg->seqid->sequence->counter); -	xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); -	hdr->nops++; -	hdr->replen += decode_close_maxsz; +	encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); +	encode_nfs4_seqid(xdr, arg->seqid); +	encode_nfs4_stateid(xdr, arg->stateid);  } -static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 16); -	*p++ = cpu_to_be32(OP_COMMIT); +	encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); +	p = reserve_space(xdr, 12);  	p = xdr_encode_hyper(p, args->offset);  	*p = cpu_to_be32(args->count); -	hdr->nops++; -	hdr->replen += decode_commit_maxsz;  }  static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)  {  	
__be32 *p; -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_CREATE); -	*p = cpu_to_be32(create->ftype); +	encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr); +	encode_uint32(xdr, create->ftype);  	switch (create->ftype) {  	case NF4LNK: @@ -1000,47 +1172,82 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *  	}  	encode_string(xdr, create->name->len, create->name->name); -	hdr->nops++; -	hdr->replen += decode_create_maxsz; - -	encode_attrs(xdr, create->attrs, create->server); +	encode_attrs(xdr, create->attrs, create->label, create->server);  }  static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 12); -	*p++ = cpu_to_be32(OP_GETATTR); +	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); +	p = reserve_space(xdr, 8);  	*p++ = cpu_to_be32(1);  	*p = cpu_to_be32(bitmap); -	hdr->nops++; -	hdr->replen += decode_getattr_maxsz;  }  static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 16); -	*p++ = cpu_to_be32(OP_GETATTR); +	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); +	p = reserve_space(xdr, 12);  	*p++ = cpu_to_be32(2);  	*p++ = cpu_to_be32(bm0);  	*p = cpu_to_be32(bm1); -	hdr->nops++; -	hdr->replen += decode_getattr_maxsz; +} + +static void +encode_getattr_three(struct xdr_stream *xdr, +		     uint32_t bm0, uint32_t bm1, uint32_t bm2, +		     struct compound_hdr *hdr) +{ +	__be32 *p; + +	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); +	if (bm2) { +		p = reserve_space(xdr, 16); +		*p++ = cpu_to_be32(3); +		*p++ = cpu_to_be32(bm0); +		*p++ = cpu_to_be32(bm1); +		*p = cpu_to_be32(bm2); +	} else if (bm1) { +		p = reserve_space(xdr, 12); +		*p++ = cpu_to_be32(2); +		*p++ = cpu_to_be32(bm0); +		*p = cpu_to_be32(bm1); +	} else { +		p = reserve_space(xdr, 8); +		*p++ = cpu_to_be32(1); +		*p = cpu_to_be32(bm0); +	}  }  
static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)  { -	encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], -			   bitmask[1] & nfs4_fattr_bitmap[1], hdr); +	encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], +			   bitmask[1] & nfs4_fattr_bitmap[1], +			   bitmask[2] & nfs4_fattr_bitmap[2], +			   hdr); +} + +static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, +				 const u32 *open_bitmap, +				 struct compound_hdr *hdr) +{ +	encode_getattr_three(xdr, +			     bitmask[0] & open_bitmap[0], +			     bitmask[1] & open_bitmap[1], +			     bitmask[2] & open_bitmap[2], +			     hdr);  }  static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)  { -	encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], -			   bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); +	encode_getattr_three(xdr, +			     bitmask[0] & nfs4_fsinfo_bitmap[0], +			     bitmask[1] & nfs4_fsinfo_bitmap[1], +			     bitmask[2] & nfs4_fsinfo_bitmap[2], +			     hdr);  }  static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) @@ -1051,28 +1258,18 @@ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, stru  static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_GETFH); -	hdr->nops++; -	hdr->replen += decode_getfh_maxsz; +	encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr);  }  static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8 + name->len); -	*p++ = cpu_to_be32(OP_LINK); -	xdr_encode_opaque(p, name->name, name->len); -	hdr->nops++; -	hdr->replen += decode_link_maxsz; +	encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr); +	encode_string(xdr, name->len, name->name);  }  static inline int nfs4_lock_type(struct file_lock *fl, int 
block)  { -	if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK) +	if (fl->fl_type == F_RDLCK)  		return block ? NFS4_READW_LT : NFS4_READ_LT;  	return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT;  } @@ -1088,10 +1285,11 @@ static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lo  {  	__be32 *p; -	p = reserve_space(xdr, 28); +	p = reserve_space(xdr, 32);  	p = xdr_encode_hyper(p, lowner->clientid); -	*p++ = cpu_to_be32(16); +	*p++ = cpu_to_be32(20);  	p = xdr_encode_opaque_fixed(p, "lock id:", 8); +	*p++ = cpu_to_be32(lowner->s_dev);  	xdr_encode_hyper(p, lowner->id);  } @@ -1103,79 +1301,60 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args  {  	__be32 *p; -	p = reserve_space(xdr, 32); -	*p++ = cpu_to_be32(OP_LOCK); +	encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr); +	p = reserve_space(xdr, 28);  	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));  	*p++ = cpu_to_be32(args->reclaim);  	p = xdr_encode_hyper(p, args->fl->fl_start);  	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));  	*p = cpu_to_be32(args->new_lock_owner);  	if (args->new_lock_owner){ -		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); -		*p++ = cpu_to_be32(args->open_seqid->sequence->counter); -		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); -		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter); +		encode_nfs4_seqid(xdr, args->open_seqid); +		encode_nfs4_stateid(xdr, args->open_stateid); +		encode_nfs4_seqid(xdr, args->lock_seqid);  		encode_lockowner(xdr, &args->lock_owner);  	}  	else { -		p = reserve_space(xdr, NFS4_STATEID_SIZE+4); -		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); -		*p = cpu_to_be32(args->lock_seqid->sequence->counter); +		encode_nfs4_stateid(xdr, args->lock_stateid); +		encode_nfs4_seqid(xdr, args->lock_seqid);  	} -	hdr->nops++; -	hdr->replen += decode_lock_maxsz;  }  static void encode_lockt(struct xdr_stream *xdr, const struct 
nfs_lockt_args *args, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 24); -	*p++ = cpu_to_be32(OP_LOCKT); +	encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr); +	p = reserve_space(xdr, 20);  	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));  	p = xdr_encode_hyper(p, args->fl->fl_start);  	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));  	encode_lockowner(xdr, &args->lock_owner); -	hdr->nops++; -	hdr->replen += decode_lockt_maxsz;  }  static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); -	*p++ = cpu_to_be32(OP_LOCKU); -	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); -	*p++ = cpu_to_be32(args->seqid->sequence->counter); -	p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); +	encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); +	encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); +	encode_nfs4_seqid(xdr, args->seqid); +	encode_nfs4_stateid(xdr, args->stateid); +	p = reserve_space(xdr, 16);  	p = xdr_encode_hyper(p, args->fl->fl_start);  	xdr_encode_hyper(p, nfs4_lock_length(args->fl)); -	hdr->nops++; -	hdr->replen += decode_locku_maxsz;  }  static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_RELEASE_LOCKOWNER); +	encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr);  	encode_lockowner(xdr, lowner); -	hdr->nops++; -	hdr->replen += decode_release_lockowner_maxsz;  }  static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)  { -	int len = name->len; -	__be32 *p; - -	p = reserve_space(xdr, 8 + len); -	*p++ = cpu_to_be32(OP_LOOKUP); -	xdr_encode_opaque(p, name->name, len); -	hdr->nops++; -	hdr->replen += decode_lookup_maxsz; +	encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr); +	
encode_string(xdr, name->len, name->name);  }  static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) @@ -1206,46 +1385,41 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena   * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,   * owner 4 = 32   */ -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_OPEN); -	*p = cpu_to_be32(arg->seqid->sequence->counter); +	encode_nfs4_seqid(xdr, arg->seqid);  	encode_share_access(xdr, arg->fmode); -	p = reserve_space(xdr, 28); +	p = reserve_space(xdr, 36);  	p = xdr_encode_hyper(p, arg->clientid); -	*p++ = cpu_to_be32(16); +	*p++ = cpu_to_be32(24);  	p = xdr_encode_opaque_fixed(p, "open id:", 8); -	xdr_encode_hyper(p, arg->id); +	*p++ = cpu_to_be32(arg->server->s_dev); +	*p++ = cpu_to_be32(arg->id.uniquifier); +	xdr_encode_hyper(p, arg->id.create_time);  }  static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)  { +	struct iattr dummy;  	__be32 *p; -	struct nfs_client *clp;  	p = reserve_space(xdr, 4); -	switch(arg->open_flags & O_EXCL) { -	case 0: +	switch(arg->createmode) { +	case NFS4_CREATE_UNCHECKED:  		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED); -		encode_attrs(xdr, arg->u.attrs, arg->server); +		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);  		break; -	default: -		clp = arg->server->nfs_client; -		if (clp->cl_mvops->minor_version > 0) { -			if (nfs4_has_persistent_session(clp)) { -				*p = cpu_to_be32(NFS4_CREATE_GUARDED); -				encode_attrs(xdr, arg->u.attrs, arg->server); -			} else { -				struct iattr dummy; - -				*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); -				encode_nfs4_verifier(xdr, &arg->u.verifier); -				dummy.ia_valid = 0; -				encode_attrs(xdr, &dummy, arg->server); -			} -		} else { -			*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); -			encode_nfs4_verifier(xdr, &arg->u.verifier); -		} +	case NFS4_CREATE_GUARDED: +		*p = cpu_to_be32(NFS4_CREATE_GUARDED); +		encode_attrs(xdr, arg->u.attrs, 
arg->label, arg->server); +		break; +	case NFS4_CREATE_EXCLUSIVE: +		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); +		encode_nfs4_verifier(xdr, &arg->u.verifier); +		break; +	case NFS4_CREATE_EXCLUSIVE4_1: +		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); +		encode_nfs4_verifier(xdr, &arg->u.verifier); +		dummy.ia_valid = 0; +		encode_attrs(xdr, &dummy, arg->label, arg->server);  	}  } @@ -1259,7 +1433,6 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a  		*p = cpu_to_be32(NFS4_OPEN_NOCREATE);  		break;  	default: -		BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);  		*p = cpu_to_be32(NFS4_OPEN_CREATE);  		encode_createmode(xdr, arg);  	} @@ -1307,14 +1480,32 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc  {  	__be32 *p; -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); -	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); +	p = reserve_space(xdr, 4); +	*p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); +	encode_nfs4_stateid(xdr, stateid);  	encode_string(xdr, name->len, name->name);  } +static inline void encode_claim_fh(struct xdr_stream *xdr) +{ +	__be32 *p; + +	p = reserve_space(xdr, 4); +	*p = cpu_to_be32(NFS4_OPEN_CLAIM_FH); +} + +static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid) +{ +	__be32 *p; + +	p = reserve_space(xdr, 4); +	*p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH); +	encode_nfs4_stateid(xdr, stateid); +} +  static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)  { +	encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);  	encode_openhdr(xdr, arg);  	encode_opentype(xdr, arg);  	switch (arg->claim) { @@ -1327,240 +1518,172 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,  	case NFS4_OPEN_CLAIM_DELEGATE_CUR:  		encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);  		break; +	case 
NFS4_OPEN_CLAIM_FH: +		encode_claim_fh(xdr); +		break; +	case NFS4_OPEN_CLAIM_DELEG_CUR_FH: +		encode_claim_delegate_cur_fh(xdr, &arg->u.delegation); +		break;  	default:  		BUG();  	} -	hdr->nops++; -	hdr->replen += decode_open_maxsz;  }  static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); -	*p++ = cpu_to_be32(OP_OPEN_CONFIRM); -	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); -	*p = cpu_to_be32(arg->seqid->sequence->counter); -	hdr->nops++; -	hdr->replen += decode_open_confirm_maxsz; +	encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr); +	encode_nfs4_stateid(xdr, arg->stateid); +	encode_nfs4_seqid(xdr, arg->seqid);  }  static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); -	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); -	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); -	*p = cpu_to_be32(arg->seqid->sequence->counter); +	encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); +	encode_nfs4_stateid(xdr, arg->stateid); +	encode_nfs4_seqid(xdr, arg->seqid);  	encode_share_access(xdr, arg->fmode); -	hdr->nops++; -	hdr->replen += decode_open_downgrade_maxsz;  }  static void  encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)  { -	int len = fh->size; -	__be32 *p; - -	p = reserve_space(xdr, 8 + len); -	*p++ = cpu_to_be32(OP_PUTFH); -	xdr_encode_opaque(p, fh->data, len); -	hdr->nops++; -	hdr->replen += decode_putfh_maxsz; +	encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr); +	encode_string(xdr, fh->size, fh->data);  }  static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_PUTROOTFH); -	hdr->nops++; -	
hdr->replen += decode_putrootfh_maxsz; +	encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);  } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) +static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args, +			struct compound_hdr *hdr)  { -	nfs4_stateid stateid;  	__be32 *p; -	p = reserve_space(xdr, NFS4_STATEID_SIZE); -	if (ctx->state != NULL) { -		nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); -		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); -	} else -		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); -} - -static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) -{ -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_READ); - -	encode_stateid(xdr, args->context, args->lock_context); +	encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); +	encode_nfs4_stateid(xdr, &args->stateid);  	p = reserve_space(xdr, 12);  	p = xdr_encode_hyper(p, args->offset);  	*p = cpu_to_be32(args->count); -	hdr->nops++; -	hdr->replen += decode_read_maxsz;  }  static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)  { -	uint32_t attrs[2] = {0, 0}; +	uint32_t attrs[3] = { +		FATTR4_WORD0_RDATTR_ERROR, +		FATTR4_WORD1_MOUNTED_ON_FILEID, +	};  	uint32_t dircount = readdir->count >> 1; -	__be32 *p; +	__be32 *p, verf[2]; +	uint32_t attrlen = 0; +	unsigned int i;  	if (readdir->plus) {  		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| -			FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE; +			FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID;  		attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|  			FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|  			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|  			
FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; +		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;  		dircount >>= 1;  	} -	attrs[0] |= FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID; -	attrs[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; -	/* Switch to mounted_on_fileid if the server supports it */ -	if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) -		attrs[0] &= ~FATTR4_WORD0_FILEID; -	else -		attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; +	/* Use mounted_on_fileid only if the server supports it */ +	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) +		attrs[0] |= FATTR4_WORD0_FILEID; +	for (i = 0; i < ARRAY_SIZE(attrs); i++) { +		attrs[i] &= readdir->bitmask[i]; +		if (attrs[i] != 0) +			attrlen = i+1; +	} -	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); -	*p++ = cpu_to_be32(OP_READDIR); -	p = xdr_encode_hyper(p, readdir->cookie); -	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); +	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); +	encode_uint64(xdr, readdir->cookie); +	encode_nfs4_verifier(xdr, &readdir->verifier); +	p = reserve_space(xdr, 12 + (attrlen << 2));  	*p++ = cpu_to_be32(dircount);  	*p++ = cpu_to_be32(readdir->count); -	*p++ = cpu_to_be32(2); +	*p++ = cpu_to_be32(attrlen); +	for (i = 0; i < attrlen; i++) +		*p++ = cpu_to_be32(attrs[i]); +	memcpy(verf, readdir->verifier.data, sizeof(verf)); -	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); -	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); -	hdr->nops++; -	hdr->replen += decode_readdir_maxsz; -	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", +	dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",  			__func__,  			(unsigned long long)readdir->cookie, -			((u32 *)readdir->verifier.data)[0], -			((u32 *)readdir->verifier.data)[1], +			verf[0], verf[1],  			attrs[0] & readdir->bitmask[0], -			attrs[1] & readdir->bitmask[1]); +			attrs[1] & readdir->bitmask[1], +			attrs[2] & readdir->bitmask[2]);  }  
static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_READLINK); -	hdr->nops++; -	hdr->replen += decode_readlink_maxsz; +	encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr);  }  static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8 + name->len); -	*p++ = cpu_to_be32(OP_REMOVE); -	xdr_encode_opaque(p, name->name, name->len); -	hdr->nops++; -	hdr->replen += decode_remove_maxsz; +	encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr); +	encode_string(xdr, name->len, name->name);  }  static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_RENAME); +	encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr);  	encode_string(xdr, oldname->len, oldname->name);  	encode_string(xdr, newname->len, newname->name); -	hdr->nops++; -	hdr->replen += decode_rename_maxsz;  } -static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) +static void encode_renew(struct xdr_stream *xdr, clientid4 clid, +			 struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 12); -	*p++ = cpu_to_be32(OP_RENEW); -	xdr_encode_hyper(p, client_stateid->cl_clientid); -	hdr->nops++; -	hdr->replen += decode_renew_maxsz; +	encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr); +	encode_uint64(xdr, clid);  }  static void  encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_RESTOREFH); -	hdr->nops++; -	hdr->replen += decode_restorefh_maxsz; +	encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);  } -static int +static void  encode_setacl(struct xdr_stream 
*xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_SETATTR); -	xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); +	encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); +	encode_nfs4_stateid(xdr, &zero_stateid);  	p = reserve_space(xdr, 2*4);  	*p++ = cpu_to_be32(1);  	*p = cpu_to_be32(FATTR4_WORD0_ACL); -	if (arg->acl_len % 4) -		return -EINVAL;  	p = reserve_space(xdr, 4);  	*p = cpu_to_be32(arg->acl_len);  	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); -	hdr->nops++; -	hdr->replen += decode_setacl_maxsz; -	return 0;  }  static void  encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_SAVEFH); -	hdr->nops++; -	hdr->replen += decode_savefh_maxsz; +	encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr);  }  static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_SETATTR); -	xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); -	hdr->nops++; -	hdr->replen += decode_setattr_maxsz; -	encode_attrs(xdr, arg->iap, server); +	encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); +	encode_nfs4_stateid(xdr, &arg->stateid); +	encode_attrs(xdr, arg->iap, arg->label, server);  }  static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); -	*p++ = cpu_to_be32(OP_SETCLIENTID); -	xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); +	encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); +	encode_nfs4_verifier(xdr, setclientid->sc_verifier);  	encode_string(xdr, setclientid->sc_name_len, 
setclientid->sc_name);  	p = reserve_space(xdr, 4); @@ -1569,30 +1692,23 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie  	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);  	p = reserve_space(xdr, 4);  	*p = cpu_to_be32(setclientid->sc_cb_ident); -	hdr->nops++; -	hdr->replen += decode_setclientid_maxsz;  }  static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); -	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); -	p = xdr_encode_hyper(p, arg->clientid); -	xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); -	hdr->nops++; -	hdr->replen += decode_setclientid_confirm_maxsz; +	encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM, +			decode_setclientid_confirm_maxsz, hdr); +	encode_uint64(xdr, arg->clientid); +	encode_nfs4_verifier(xdr, &arg->confirm);  } -static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args, +			 struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_WRITE); - -	encode_stateid(xdr, args->context, args->lock_context); +	encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); +	encode_nfs4_stateid(xdr, &args->stateid);  	p = reserve_space(xdr, 16);  	p = xdr_encode_hyper(p, args->offset); @@ -1600,42 +1716,93 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg  	*p = cpu_to_be32(args->count);  	xdr_write_pages(xdr, args->pages, args->pgbase, args->count); -	hdr->nops++; -	hdr->replen += decode_write_maxsz;  }  static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); +	encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr); +	
encode_nfs4_stateid(xdr, stateid); +} -	*p++ = cpu_to_be32(OP_DELEGRETURN); -	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); -	hdr->nops++; -	hdr->replen += decode_delegreturn_maxsz; +static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ +	encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr); +	encode_string(xdr, name->len, name->name);  }  #if defined(CONFIG_NFS_V4_1)  /* NFSv4.1 operations */ +static void encode_bind_conn_to_session(struct xdr_stream *xdr, +				   struct nfs4_session *session, +				   struct compound_hdr *hdr) +{ +	__be32 *p; + +	encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION, +		decode_bind_conn_to_session_maxsz, hdr); +	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); +	p = xdr_reserve_space(xdr, 8); +	*p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH); +	*p = 0;	/* use_conn_in_rdma_mode = False */ +} + +static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) +{ +	unsigned int i; +	encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS); +	for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) +		encode_uint32(xdr, op_map->u.words[i]); +} +  static void encode_exchange_id(struct xdr_stream *xdr,  			       struct nfs41_exchange_id_args *args,  			       struct compound_hdr *hdr)  {  	__be32 *p; +	char impl_name[IMPL_NAME_LIMIT]; +	int len = 0; -	p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); -	*p++ = cpu_to_be32(OP_EXCHANGE_ID); -	xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); +	encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); +	encode_nfs4_verifier(xdr, args->verifier);  	encode_string(xdr, args->id_len, args->id); -	p = reserve_space(xdr, 12); -	*p++ = cpu_to_be32(args->flags); -	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */ -	*p = cpu_to_be32(0);	/* zero length implementation id array */ -	hdr->nops++; -	hdr->replen += decode_exchange_id_maxsz; +	encode_uint32(xdr, args->flags); +	
encode_uint32(xdr, args->state_protect.how); + +	switch (args->state_protect.how) { +	case SP4_NONE: +		break; +	case SP4_MACH_CRED: +		encode_op_map(xdr, &args->state_protect.enforce); +		encode_op_map(xdr, &args->state_protect.allow); +		break; +	default: +		WARN_ON_ONCE(1); +		break; +	} + +	if (send_implementation_id && +	    sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && +	    sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) +		<= sizeof(impl_name) + 1) +		len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", +			       utsname()->sysname, utsname()->release, +			       utsname()->version, utsname()->machine); + +	if (len > 0) { +		encode_uint32(xdr, 1);	/* implementation id array length=1 */ + +		encode_string(xdr, +			sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1, +			CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN); +		encode_string(xdr, len, impl_name); +		/* just send zeros for nii_date - the date is in nii_name */ +		p = reserve_space(xdr, 12); +		p = xdr_encode_hyper(p, 0); +		*p = cpu_to_be32(0); +	} else +		encode_uint32(xdr, 0);	/* implementation id array length=0 */  }  static void encode_create_session(struct xdr_stream *xdr, @@ -1646,6 +1813,7 @@ static void encode_create_session(struct xdr_stream *xdr,  	char machine_name[NFS4_MAX_MACHINE_NAME_LEN];  	uint32_t len;  	struct nfs_client *clp = args->client; +	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);  	u32 max_resp_sz_cached;  	/* @@ -1658,14 +1826,14 @@ static void encode_create_session(struct xdr_stream *xdr,  	len = scnprintf(machine_name, sizeof(machine_name), "%s",  			clp->cl_ipaddr); -	p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); -	*p++ = cpu_to_be32(OP_CREATE_SESSION); -	p = xdr_encode_hyper(p, clp->cl_ex_clid); +	encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); +	p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12); +	p = xdr_encode_hyper(p, clp->cl_clientid);  	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */  	*p++ = 
cpu_to_be32(args->flags);			/*flags */  	/* Fore Channel */ -	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */ +	*p++ = cpu_to_be32(0);				/* header padding size */  	*p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz);	/* max req size */  	*p++ = cpu_to_be32(args->fc_attrs.max_resp_sz);	/* max resp size */  	*p++ = cpu_to_be32(max_resp_sz_cached);		/* Max resp sz cached */ @@ -1674,7 +1842,7 @@ static void encode_create_session(struct xdr_stream *xdr,  	*p++ = cpu_to_be32(0);				/* rdmachannel_attrs */  	/* Back Channel */ -	*p++ = cpu_to_be32(args->fc_attrs.headerpadsz);	/* header padding size */ +	*p++ = cpu_to_be32(0);				/* header padding size */  	*p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz);	/* max req size */  	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz);	/* max resp size */  	*p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached);	/* Max resp sz cached */ @@ -1687,38 +1855,35 @@ static void encode_create_session(struct xdr_stream *xdr,  	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */  	/* authsys_parms rfc1831 */ -	*p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec);	/* stamp */ +	*p++ = cpu_to_be32(nn->boot_time.tv_nsec);	/* stamp */  	p = xdr_encode_opaque(p, machine_name, len);  	*p++ = cpu_to_be32(0);				/* UID */  	*p++ = cpu_to_be32(0);				/* GID */  	*p = cpu_to_be32(0);				/* No more gids */ -	hdr->nops++; -	hdr->replen += decode_create_session_maxsz;  }  static void encode_destroy_session(struct xdr_stream *xdr,  				   struct nfs4_session *session,  				   struct compound_hdr *hdr)  { -	__be32 *p; -	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); -	*p++ = cpu_to_be32(OP_DESTROY_SESSION); -	xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); -	hdr->nops++; -	hdr->replen += decode_destroy_session_maxsz; +	encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr); +	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); +} + +static void 
encode_destroy_clientid(struct xdr_stream *xdr, +				   uint64_t clientid, +				   struct compound_hdr *hdr) +{ +	encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr); +	encode_uint64(xdr, clientid);  }  static void encode_reclaim_complete(struct xdr_stream *xdr,  				    struct nfs41_reclaim_complete_args *args,  				    struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_RECLAIM_COMPLETE); -	*p++ = cpu_to_be32(args->one_fs); -	hdr->nops++; -	hdr->replen += decode_reclaim_complete_maxsz; +	encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr); +	encode_uint32(xdr, args->one_fs);  }  #endif /* CONFIG_NFS_V4_1 */ @@ -1727,21 +1892,17 @@ static void encode_sequence(struct xdr_stream *xdr,  			    struct compound_hdr *hdr)  {  #if defined(CONFIG_NFS_V4_1) -	struct nfs4_session *session = args->sa_session; +	struct nfs4_session *session;  	struct nfs4_slot_table *tp; -	struct nfs4_slot *slot; +	struct nfs4_slot *slot = args->sa_slot;  	__be32 *p; +	tp = slot->table; +	session = tp->session;  	if (!session)  		return; -	tp = &session->fc_slot_table; - -	WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); -	slot = tp->slots + args->sa_slotid; - -	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); -	*p++ = cpu_to_be32(OP_SEQUENCE); +	encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);  	/*  	 * Sessionid + seqid + slotid + max slotid + cache_this @@ -1753,35 +1914,50 @@ static void encode_sequence(struct xdr_stream *xdr,  		((u32 *)session->sess_id.data)[1],  		((u32 *)session->sess_id.data)[2],  		((u32 *)session->sess_id.data)[3], -		slot->seq_nr, args->sa_slotid, +		slot->seq_nr, slot->slot_nr,  		tp->highest_used_slotid, args->sa_cache_this); +	p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);  	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);  	*p++ = cpu_to_be32(slot->seq_nr); -	*p++ = cpu_to_be32(args->sa_slotid); +	*p++ = 
cpu_to_be32(slot->slot_nr);  	*p++ = cpu_to_be32(tp->highest_used_slotid);  	*p = cpu_to_be32(args->sa_cache_this); -	hdr->nops++; -	hdr->replen += decode_sequence_maxsz;  #endif /* CONFIG_NFS_V4_1 */  }  #ifdef CONFIG_NFS_V4_1  static void +encode_getdevicelist(struct xdr_stream *xdr, +		     const struct nfs4_getdevicelist_args *args, +		     struct compound_hdr *hdr) +{ +	__be32 *p; +	nfs4_verifier dummy = { +		.data = "dummmmmy", +	}; + +	encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr); +	p = reserve_space(xdr, 16); +	*p++ = cpu_to_be32(args->layoutclass); +	*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); +	xdr_encode_hyper(p, 0ULL);                          /* cookie */ +	encode_nfs4_verifier(xdr, &dummy); +} + +static void  encode_getdeviceinfo(struct xdr_stream *xdr,  		     const struct nfs4_getdeviceinfo_args *args,  		     struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); -	*p++ = cpu_to_be32(OP_GETDEVICEINFO); +	encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); +	p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);  	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,  				    NFS4_DEVICEID4_SIZE);  	*p++ = cpu_to_be32(args->pdev->layout_type); -	*p++ = cpu_to_be32(args->pdev->pglen);		/* gdia_maxcount */ +	*p++ = cpu_to_be32(args->pdev->maxcount);	/* gdia_maxcount */  	*p++ = cpu_to_be32(0);				/* bitmap length 0 */ -	hdr->nops++; -	hdr->replen += decode_getdeviceinfo_maxsz;  }  static void @@ -1789,21 +1965,18 @@ encode_layoutget(struct xdr_stream *xdr,  		      const struct nfs4_layoutget_args *args,  		      struct compound_hdr *hdr)  { -	nfs4_stateid stateid;  	__be32 *p; -	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_LAYOUTGET); +	encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr); +	p = reserve_space(xdr, 36);  	*p++ = cpu_to_be32(0);     /* Signal layout available */  	*p++ = cpu_to_be32(args->type);  	*p++ = 
cpu_to_be32(args->range.iomode);  	p = xdr_encode_hyper(p, args->range.offset);  	p = xdr_encode_hyper(p, args->range.length);  	p = xdr_encode_hyper(p, args->minlength); -	pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout, -				args->ctx->state); -	p = xdr_encode_opaque_fixed(p, &stateid.data, NFS4_STATEID_SIZE); -	*p = cpu_to_be32(args->maxcount); +	encode_nfs4_stateid(xdr, &args->stateid); +	encode_uint32(xdr, args->maxcount);  	dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",  		__func__, @@ -1812,8 +1985,92 @@ encode_layoutget(struct xdr_stream *xdr,  		(unsigned long)args->range.offset,  		(unsigned long)args->range.length,  		args->maxcount); -	hdr->nops++; -	hdr->replen += decode_layoutget_maxsz; +} + +static int +encode_layoutcommit(struct xdr_stream *xdr, +		    struct inode *inode, +		    const struct nfs4_layoutcommit_args *args, +		    struct compound_hdr *hdr) +{ +	__be32 *p; + +	dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, +		NFS_SERVER(args->inode)->pnfs_curr_ld->id); + +	encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr); +	p = reserve_space(xdr, 20); +	/* Only whole file layouts */ +	p = xdr_encode_hyper(p, 0); /* offset */ +	p = xdr_encode_hyper(p, args->lastbytewritten + 1);	/* length */ +	*p = cpu_to_be32(0); /* reclaim */ +	encode_nfs4_stateid(xdr, &args->stateid); +	p = reserve_space(xdr, 20); +	*p++ = cpu_to_be32(1); /* newoffset = TRUE */ +	p = xdr_encode_hyper(p, args->lastbytewritten); +	*p++ = cpu_to_be32(0); /* Never send time_modify_changed */ +	*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ + +	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) +		NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( +			NFS_I(inode)->layout, xdr, args); +	else +		encode_uint32(xdr, 0); /* no layout-type payload */ + +	return 0; +} + +static void +encode_layoutreturn(struct xdr_stream *xdr, +		    const struct nfs4_layoutreturn_args *args, +		    struct 
compound_hdr *hdr) +{ +	__be32 *p; + +	encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); +	p = reserve_space(xdr, 16); +	*p++ = cpu_to_be32(0);		/* reclaim. always 0 for now */ +	*p++ = cpu_to_be32(args->layout_type); +	*p++ = cpu_to_be32(IOMODE_ANY); +	*p = cpu_to_be32(RETURN_FILE); +	p = reserve_space(xdr, 16); +	p = xdr_encode_hyper(p, 0); +	p = xdr_encode_hyper(p, NFS4_MAX_UINT64); +	spin_lock(&args->inode->i_lock); +	encode_nfs4_stateid(xdr, &args->stateid); +	spin_unlock(&args->inode->i_lock); +	if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { +		NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( +			NFS_I(args->inode)->layout, xdr, args); +	} else +		encode_uint32(xdr, 0); +} + +static int +encode_secinfo_no_name(struct xdr_stream *xdr, +		       const struct nfs41_secinfo_no_name_args *args, +		       struct compound_hdr *hdr) +{ +	encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr); +	encode_uint32(xdr, args->style); +	return 0; +} + +static void encode_test_stateid(struct xdr_stream *xdr, +				struct nfs41_test_stateid_args *args, +				struct compound_hdr *hdr) +{ +	encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr); +	encode_uint32(xdr, 1); +	encode_nfs4_stateid(xdr, args->stateid); +} + +static void encode_free_stateid(struct xdr_stream *xdr, +				struct nfs41_free_stateid_args *args, +				struct compound_hdr *hdr) +{ +	encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); +	encode_nfs4_stateid(xdr, &args->stateid);  }  #endif /* CONFIG_NFS_V4_1 */ @@ -1824,8 +2081,9 @@ encode_layoutget(struct xdr_stream *xdr,  static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)  {  #if defined(CONFIG_NFS_V4_1) -	if (args->sa_session) -		return args->sa_session->clp->cl_mvops->minor_version; +	struct nfs4_session *session = args->sa_slot->table->session; +	if (session) +		return session->clp->cl_mvops->minor_version;  #endif /* CONFIG_NFS_V4_1 */  	
return 0;  } @@ -1833,393 +2091,355 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)  /*   * Encode an ACCESS request   */ -static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args) +static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, +				const struct nfs4_accessargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_access(&xdr, args->access, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_access(xdr, args->access, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode LOOKUP request   */ -static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args) +static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr, +				const struct nfs4_lookup_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->dir_fh, &hdr); -	encode_lookup(&xdr, args->name, &hdr); -	encode_getfh(&xdr, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->dir_fh, &hdr); +	encode_lookup(xdr, args->name, &hdr); +	encode_getfh(xdr, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode LOOKUP_ROOT request   */ -static int 
nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args) +static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs4_lookup_root_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putrootfh(&xdr, &hdr); -	encode_getfh(&xdr, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putrootfh(xdr, &hdr); +	encode_getfh(xdr, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode REMOVE request   */ -static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, +				const struct nfs_removeargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_remove(&xdr, &args->name, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_remove(xdr, &args->name, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode RENAME request   */ -static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs_renameargs *args) +static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, +				const struct nfs_renameargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = 
nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->old_dir, &hdr); -	encode_savefh(&xdr, &hdr); -	encode_putfh(&xdr, args->new_dir, &hdr); -	encode_rename(&xdr, args->old_name, args->new_name, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); -	encode_restorefh(&xdr, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->old_dir, &hdr); +	encode_savefh(xdr, &hdr); +	encode_putfh(xdr, args->new_dir, &hdr); +	encode_rename(xdr, args->old_name, args->new_name, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode LINK request   */ -static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args) +static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, +			     const struct nfs4_link_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_savefh(&xdr, &hdr); -	encode_putfh(&xdr, args->dir_fh, &hdr); -	encode_link(&xdr, args->name, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); -	encode_restorefh(&xdr, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_savefh(xdr, &hdr); +	encode_putfh(xdr, args->dir_fh, &hdr); +	encode_link(xdr, args->name, &hdr); +	encode_restorefh(xdr, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode CREATE request   */ -static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const 
struct nfs4_create_arg *args) +static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr, +				const struct nfs4_create_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->dir_fh, &hdr); -	encode_savefh(&xdr, &hdr); -	encode_create(&xdr, args, &hdr); -	encode_getfh(&xdr, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); -	encode_restorefh(&xdr, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->dir_fh, &hdr); +	encode_create(xdr, args, &hdr); +	encode_getfh(xdr, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode SYMLINK request   */ -static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) +static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr, +				 const struct nfs4_create_arg *args)  { -	return nfs4_xdr_enc_create(req, p, args); +	nfs4_xdr_enc_create(req, xdr, args);  }  /*   * Encode GETATTR request   */ -static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args) +static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, +				 const struct nfs4_getattr_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, 
&hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a CLOSE request   */ -static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) +static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, +			       struct nfs_closeargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_close(&xdr, args, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_close(xdr, args, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode an OPEN request   */ -static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, +			      struct nfs_openargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_savefh(&xdr, &hdr); -	encode_open(&xdr, args, &hdr); -	encode_getfh(&xdr, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); -	encode_restorefh(&xdr, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_open(xdr, args, &hdr); +	encode_getfh(xdr, &hdr); +	if (args->access) +		encode_access(xdr, args->access, &hdr); +	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, 
&hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode an OPEN_CONFIRM request   */ -static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args) +static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      struct nfs_open_confirmargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.nops   = 0,  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_open_confirm(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_open_confirm(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode an OPEN request with no attributes.   */ -static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     struct nfs_openargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_open(&xdr, args, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_open(xdr, args, &hdr); +	if (args->access) +		encode_access(xdr, args->access, &hdr); +	encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode an OPEN_DOWNGRADE request   */ -static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) +static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, +					struct xdr_stream *xdr, +					struct nfs_closeargs *args)  { -	struct 
xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_open_downgrade(&xdr, args, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_open_downgrade(xdr, args, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a LOCK request   */ -static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args) +static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr, +			      struct nfs_lock_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_lock(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_lock(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a LOCKT request   */ -static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args) +static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr, +			       struct nfs_lockt_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_lockt(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	
encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_lockt(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a LOCKU request   */ -static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args) +static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr, +			       struct nfs_locku_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_locku(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_locku(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  } -static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) +static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, +					   struct xdr_stream *xdr, +					struct nfs_release_lockowner_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = 0,  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_release_lockowner(&xdr, &args->lock_owner, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_release_lockowner(xdr, &args->lock_owner, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a READLINK request   */ -static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args) +static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, +				  const struct nfs4_readlink *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	
encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_readlink(&xdr, args, req, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_readlink(xdr, args, req, &hdr);  	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,  			args->pgbase, args->pglen);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a READDIR request   */ -static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args) +static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, +				 const struct nfs4_readdir_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_readdir(&xdr, args, req, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_readdir(xdr, args, req, &hdr);  	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages,  			 args->pgbase, args->count); @@ -2227,428 +2447,483 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf  			__func__, hdr.replen << 2, args->pages,  			args->pgbase, args->count);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a READ request   */ -static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, +			      struct nfs_pgio_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	
encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_read(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_read(xdr, args, &hdr);  	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,  			 args->pages, args->pgbase, args->count);  	req->rq_rcv_buf.flags |= XDRBUF_READ;  	encode_nops(&hdr); -	return 0;  }  /*   * Encode an SETATTR request   */ -static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) +static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr, +				 struct nfs_setattrargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_setattr(&xdr, args, args->server, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_setattr(xdr, args, args->server, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a GETACL request   */ -static int -nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, -		struct nfs_getaclargs *args) +static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, +				struct nfs_getaclargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	};  	uint32_t replen; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; -	
encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	replen = hdr.replen + op_decode_hdr_maxsz + 1; +	encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);  	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,  		args->acl_pages, args->acl_pgbase, args->acl_len); +  	encode_nops(&hdr); -	return 0;  }  /*   * Encode a WRITE request   */ -static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, +			       struct nfs_pgio_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_write(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_write(xdr, args, &hdr);  	req->rq_snd_buf.flags |= XDRBUF_WRITE; -	encode_getfattr(&xdr, args->bitmask, &hdr); +	if (args->bitmask) +		encode_getfattr(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   *  a COMMIT request   */ -static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, +				struct nfs_commitargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_commit(&xdr, args, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	
encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_commit(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * FSINFO request   */ -static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args) +static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, +				struct nfs4_fsinfo_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_fsinfo(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_fsinfo(xdr, args->bitmask, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a PATHCONF request   */ -static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args) +static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, +				  const struct nfs4_pathconf_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],  			   &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a STATFS request   */ -static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args) +static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct 
xdr_stream *xdr, +				const struct nfs4_statfs_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],  			   args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * GETATTR_BITMAP request   */ -static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, -				    struct nfs4_server_caps_arg *args) +static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     struct nfs4_server_caps_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fhandle, &hdr); -	encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fhandle, &hdr); +	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| +			   FATTR4_WORD0_FH_EXPIRE_TYPE|  			   FATTR4_WORD0_LINK_SUPPORT|  			   FATTR4_WORD0_SYMLINK_SUPPORT|  			   FATTR4_WORD0_ACLSUPPORT, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a RENEW request   */ -static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) +static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr, +			       struct nfs_client *clp)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr 
= {  		.nops	= 0,  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_renew(&xdr, clp, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_renew(xdr, clp->cl_clientid, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a SETCLIENTID request   */ -static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc) +static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     struct nfs4_setclientid *sc)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.nops	= 0,  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_setclientid(&xdr, sc, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_setclientid(xdr, sc, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a SETCLIENTID_CONFIRM request   */ -static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) +static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, +					     struct xdr_stream *xdr, +					     struct nfs4_setclientid_res *arg)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.nops	= 0,  	}; -	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_setclientid_confirm(&xdr, arg, &hdr); -	encode_putrootfh(&xdr, &hdr); -	encode_fsinfo(&xdr, lease_bitmap, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_setclientid_confirm(xdr, arg, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * DELEGRETURN request   */ -static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args) +static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     const struct nfs4_delegreturnargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = 
nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fhandle, &hdr); -	encode_delegreturn(&xdr, args->stateid, &hdr); -	encode_getfattr(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fhandle, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr); +	encode_delegreturn(xdr, args->stateid, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode FS_LOCATIONS request   */ -static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args) +static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      struct nfs4_fs_locations_arg *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	};  	uint32_t replen; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->dir_fh, &hdr); -	encode_lookup(&xdr, args->name, &hdr); -	replen = hdr.replen;	/* get the attribute into args->page */ -	encode_fs_locations(&xdr, args->bitmask, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	if (args->migration) { +		encode_putfh(xdr, args->fh, &hdr); +		replen = hdr.replen; +		encode_fs_locations(xdr, args->bitmask, &hdr); +		if (args->renew) +			encode_renew(xdr, args->clientid, &hdr); +	} else { +		encode_putfh(xdr, args->dir_fh, &hdr); +		encode_lookup(xdr, args->name, &hdr); +		replen = hdr.replen; +		encode_fs_locations(xdr, args->bitmask, &hdr); +	} +	/* Set up reply kvec to capture returned fs_locations array. 
*/  	xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,  			0, PAGE_SIZE);  	encode_nops(&hdr); -	return 0; +} + +/* + * Encode SECINFO request + */ +static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, +				struct xdr_stream *xdr, +				struct nfs4_secinfo_arg *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->dir_fh, &hdr); +	encode_secinfo(xdr, args->name, &hdr); +	encode_nops(&hdr); +} + +/* + * Encode FSID_PRESENT request + */ +static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      struct nfs4_fsid_present_arg *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_getfh(xdr, &hdr); +	if (args->renew) +		encode_renew(xdr, args->clientid, &hdr); +	encode_nops(&hdr);  }  #if defined(CONFIG_NFS_V4_1)  /* + * BIND_CONN_TO_SESSION request + */ +static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req, +				struct xdr_stream *xdr, +				struct nfs_client *clp) +{ +	struct compound_hdr hdr = { +		.minorversion = clp->cl_mvops->minor_version, +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_bind_conn_to_session(xdr, clp->cl_session, &hdr); +	encode_nops(&hdr); +} + +/*   * EXCHANGE_ID request   */ -static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, -				    struct nfs41_exchange_id_args *args) +static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     struct nfs41_exchange_id_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = args->client->cl_mvops->minor_version,  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, 
&hdr); -	encode_exchange_id(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_exchange_id(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a CREATE_SESSION request   */ -static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, -				       struct nfs41_create_session_args *args) +static void nfs4_xdr_enc_create_session(struct rpc_rqst *req, +					struct xdr_stream *xdr, +					struct nfs41_create_session_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = args->client->cl_mvops->minor_version,  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_create_session(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_create_session(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a DESTROY_SESSION request   */ -static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p, -					struct nfs4_session *session) +static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, +					 struct xdr_stream *xdr, +					 struct nfs4_session *session)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = session->clp->cl_mvops->minor_version,  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_destroy_session(&xdr, session, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_destroy_session(xdr, session, &hdr); +	encode_nops(&hdr); +} + +/* + * a DESTROY_CLIENTID request + */ +static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req, +					 struct xdr_stream *xdr, +					 struct nfs_client *clp) +{ +	struct compound_hdr hdr = { +		.minorversion = clp->cl_mvops->minor_version, +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_destroy_clientid(xdr, clp->cl_clientid, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a SEQUENCE request   */ -static int nfs4_xdr_enc_sequence(struct rpc_rqst *req, uint32_t *p, -				 struct nfs4_sequence_args 
*args) +static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, +				  struct nfs4_sequence_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a GET_LEASE_TIME request   */ -static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, -				       struct nfs4_get_lease_time_args *args) +static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, +					struct xdr_stream *xdr, +					struct nfs4_get_lease_time_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),  	}; -	const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; +	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->la_seq_args, &hdr); -	encode_putrootfh(&xdr, &hdr); -	encode_fsinfo(&xdr, lease_bitmap, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->la_seq_args, &hdr); +	encode_putrootfh(xdr, &hdr); +	encode_fsinfo(xdr, lease_bitmap, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * a RECLAIM_COMPLETE request   */ -static int nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, uint32_t *p, -				     struct nfs41_reclaim_complete_args *args) +static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, +					  struct xdr_stream *xdr, +				struct nfs41_reclaim_complete_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args)  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); 
-	encode_reclaim_complete(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_reclaim_complete(xdr, args, &hdr); +	encode_nops(&hdr); +} + +/* + * Encode GETDEVICELIST request + */ +static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, +				       struct xdr_stream *xdr, +				       struct nfs4_getdevicelist_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_getdevicelist(xdr, args, &hdr);  	encode_nops(&hdr); -	return 0;  }  /*   * Encode GETDEVICEINFO request   */ -static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, -				      struct nfs4_getdeviceinfo_args *args) +static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, +				       struct xdr_stream *xdr, +				       struct nfs4_getdeviceinfo_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_getdeviceinfo(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_getdeviceinfo(xdr, args, &hdr);  	/* set up reply kvec. 
Subtract notification bitmap max size (2)  	 * so that notification bitmap is put in xdr_buf tail */ @@ -2657,28 +2932,121 @@ static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p,  			 args->pdev->pglen);  	encode_nops(&hdr); -	return 0;  }  /*   *  Encode LAYOUTGET request   */ -static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, -				  struct nfs4_layoutget_args *args) +static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, +				   struct xdr_stream *xdr, +				   struct nfs4_layoutget_args *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, NFS_FH(args->inode), &hdr); -	encode_layoutget(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, NFS_FH(args->inode), &hdr); +	encode_layoutget(xdr, args, &hdr); + +	xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, +	    args->layout.pages, 0, args->layout.pglen); + +	encode_nops(&hdr); +} + +/* + *  Encode LAYOUTCOMMIT request + */ +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      struct nfs4_layoutcommit_args *args) +{ +	struct nfs4_layoutcommit_data *data = +		container_of(args, struct nfs4_layoutcommit_data, args); +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, NFS_FH(args->inode), &hdr); +	encode_layoutcommit(xdr, data->args.inode, args, &hdr); +	encode_getfattr(xdr, args->bitmask, &hdr); +	encode_nops(&hdr); +} + +/* + * Encode LAYOUTRETURN request + */ +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      struct 
nfs4_layoutreturn_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, NFS_FH(args->inode), &hdr); +	encode_layoutreturn(xdr, args, &hdr); +	encode_nops(&hdr); +} + +/* + * Encode SECINFO_NO_NAME request + */ +static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req, +					struct xdr_stream *xdr, +					struct nfs41_secinfo_no_name_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putrootfh(xdr, &hdr); +	encode_secinfo_no_name(xdr, args, &hdr);  	encode_nops(&hdr);  	return 0;  } + +/* + *  Encode TEST_STATEID request + */ +static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req, +				      struct xdr_stream *xdr, +				      struct nfs41_test_stateid_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_test_stateid(xdr, args, &hdr); +	encode_nops(&hdr); +} + +/* + *  Encode FREE_STATEID request + */ +static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, +				     struct xdr_stream *xdr, +				     struct nfs41_free_stateid_args *args) +{ +	struct compound_hdr hdr = { +		.minorversion = nfs4_xdr_minorversion(&args->seq_args), +	}; + +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_free_stateid(xdr, args, &hdr); +	encode_nops(&hdr); +}  #endif /* CONFIG_NFS_V4_1 */  static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -2730,7 +3098,8 @@ out_overflow:  	return -EIO;  } -static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 
expected, +		int *nfs_retval)  {  	__be32 *p;  	uint32_t opnum; @@ -2740,19 +3109,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)  	if (unlikely(!p))  		goto out_overflow;  	opnum = be32_to_cpup(p++); -	if (opnum != expected) { -		dprintk("nfs: Server returned operation" -			" %d but we issued a request for %d\n", -				opnum, expected); -		return -EIO; -	} +	if (unlikely(opnum != expected)) +		goto out_bad_operation;  	nfserr = be32_to_cpup(p); -	if (nfserr != NFS_OK) -		return nfs4_stat_to_errno(nfserr); -	return 0; +	if (nfserr == NFS_OK) +		*nfs_retval = 0; +	else +		*nfs_retval = nfs4_stat_to_errno(nfserr); +	return true; +out_bad_operation: +	dprintk("nfs: Server returned operation" +		" %d but we issued a request for %d\n", +			opnum, expected); +	*nfs_retval = -EREMOTEIO; +	return false;  out_overflow:  	print_overflow_msg(__func__, xdr); -	return -EIO; +	*nfs_retval = -EIO; +	return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ +	int retval; + +	__decode_op_hdr(xdr, expected, &retval); +	return retval;  }  /* Dummy routine */ @@ -2779,14 +3161,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)  		goto out_overflow;  	bmlen = be32_to_cpup(p); -	bitmap[0] = bitmap[1] = 0; +	bitmap[0] = bitmap[1] = bitmap[2] = 0;  	p = xdr_inline_decode(xdr, (bmlen << 2));  	if (unlikely(!p))  		goto out_overflow;  	if (bmlen > 0) {  		bitmap[0] = be32_to_cpup(p++); -		if (bmlen > 1) -			bitmap[1] = be32_to_cpup(p); +		if (bmlen > 1) { +			bitmap[1] = be32_to_cpup(p++); +			if (bmlen > 2) +				bitmap[2] = be32_to_cpup(p); +		}  	}  	return 0;  out_overflow: @@ -2794,7 +3179,7 @@ out_overflow:  	return -EIO;  } -static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) +static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep)  {  	__be32 *p; @@ -2802,7 +3187,7 @@ static inline int 
decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen,  	if (unlikely(!p))  		goto out_overflow;  	*attrlen = be32_to_cpup(p); -	*savep = xdr->p; +	*savep = xdr_stream_pos(xdr);  	return 0;  out_overflow:  	print_overflow_msg(__func__, xdr); @@ -2818,8 +3203,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3  			return ret;  		bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;  	} else -		bitmask[0] = bitmask[1] = 0; -	dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); +		bitmask[0] = bitmask[1] = bitmask[2] = 0; +	dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, +		bitmask[0], bitmask[1], bitmask[2]);  	return 0;  } @@ -2850,6 +3236,28 @@ out_overflow:  	return -EIO;  } +static int decode_attr_fh_expire_type(struct xdr_stream *xdr, +				      uint32_t *bitmap, uint32_t *type) +{ +	__be32 *p; + +	*type = 0; +	if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U))) +		return -EIO; +	if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) +			goto out_overflow; +		*type = be32_to_cpup(p); +		bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; +	} +	dprintk("%s: expire type=0x%x\n", __func__, *type); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} +  static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)  {  	__be32 *p; @@ -2987,7 +3395,7 @@ out_overflow:  	return -EIO;  } -static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res)  {  	__be32 *p; @@ -2998,6 +3406,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)  		if (unlikely(!p))  			goto out_overflow;  		bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; +		*res = -be32_to_cpup(p);  	}  	return 0;  out_overflow: @@ -3041,7 +3450,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint  {  	__be32 *p; 
-	*res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; +	*res = 0;  	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))  		return -EIO;  	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { @@ -3095,7 +3504,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma  			goto out_overflow;  		xdr_decode_hyper(p, fileid);  		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; -		ret = NFS_ATTR_FATTR_FILEID; +		ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID;  	}  	dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);  	return ret; @@ -3182,32 +3591,29 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)  	n = be32_to_cpup(p);  	if (n == 0)  		goto root_path; -	dprintk("path "); -	path->ncomponents = 0; -	while (path->ncomponents < n) { +	dprintk("pathname4: "); +	if (n > NFS4_PATHNAME_MAXCOMPONENTS) { +		dprintk("cannot parse %d components in path\n", n); +		goto out_eio; +	} +	for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) {  		struct nfs4_string *component = &path->components[path->ncomponents];  		status = decode_opaque_inline(xdr, &component->len, &component->data);  		if (unlikely(status != 0))  			goto out_eio; -		if (path->ncomponents != n) -			dprintk("/"); -		dprintk("%s", component->data); -		if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) -			path->ncomponents++; -		else { -			dprintk("cannot parse %d components in path\n", n); -			goto out_eio; -		} +		ifdebug (XDR) +			pr_cont("%s%.*s ", +				(path->ncomponents != n ? 
"/ " : ""), +				component->len, component->data);  	}  out: -	dprintk("\n");  	return status;  root_path:  /* a root pathname is sent as a zero component4 */  	path->ncomponents = 1;  	path->components[0].len=0;  	path->components[0].data=NULL; -	dprintk("path /\n"); +	dprintk("pathname4: /\n");  	goto out;  out_eio:  	dprintk(" status %d", status); @@ -3229,7 +3635,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st  	status = 0;  	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))  		goto out; -	dprintk("%s: fsroot ", __func__); +	status = -EIO; +	/* Ignore borken servers that return unrequested attrs */ +	if (unlikely(res == NULL)) +		goto out; +	dprintk("%s: fsroot:\n", __func__);  	status = decode_pathname(xdr, &res->fs_path);  	if (unlikely(status != 0))  		goto out; @@ -3239,27 +3649,23 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st  	n = be32_to_cpup(p);  	if (n <= 0)  		goto out_eio; -	res->nlocations = 0; -	while (res->nlocations < n) { +	for (res->nlocations = 0; res->nlocations < n; res->nlocations++) {  		u32 m; -		struct nfs4_fs_location *loc = &res->locations[res->nlocations]; +		struct nfs4_fs_location *loc; +		if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES) +			break; +		loc = &res->locations[res->nlocations];  		p = xdr_inline_decode(xdr, 4);  		if (unlikely(!p))  			goto out_overflow;  		m = be32_to_cpup(p); -		loc->nservers = 0; -		dprintk("%s: servers ", __func__); -		while (loc->nservers < m) { -			struct nfs4_string *server = &loc->servers[loc->nservers]; -			status = decode_opaque_inline(xdr, &server->len, &server->data); -			if (unlikely(status != 0)) -				goto out_eio; -			dprintk("%s ", server->data); -			if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) -				loc->nservers++; -			else { +		dprintk("%s: servers:\n", __func__); +		for (loc->nservers = 0; loc->nservers < m; loc->nservers++) { +			struct nfs4_string *server; + +			if (loc->nservers == 
NFS4_FS_LOCATION_MAXSERVERS) {  				unsigned int i;  				dprintk("%s: using first %u of %u servers "  					"returned for location %u\n", @@ -3273,16 +3679,20 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st  					if (unlikely(status != 0))  						goto out_eio;  				} +				break;  			} +			server = &loc->servers[loc->nservers]; +			status = decode_opaque_inline(xdr, &server->len, &server->data); +			if (unlikely(status != 0)) +				goto out_eio; +			dprintk("%s ", server->data);  		}  		status = decode_pathname(xdr, &loc->rootpath);  		if (unlikely(status != 0))  			goto out_eio; -		if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) -			res->nlocations++;  	}  	if (res->nlocations != 0) -		status = NFS_ATTR_FATTR_V4_REFERRAL; +		status = NFS_ATTR_FATTR_V4_LOCATIONS;  out:  	dprintk("%s: fs_locations done, error = %d\n", __func__, status);  	return status; @@ -3460,13 +3870,14 @@ out_overflow:  }  static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, -		struct nfs_client *clp, uint32_t *uid, int may_sleep) +		const struct nfs_server *server, kuid_t *uid, +		struct nfs4_string *owner_name)  {  	uint32_t len;  	__be32 *p;  	int ret = 0; -	*uid = -2; +	*uid = make_kuid(&init_user_ns, -2);  	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))  		return -EIO;  	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { @@ -3477,10 +3888,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,  		p = xdr_inline_decode(xdr, len);  		if (unlikely(!p))  			goto out_overflow; -		if (!may_sleep) { -			/* do nothing */ +		if (owner_name != NULL) { +			owner_name->data = kmemdup(p, len, GFP_NOWAIT); +			if (owner_name->data != NULL) { +				owner_name->len = len; +				ret = NFS_ATTR_FATTR_OWNER_NAME; +			}  		} else if (len < XDR_MAX_NETOBJ) { -			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) +			if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)  				ret = NFS_ATTR_FATTR_OWNER;  			else  				
dprintk("%s: nfs_map_name_to_uid failed!\n", @@ -3490,7 +3905,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,  					__func__, len);  		bitmap[1] &= ~FATTR4_WORD1_OWNER;  	} -	dprintk("%s: uid=%d\n", __func__, (int)*uid); +	dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid));  	return ret;  out_overflow:  	print_overflow_msg(__func__, xdr); @@ -3498,13 +3913,14 @@ out_overflow:  }  static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, -		struct nfs_client *clp, uint32_t *gid, int may_sleep) +		const struct nfs_server *server, kgid_t *gid, +		struct nfs4_string *group_name)  {  	uint32_t len;  	__be32 *p;  	int ret = 0; -	*gid = -2; +	*gid = make_kgid(&init_user_ns, -2);  	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))  		return -EIO;  	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { @@ -3515,10 +3931,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,  		p = xdr_inline_decode(xdr, len);  		if (unlikely(!p))  			goto out_overflow; -		if (!may_sleep) { -			/* do nothing */ +		if (group_name != NULL) { +			group_name->data = kmemdup(p, len, GFP_NOWAIT); +			if (group_name->data != NULL) { +				group_name->len = len; +				ret = NFS_ATTR_FATTR_GROUP_NAME; +			}  		} else if (len < XDR_MAX_NETOBJ) { -			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) +			if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)  				ret = NFS_ATTR_FATTR_GROUP;  			else  				dprintk("%s: nfs_map_group_to_gid failed!\n", @@ -3528,7 +3948,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,  					__func__, len);  		bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;  	} -	dprintk("%s: gid=%d\n", __func__, (int)*gid); +	dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid));  	return ret;  out_overflow:  	print_overflow_msg(__func__, xdr); @@ -3728,6 +4148,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,  	return 
status;  } +static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, +					struct nfs4_label *label) +{ +	uint32_t pi = 0; +	uint32_t lfs = 0; +	__u32 len; +	__be32 *p; +	int status = 0; + +	if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U))) +		return -EIO; +	if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) +			goto out_overflow; +		lfs = be32_to_cpup(p++); +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) +			goto out_overflow; +		pi = be32_to_cpup(p++); +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) +			goto out_overflow; +		len = be32_to_cpup(p++); +		p = xdr_inline_decode(xdr, len); +		if (unlikely(!p)) +			goto out_overflow; +		if (len < NFS4_MAXLABELLEN) { +			if (label) { +				memcpy(label->label, p, len); +				label->len = len; +				label->pi = pi; +				label->lfs = lfs; +				status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; +			} +			bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; +		} else +			printk(KERN_WARNING "%s: label too long (%u)!\n", +					__func__, len); +	} +	if (label && label->label) +		dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, +			(char *)label->label, label->len, label->pi, label->lfs); +	return status; + +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} +  static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)  {  	int status = 0; @@ -3746,10 +4216,10 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str  	return status;  } -static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen) +static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen)  {  	unsigned int attrwords = XDR_QUADLEN(attrlen); -	unsigned int nwords = xdr->p - savep; +	unsigned int nwords = (xdr_stream_pos(xdr) - savep) >> 2;  	if (unlikely(attrwords != nwords)) {  		dprintk("%s: server returned incorrect attribute 
length: " @@ -3779,7 +4249,7 @@ out_overflow:  	return -EIO;  } -static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) +static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)  {  	__be32 *p;  	uint32_t supp, acc; @@ -3793,8 +4263,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)  		goto out_overflow;  	supp = be32_to_cpup(p++);  	acc = be32_to_cpup(p); -	access->supported = supp; -	access->access = acc; +	*supported = supp; +	*access = acc;  	return 0;  out_overflow:  	print_overflow_msg(__func__, xdr); @@ -3816,7 +4286,7 @@ static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)  static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)  { -	return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +	return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);  }  static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) @@ -3833,16 +4303,21 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)  static int decode_verifier(struct xdr_stream *xdr, void *verifier)  { -	return decode_opaque_fixed(xdr, verifier, 8); +	return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);  } -static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifier *verifier) +{ +	return decode_opaque_fixed(xdr, verifier->data, NFS4_VERIFIER_SIZE); +} + +static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res)  {  	int status;  	status = decode_op_hdr(xdr, OP_COMMIT);  	if (!status) -		status = decode_verifier(xdr, res->verf->verifier); +		status = decode_write_verifier(xdr, &res->verf->verifier);  	return status;  } @@ -3871,8 +4346,8 @@ out_overflow:  static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)  { -	__be32 *savep; -	uint32_t attrlen, bitmap[2] = {0}; +	unsigned int 
savep; +	uint32_t attrlen, bitmap[3] = {0};  	int status;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3883,6 +4358,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re  		goto xdr_error;  	if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0)  		goto xdr_error; +	if ((status = decode_attr_fh_expire_type(xdr, bitmap, +						 &res->fh_expire_type)) != 0) +		goto xdr_error;  	if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0)  		goto xdr_error;  	if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) @@ -3897,8 +4375,8 @@ xdr_error:  static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)  { -	__be32 *savep; -	uint32_t attrlen, bitmap[2] = {0}; +	unsigned int savep; +	uint32_t attrlen, bitmap[3] = {0};  	int status;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3929,8 +4407,8 @@ xdr_error:  static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)  { -	__be32 *savep; -	uint32_t attrlen, bitmap[2] = {0}; +	unsigned int savep; +	uint32_t attrlen, bitmap[3] = {0};  	int status;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3951,14 +4429,124 @@ xdr_error:  	return status;  } +static int decode_threshold_hint(struct xdr_stream *xdr, +				  uint32_t *bitmap, +				  uint64_t *res, +				  uint32_t hint_bit) +{ +	__be32 *p; + +	*res = 0; +	if (likely(bitmap[0] & hint_bit)) { +		p = xdr_inline_decode(xdr, 8); +		if (unlikely(!p)) +			goto out_overflow; +		xdr_decode_hyper(p, res); +	} +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static int decode_first_threshold_item4(struct xdr_stream *xdr, +					struct nfs4_threshold *res) +{ +	__be32 *p; +	unsigned int savep; +	uint32_t bitmap[3] = {0,}, attrlen; +	int status; + +	/* layout type */ +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) { +		print_overflow_msg(__func__, xdr); +		return -EIO; +	} +	
res->l_type = be32_to_cpup(p); + +	/* thi_hintset bitmap */ +	status = decode_attr_bitmap(xdr, bitmap); +	if (status < 0) +		goto xdr_error; + +	/* thi_hintlist length */ +	status = decode_attr_length(xdr, &attrlen, &savep); +	if (status < 0) +		goto xdr_error; +	/* thi_hintlist */ +	status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD); +	if (status < 0) +		goto xdr_error; +	status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR); +	if (status < 0) +		goto xdr_error; +	status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz, +				       THRESHOLD_RD_IO); +	if (status < 0) +		goto xdr_error; +	status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz, +				       THRESHOLD_WR_IO); +	if (status < 0) +		goto xdr_error; + +	status = verify_attr_len(xdr, savep, attrlen); +	res->bm = bitmap[0]; + +	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", +		 __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz, +		res->wr_io_sz); +xdr_error: +	dprintk("%s ret=%d!\n", __func__, status); +	return status; +} + +/* + * Thresholds on pNFS direct I/O vrs MDS I/O + */ +static int decode_attr_mdsthreshold(struct xdr_stream *xdr, +				    uint32_t *bitmap, +				    struct nfs4_threshold *res) +{ +	__be32 *p; +	int status = 0; +	uint32_t num; + +	if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U))) +		return -EIO; +	if (bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD) { +		/* Did the server return an unrequested attribute? 
*/ +		if (unlikely(res == NULL)) +			return -EREMOTEIO; +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) +			goto out_overflow; +		num = be32_to_cpup(p); +		if (num == 0) +			return 0; +		if (num > 1) +			printk(KERN_INFO "%s: Warning: Multiple pNFS layout " +				"drivers per filesystem not supported\n", +				__func__); + +		status = decode_first_threshold_item4(xdr, res); +		bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD; +	} +	return status; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} +  static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,  		struct nfs_fattr *fattr, struct nfs_fh *fh, -		const struct nfs_server *server, int may_sleep) +		struct nfs4_fs_locations *fs_loc, struct nfs4_label *label, +		const struct nfs_server *server)  {  	int status;  	umode_t fmode = 0; -	uint64_t fileid;  	uint32_t type; +	int32_t err;  	status = decode_attr_type(xdr, bitmap, &type);  	if (status < 0) @@ -3984,7 +4572,8 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,  		goto xdr_error;  	fattr->valid |= status; -	status = decode_attr_error(xdr, bitmap); +	err = 0; +	status = decode_attr_error(xdr, bitmap, &err);  	if (status < 0)  		goto xdr_error; @@ -3997,9 +4586,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,  		goto xdr_error;  	fattr->valid |= status; -	status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, -						struct nfs4_fs_locations, -						fattr)); +	status = decode_attr_fs_locations(xdr, bitmap, fs_loc);  	if (status < 0)  		goto xdr_error;  	fattr->valid |= status; @@ -4017,14 +4604,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,  		goto xdr_error;  	fattr->valid |= status; -	status = decode_attr_owner(xdr, bitmap, server->nfs_client, -			&fattr->uid, may_sleep); +	status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name);  	if (status < 0)  		goto xdr_error;  	fattr->valid |= status; -	
status = decode_attr_group(xdr, bitmap, server->nfs_client, -			&fattr->gid, may_sleep); +	status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name);  	if (status < 0)  		goto xdr_error;  	fattr->valid |= status; @@ -4054,11 +4639,19 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,  		goto xdr_error;  	fattr->valid |= status; -	status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid); +	status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid); +	if (status < 0) +		goto xdr_error; +	fattr->valid |= status; + +	status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);  	if (status < 0)  		goto xdr_error; -	if (status != 0 && !(fattr->valid & status)) { -		fattr->fileid = fileid; + +	if (label) { +		status = decode_attr_security_label(xdr, bitmap, label); +		if (status < 0) +			goto xdr_error;  		fattr->valid |= status;  	} @@ -4068,11 +4661,12 @@ xdr_error:  }  static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, -		struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) +		struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, +		struct nfs4_label *label, const struct nfs_server *server)  { -	__be32 *savep; +	unsigned int savep;  	uint32_t attrlen, -		 bitmap[2] = {0}; +		 bitmap[3] = {0};  	int status;  	status = decode_op_hdr(xdr, OP_GETATTR); @@ -4087,7 +4681,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat  	if (status < 0)  		goto xdr_error; -	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); +	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, +					label, server);  	if (status < 0)  		goto xdr_error; @@ -4097,10 +4692,16 @@ xdr_error:  	return status;  } +static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr, +		struct nfs4_label *label, const struct nfs_server *server) +{ +	return decode_getfattr_generic(xdr, fattr, NULL, NULL, 
label, server); +} +  static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, -		const struct nfs_server *server, int may_sleep) +		const struct nfs_server *server)  { -	return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); +	return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);  }  /* @@ -4110,7 +4711,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,  static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,  					 uint32_t *layouttype)  { -	uint32_t *p; +	__be32 *p;  	int num;  	p = xdr_inline_decode(xdr, 4); @@ -4124,8 +4725,8 @@ static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,  		return 0;  	}  	if (num > 1) -		printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " -			"per filesystem not supported\n", __func__); +		printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " +			"drivers per filesystem not supported\n", __func__);  	/* Decode and set first layout type, move xdr->p past unused types */  	p = xdr_inline_decode(xdr, num * 4); @@ -4158,10 +4759,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,  	return status;  } +/* + * The prefered block size for layout directed io + */ +static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, +				      uint32_t *res) +{ +	__be32 *p; + +	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); +	*res = 0; +	if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) { +			print_overflow_msg(__func__, xdr); +			return -EIO; +		} +		*res = be32_to_cpup(p); +		bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; +	} +	return 0; +} +  static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)  { -	__be32 *savep; -	uint32_t attrlen, bitmap[2]; +	unsigned int savep; +	uint32_t attrlen, bitmap[3];  	int status;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4189,6 +4812,9 @@ static int 
decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)  	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);  	if (status != 0)  		goto xdr_error; +	status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); +	if (status) +		goto xdr_error;  	status = verify_attr_len(xdr, savep, attrlen);  xdr_error: @@ -4390,11 +5016,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)  	uint32_t savewords, bmlen, i;  	int status; -	status = decode_op_hdr(xdr, OP_OPEN); -	if (status != -EIO) -		nfs_increment_open_seqid(status, res->seqid); -	if (!status) -		status = decode_stateid(xdr, &res->stateid); +	if (!__decode_op_hdr(xdr, OP_OPEN, &status)) +		return status; +	nfs_increment_open_seqid(status, res->seqid); +	if (status) +		return status; +	status = decode_stateid(xdr, &res->stateid);  	if (unlikely(status))  		return status; @@ -4460,11 +5087,11 @@ static int decode_putrootfh(struct xdr_stream *xdr)  	return decode_op_hdr(xdr, OP_PUTROOTFH);  } -static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, +		       struct nfs_pgio_res *res)  { -	struct kvec *iov = req->rq_rcv_buf.head;  	__be32 *p; -	uint32_t count, eof, recvd, hdrlen; +	uint32_t count, eof, recvd;  	int status;  	status = decode_op_hdr(xdr, OP_READ); @@ -4475,15 +5102,13 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_  		goto out_overflow;  	eof = be32_to_cpup(p++);  	count = be32_to_cpup(p); -	hdrlen = (u8 *) p - (u8 *) iov->iov_base; -	recvd = req->rq_rcv_buf.len - hdrlen; +	recvd = xdr_read_pages(xdr, count);  	if (count > recvd) {  		dprintk("NFS: server cheating in read reply: "  				"count %u > recvd %u\n", count, recvd);  		count = recvd;  		eof = 0;  	} -	xdr_read_pages(xdr, count);  	res->eof = eof;  	res->count = count;  	return 0; @@ -4494,38 +5119,23 @@ out_overflow:  static int decode_readdir(struct 
xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)  { -	struct xdr_buf	*rcvbuf = &req->rq_rcv_buf; -	struct kvec	*iov = rcvbuf->head; -	size_t		hdrlen; -	u32		recvd, pglen = rcvbuf->page_len;  	int		status; +	__be32		verf[2];  	status = decode_op_hdr(xdr, OP_READDIR);  	if (!status)  		status = decode_verifier(xdr, readdir->verifier.data);  	if (unlikely(status))  		return status; +	memcpy(verf, readdir->verifier.data, sizeof(verf));  	dprintk("%s: verifier = %08x:%08x\n", -			__func__, -			((u32 *)readdir->verifier.data)[0], -			((u32 *)readdir->verifier.data)[1]); - - -	hdrlen = (char *) xdr->p - (char *) iov->iov_base; -	recvd = rcvbuf->len - hdrlen; -	if (pglen > recvd) -		pglen = recvd; -	xdr_read_pages(xdr, pglen); - - -	return 0; +			__func__, verf[0], verf[1]); +	return xdr_read_pages(xdr, xdr->buf->page_len);  }  static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)  {  	struct xdr_buf *rcvbuf = &req->rq_rcv_buf; -	struct kvec *iov = rcvbuf->head; -	size_t hdrlen;  	u32 len, recvd;  	__be32 *p;  	int status; @@ -4543,14 +5153,12 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)  		dprintk("nfs: server returned giant symlink!\n");  		return -ENAMETOOLONG;  	} -	hdrlen = (char *) xdr->p - (char *) iov->iov_base; -	recvd = req->rq_rcv_buf.len - hdrlen; +	recvd = xdr_read_pages(xdr, len);  	if (recvd < len) {  		dprintk("NFS: server cheating in readlink reply: "  				"count %u > recvd %u\n", len, recvd);  		return -EIO;  	} -	xdr_read_pages(xdr, len);  	/*  	 * The XDR encode routine has set things up so that  	 * the link text will be copied directly into the @@ -4604,17 +5212,23 @@ decode_restorefh(struct xdr_stream *xdr)  }  static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, -		size_t *acl_len) +			 struct nfs_getaclres *res)  { -	__be32 *savep; +	unsigned int savep;  	uint32_t attrlen, -		 bitmap[2] = {0}; -	struct kvec *iov = req->rq_rcv_buf.head; +		 bitmap[3] = {0};  	
int status; +	unsigned int pg_offset; -	*acl_len = 0; +	res->acl_len = 0;  	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)  		goto out; + +	xdr_enter_page(xdr, xdr->buf->page_len); + +	/* Calculate the offset of the page data */ +	pg_offset = xdr->buf->head[0].iov_len; +  	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)  		goto out;  	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) @@ -4623,21 +5237,20 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,  	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))  		return -EIO;  	if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { -		size_t hdrlen; -		u32 recvd; - -		/* We ignore &savep and don't do consistency checks on -		 * the attr length.  Let userspace figure it out.... */ -		hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; -		recvd = req->rq_rcv_buf.len - hdrlen; -		if (attrlen > recvd) { -			dprintk("NFS: server cheating in getattr" -					" acl reply: attrlen %u > recvd %u\n", -					attrlen, recvd); -			return -EINVAL; + +		/* The bitmap (xdr len + bitmaps) and the attr xdr len words +		 * are stored with the acl data to handle the problem of +		 * variable length bitmaps.*/ +		res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset; +		res->acl_len = attrlen; + +		/* Check for receive buffer overflow */ +		if (res->acl_len > (xdr->nwords << 2) || +		    res->acl_len + res->acl_data_offset > xdr->buf->page_len) { +			res->acl_flags |= NFS4_ACL_TRUNC; +			dprintk("NFS: acl reply: attrlen %u > page_len %u\n", +					attrlen, xdr->nwords << 2);  		} -		xdr_read_pages(xdr, attrlen); -		*acl_len = attrlen;  	} else  		status = -EOPNOTSUPP; @@ -4729,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr)  	return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM);  } -static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)  {  	__be32 *p;  	int status; @@ -4738,13 +5351,12 @@ static 
int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)  	if (status)  		return status; -	p = xdr_inline_decode(xdr, 16); +	p = xdr_inline_decode(xdr, 8);  	if (unlikely(!p))  		goto out_overflow;  	res->count = be32_to_cpup(p++);  	res->verf->committed = be32_to_cpup(p++); -	memcpy(res->verf->verifier, p, 8); -	return 0; +	return decode_write_verifier(xdr, &res->verf->verifier);  out_overflow:  	print_overflow_msg(__func__, xdr);  	return -EIO; @@ -4755,7 +5367,114 @@ static int decode_delegreturn(struct xdr_stream *xdr)  	return decode_op_hdr(xdr, OP_DELEGRETURN);  } +static int decode_secinfo_gss(struct xdr_stream *xdr, +			      struct nfs4_secinfo4 *flavor) +{ +	u32 oid_len; +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	oid_len = be32_to_cpup(p); +	if (oid_len > GSS_OID_MAX_LEN) +		goto out_err; + +	p = xdr_inline_decode(xdr, oid_len); +	if (unlikely(!p)) +		goto out_overflow; +	memcpy(flavor->flavor_info.oid.data, p, oid_len); +	flavor->flavor_info.oid.len = oid_len; + +	p = xdr_inline_decode(xdr, 8); +	if (unlikely(!p)) +		goto out_overflow; +	flavor->flavor_info.qop = be32_to_cpup(p++); +	flavor->flavor_info.service = be32_to_cpup(p); + +	return 0; + +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +out_err: +	return -EINVAL; +} + +static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ +	struct nfs4_secinfo4 *sec_flavor; +	unsigned int i, num_flavors; +	int status; +	__be32 *p; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; + +	res->flavors->num_flavors = 0; +	num_flavors = be32_to_cpup(p); + +	for (i = 0; i < num_flavors; i++) { +		sec_flavor = &res->flavors->flavors[i]; +		if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE) +			break; + +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) +			goto out_overflow; +		sec_flavor->flavor = be32_to_cpup(p); + +		if (sec_flavor->flavor == RPC_AUTH_GSS) { +			
status = decode_secinfo_gss(xdr, sec_flavor); +			if (status) +				goto out; +		} +		res->flavors->num_flavors++; +	} + +	status = 0; +out: +	return status; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ +	int status = decode_op_hdr(xdr, OP_SECINFO); +	if (status) +		return status; +	return decode_secinfo_common(xdr, res); +} +  #if defined(CONFIG_NFS_V4_1) +static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ +	int status = decode_op_hdr(xdr, OP_SECINFO_NO_NAME); +	if (status) +		return status; +	return decode_secinfo_common(xdr, res); +} + +static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) +{ +	__be32 *p; +	uint32_t bitmap_words; +	unsigned int i; + +	p = xdr_inline_decode(xdr, 4); +	bitmap_words = be32_to_cpup(p++); +	if (bitmap_words > NFS4_OP_MAP_NUM_WORDS) +		return -EIO; +	p = xdr_inline_decode(xdr, 4 * bitmap_words); +	for (i = 0; i < bitmap_words; i++) +		op_map->u.words[i] = be32_to_cpup(p++); + +	return 0; +} +  static int decode_exchange_id(struct xdr_stream *xdr,  			      struct nfs41_exchange_id_res *res)  { @@ -4763,7 +5482,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,  	uint32_t dummy;  	char *dummy_str;  	int status; -	struct nfs_client *clp = res->client; +	uint32_t impl_id_count;  	status = decode_op_hdr(xdr, OP_EXCHANGE_ID);  	if (status) @@ -4772,38 +5491,86 @@ static int decode_exchange_id(struct xdr_stream *xdr,  	p = xdr_inline_decode(xdr, 8);  	if (unlikely(!p))  		goto out_overflow; -	xdr_decode_hyper(p, &clp->cl_ex_clid); +	xdr_decode_hyper(p, &res->clientid);  	p = xdr_inline_decode(xdr, 12);  	if (unlikely(!p))  		goto out_overflow; -	clp->cl_seqid = be32_to_cpup(p++); -	clp->cl_exchange_flags = be32_to_cpup(p++); +	res->seqid = be32_to_cpup(p++); +	res->flags = be32_to_cpup(p++); -	/* We ask for SP4_NONE */ -	dummy = be32_to_cpup(p); -	if (dummy != 
SP4_NONE) +	res->state_protect.how = be32_to_cpup(p); +	switch (res->state_protect.how) { +	case SP4_NONE: +		break; +	case SP4_MACH_CRED: +		status = decode_op_map(xdr, &res->state_protect.enforce); +		if (status) +			return status; +		status = decode_op_map(xdr, &res->state_protect.allow); +		if (status) +			return status; +		break; +	default: +		WARN_ON_ONCE(1);  		return -EIO; +	} -	/* Throw away minor_id */ +	/* server_owner4.so_minor_id */  	p = xdr_inline_decode(xdr, 8);  	if (unlikely(!p))  		goto out_overflow; +	p = xdr_decode_hyper(p, &res->server_owner->minor_id); -	/* Throw away Major id */ +	/* server_owner4.so_major_id */  	status = decode_opaque_inline(xdr, &dummy, &dummy_str);  	if (unlikely(status))  		return status; +	if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) +		return -EIO; +	memcpy(res->server_owner->major_id, dummy_str, dummy); +	res->server_owner->major_id_sz = dummy; -	/* Throw away server_scope */ +	/* server_scope4 */  	status = decode_opaque_inline(xdr, &dummy, &dummy_str);  	if (unlikely(status))  		return status; +	if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) +		return -EIO; +	memcpy(res->server_scope->server_scope, dummy_str, dummy); +	res->server_scope->server_scope_sz = dummy; -	/* Throw away Implementation id array */ -	status = decode_opaque_inline(xdr, &dummy, &dummy_str); -	if (unlikely(status)) -		return status; +	/* Implementation Id */ +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	impl_id_count = be32_to_cpup(p++); +	if (impl_id_count) { +		/* nii_domain */ +		status = decode_opaque_inline(xdr, &dummy, &dummy_str); +		if (unlikely(status)) +			return status; +		if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) +			return -EIO; +		memcpy(res->impl_id->domain, dummy_str, dummy); + +		/* nii_name */ +		status = decode_opaque_inline(xdr, &dummy, &dummy_str); +		if (unlikely(status)) +			return status; +		if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) +			return -EIO; +		memcpy(res->impl_id->name, dummy_str, dummy); + +		
/* nii_date */ +		p = xdr_inline_decode(xdr, 12); +		if (unlikely(!p)) +			goto out_overflow; +		p = xdr_decode_hyper(p, &res->impl_id->date.seconds); +		res->impl_id->date.nseconds = be32_to_cpup(p); + +		/* if there's more than one entry, ignore the rest */ +	}  	return 0;  out_overflow:  	print_overflow_msg(__func__, xdr); @@ -4814,12 +5581,14 @@ static int decode_chan_attrs(struct xdr_stream *xdr,  			     struct nfs4_channel_attrs *attrs)  {  	__be32 *p; -	u32 nr_attrs; +	u32 nr_attrs, val;  	p = xdr_inline_decode(xdr, 28);  	if (unlikely(!p))  		goto out_overflow; -	attrs->headerpadsz = be32_to_cpup(p++); +	val = be32_to_cpup(p++);	/* headerpadsz */ +	if (val) +		return -EINVAL;		/* no support for header padding yet */  	attrs->max_rqst_sz = be32_to_cpup(p++);  	attrs->max_resp_sz = be32_to_cpup(p++);  	attrs->max_resp_sz_cached = be32_to_cpup(p++); @@ -4827,8 +5596,8 @@ static int decode_chan_attrs(struct xdr_stream *xdr,  	attrs->max_reqs = be32_to_cpup(p++);  	nr_attrs = be32_to_cpup(p);  	if (unlikely(nr_attrs > 1)) { -		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", -			__func__, nr_attrs); +		printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs " +			"count %u\n", __func__, nr_attrs);  		return -EINVAL;  	}  	if (nr_attrs == 1) { @@ -4847,6 +5616,37 @@ static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)  	return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);  } +static int decode_bind_conn_to_session(struct xdr_stream *xdr, +				struct nfs41_bind_conn_to_session_res *res) +{ +	__be32 *p; +	int status; + +	status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION); +	if (!status) +		status = decode_sessionid(xdr, &res->session->sess_id); +	if (unlikely(status)) +		return status; + +	/* dir flags, rdma mode bool */ +	p = xdr_inline_decode(xdr, 8); +	if (unlikely(!p)) +		goto out_overflow; + +	res->dir = be32_to_cpup(p++); +	if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH) +		return -EIO; +	if 
(be32_to_cpup(p) == 0) +		res->use_conn_in_rdma_mode = false; +	else +		res->use_conn_in_rdma_mode = true; + +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} +  static int decode_create_session(struct xdr_stream *xdr,  				 struct nfs41_create_session_res *res)  { @@ -4883,6 +5683,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)  	return decode_op_hdr(xdr, OP_DESTROY_SESSION);  } +static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy) +{ +	return decode_op_hdr(xdr, OP_DESTROY_CLIENTID); +} +  static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy)  {  	return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE); @@ -4894,12 +5699,15 @@ static int decode_sequence(struct xdr_stream *xdr,  			   struct rpc_rqst *rqstp)  {  #if defined(CONFIG_NFS_V4_1) +	struct nfs4_session *session;  	struct nfs4_sessionid id;  	u32 dummy;  	int status;  	__be32 *p; -	if (!res->sr_session) +	if (res->sr_slot == NULL) +		return 0; +	if (!res->sr_slot->table->session)  		return 0;  	status = decode_op_hdr(xdr, OP_SEQUENCE); @@ -4913,8 +5721,9 @@ static int decode_sequence(struct xdr_stream *xdr,  	 * sequence number, the server is looney tunes.  	 
*/  	status = -EREMOTEIO; +	session = res->sr_slot->table->session; -	if (memcmp(id.data, res->sr_session->sess_id.data, +	if (memcmp(id.data, session->sess_id.data,  		   NFS4_MAX_SESSIONID_LEN)) {  		dprintk("%s Invalid session id\n", __func__);  		goto out_err; @@ -4932,14 +5741,14 @@ static int decode_sequence(struct xdr_stream *xdr,  	}  	/* slot id */  	dummy = be32_to_cpup(p++); -	if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { +	if (dummy != res->sr_slot->slot_nr) {  		dprintk("%s Invalid slot id\n", __func__);  		goto out_err;  	} -	/* highest slot id - currently not processed */ -	dummy = be32_to_cpup(p++); -	/* target highest slot id - currently not processed */ -	dummy = be32_to_cpup(p++); +	/* highest slot id */ +	res->sr_highest_slotid = be32_to_cpup(p++); +	/* target highest slot id */ +	res->sr_target_highest_slotid = be32_to_cpup(p++);  	/* result flags */  	res->sr_status_flags = be32_to_cpup(p);  	status = 0; @@ -4956,6 +5765,53 @@ out_overflow:  }  #if defined(CONFIG_NFS_V4_1) +/* + * TODO: Need to handle case when EOF != true; + */ +static int decode_getdevicelist(struct xdr_stream *xdr, +				struct pnfs_devicelist *res) +{ +	__be32 *p; +	int status, i; +	nfs4_verifier verftemp; + +	status = decode_op_hdr(xdr, OP_GETDEVICELIST); +	if (status) +		return status; + +	p = xdr_inline_decode(xdr, 8 + 8 + 4); +	if (unlikely(!p)) +		goto out_overflow; + +	/* TODO: Skip cookie for now */ +	p += 2; + +	/* Read verifier */ +	p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE); + +	res->num_devs = be32_to_cpup(p); + +	dprintk("%s: num_dev %d\n", __func__, res->num_devs); + +	if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { +		printk(KERN_ERR "NFS: %s too many result dev_num %u\n", +				__func__, res->num_devs); +		return -EIO; +	} + +	p = xdr_inline_decode(xdr, +			      res->num_devs * NFS4_DEVICEID4_SIZE + 4); +	if (unlikely(!p)) +		goto out_overflow; +	for (i = 0; i < res->num_devs; i++) +		p = 
xdr_decode_opaque_fixed(p, res->dev_id[i].data, +					    NFS4_DEVICEID4_SIZE); +	res->eof = be32_to_cpup(p); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +}  static int decode_getdeviceinfo(struct xdr_stream *xdr,  				struct pnfs_device *pdev) @@ -4992,7 +5848,8 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,  	 * and places the remaining xdr data in xdr_buf->tail  	 */  	pdev->mincount = be32_to_cpup(p); -	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ +	if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) +		goto out_overflow;  	/* Parse notification bitmap, verifying that it is zero. */  	p = xdr_inline_decode(xdr, 4); @@ -5000,7 +5857,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,  		goto out_overflow;  	len = be32_to_cpup(p);  	if (len) { -		int i; +		uint32_t i;  		p = xdr_inline_decode(xdr, 4 * len);  		if (unlikely(!p)) @@ -5025,15 +5882,19 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,  	__be32 *p;  	int status;  	u32 layout_count; +	u32 recvd;  	status = decode_op_hdr(xdr, OP_LAYOUTGET);  	if (status)  		return status; -	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	res->return_on_close = be32_to_cpup(p); +	decode_stateid(xdr, &res->stateid); +	p = xdr_inline_decode(xdr, 4);  	if (unlikely(!p))  		goto out_overflow; -	res->return_on_close = be32_to_cpup(p++); -	p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);  	layout_count = be32_to_cpup(p);  	if (!layout_count) {  		dprintk("%s: server responded with empty layout array\n", @@ -5041,17 +5902,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,  		return -EINVAL;  	} -	p = xdr_inline_decode(xdr, 24); +	p = xdr_inline_decode(xdr, 28);  	if (unlikely(!p))  		goto out_overflow;  	p = xdr_decode_hyper(p, &res->range.offset);  	p = xdr_decode_hyper(p, 
&res->range.length);  	res->range.iomode = be32_to_cpup(p++);  	res->type = be32_to_cpup(p++); - -	status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p); -	if (unlikely(status)) -		return status; +	res->layoutp->len = be32_to_cpup(p);  	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",  		__func__, @@ -5059,12 +5917,15 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,  		(unsigned long)res->range.length,  		res->range.iomode,  		res->type, -		res->layout.len); +		res->layoutp->len); -	/* nfs4_proc_layoutget allocated a single page */ -	if (res->layout.len > PAGE_SIZE) -		return -ENOMEM; -	memcpy(res->layout.buf, p, res->layout.len); +	recvd = xdr_read_pages(xdr, res->layoutp->len); +	if (res->layoutp->len > recvd) { +		dprintk("NFS: server cheating in layoutget reply: " +				"layout len %u > recvd %u\n", +				res->layoutp->len, recvd); +		return -EINVAL; +	}  	if (layout_count > 1) {  		/* We only handle a length one array at the moment.  
Any @@ -5081,6 +5942,94 @@ out_overflow:  	print_overflow_msg(__func__, xdr);  	return -EIO;  } + +static int decode_layoutreturn(struct xdr_stream *xdr, +			       struct nfs4_layoutreturn_res *res) +{ +	__be32 *p; +	int status; + +	status = decode_op_hdr(xdr, OP_LAYOUTRETURN); +	if (status) +		return status; +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	res->lrs_present = be32_to_cpup(p); +	if (res->lrs_present) +		status = decode_stateid(xdr, &res->stateid); +	return status; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static int decode_layoutcommit(struct xdr_stream *xdr, +			       struct rpc_rqst *req, +			       struct nfs4_layoutcommit_res *res) +{ +	__be32 *p; +	__u32 sizechanged; +	int status; + +	status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); +	res->status = status; +	if (status) +		return status; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	sizechanged = be32_to_cpup(p); + +	if (sizechanged) { +		/* throw away new size */ +		p = xdr_inline_decode(xdr, 8); +		if (unlikely(!p)) +			goto out_overflow; +	} +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} + +static int decode_test_stateid(struct xdr_stream *xdr, +			       struct nfs41_test_stateid_res *res) +{ +	__be32 *p; +	int status; +	int num_res; + +	status = decode_op_hdr(xdr, OP_TEST_STATEID); +	if (status) +		return status; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	num_res = be32_to_cpup(p++); +	if (num_res != 1) +		goto out; + +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	res->status = be32_to_cpup(p++); + +	return status; +out_overflow: +	print_overflow_msg(__func__, xdr); +out: +	return -EIO; +} + +static int decode_free_stateid(struct xdr_stream *xdr, +			       struct nfs41_free_stateid_res *res) +{ +	res->status = decode_op_hdr(xdr, OP_FREE_STATEID); +	return res->status; +}  #endif /* CONFIG_NFS_V4_1 */ 
 /* @@ -5090,27 +6039,26 @@ out_overflow:  /*   * Decode OPEN_DOWNGRADE response   */ -static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) +static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, +				       struct xdr_stream *xdr, +				       struct nfs_closeres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_open_downgrade(&xdr, res); +	status = decode_open_downgrade(xdr, res);  	if (status != 0)  		goto out; -	decode_getfattr(&xdr, res->fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	decode_getfattr(xdr, res->fattr, res->server);  out:  	return status;  } @@ -5118,27 +6066,25 @@ out:  /*   * Decode ACCESS response   */ -static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) +static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			       struct nfs4_accessres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status != 0)  		goto out; -	status = decode_access(&xdr, res); +	status = decode_access(xdr, &res->supported, &res->access);  	if (status != 0)  		goto out; -	decode_getfattr(&xdr, res->fattr, res->server, -			
!RPC_IS_ASYNC(rqstp->rq_task)); +	decode_getfattr(xdr, res->fattr, res->server);  out:  	return status;  } @@ -5146,27 +6092,28 @@ out:  /*   * Decode LOOKUP response   */ -static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) +static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			       struct nfs4_lookup_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	if ((status = decode_putfh(&xdr)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	if ((status = decode_lookup(&xdr)) != 0) +	status = decode_lookup(xdr); +	if (status)  		goto out; -	if ((status = decode_getfh(&xdr, res->fh)) != 0) +	status = decode_getfh(xdr, res->fh); +	if (status)  		goto out; -	status = decode_getfattr(&xdr, res->fattr, res->server -			,!RPC_IS_ASYNC(rqstp->rq_task)); +	status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);  out:  	return status;  } @@ -5174,24 +6121,26 @@ out:  /*   * Decode LOOKUP_ROOT response   */ -static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) +static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, +				    struct xdr_stream *xdr, +				    struct nfs4_lookup_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	if ((status = decode_putrootfh(&xdr)) != 0) +	status = 
decode_putrootfh(xdr); +	if (status)  		goto out; -	if ((status = decode_getfh(&xdr, res->fh)) == 0) -		status = decode_getfattr(&xdr, res->fattr, res->server, -				!RPC_IS_ASYNC(rqstp->rq_task)); +	status = decode_getfh(xdr, res->fh); +	if (status == 0) +		status = decode_getfattr_label(xdr, res->fattr, +						res->label, res->server);  out:  	return status;  } @@ -5199,25 +6148,22 @@ out:  /*   * Decode REMOVE response   */ -static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res) +static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			       struct nfs_removeres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	if ((status = decode_putfh(&xdr)) != 0) -		goto out; -	if ((status = decode_remove(&xdr, &res->cinfo)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	decode_getfattr(&xdr, res->dir_attr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	status = decode_remove(xdr, &res->cinfo);  out:  	return status;  } @@ -5225,35 +6171,28 @@ out:  /*   * Decode RENAME response   */ -static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs_renameres *res) +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			       struct nfs_renameres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	if ((status = 
decode_putfh(&xdr)) != 0) -		goto out; -	if ((status = decode_savefh(&xdr)) != 0) -		goto out; -	if ((status = decode_putfh(&xdr)) != 0) -		goto out; -	if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	/* Current FH is target directory */ -	if (decode_getfattr(&xdr, res->new_fattr, res->server, -				!RPC_IS_ASYNC(rqstp->rq_task)) != 0) +	status = decode_savefh(xdr); +	if (status)  		goto out; -	if ((status = decode_restorefh(&xdr)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	decode_getfattr(&xdr, res->old_fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo);  out:  	return status;  } @@ -5261,38 +6200,38 @@ out:  /*   * Decode LINK response   */ -static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res) +static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			     struct nfs4_link_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	if ((status = decode_putfh(&xdr)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	if ((status = decode_savefh(&xdr)) != 0) +	status = decode_savefh(xdr); +	if (status)  		goto out; -	if ((status = decode_putfh(&xdr)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	if ((status = decode_link(&xdr, &res->cinfo)) != 0) +	status = decode_link(xdr, &res->cinfo); +	if (status)  		goto out;  	/*  	 * Note order: OP_LINK leaves the directory as the current  	 *             filehandle.  	 
*/ -	if (decode_getfattr(&xdr, res->dir_attr, res->server, -				!RPC_IS_ASYNC(rqstp->rq_task)) != 0) -		goto out; -	if ((status = decode_restorefh(&xdr)) != 0) +	status = decode_restorefh(xdr); +	if (status)  		goto out; -	decode_getfattr(&xdr, res->fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	decode_getfattr_label(xdr, res->fattr, res->label, res->server);  out:  	return status;  } @@ -5300,34 +6239,28 @@ out:  /*   * Decode CREATE response   */ -static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) +static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			       struct nfs4_create_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	if ((status = decode_putfh(&xdr)) != 0) -		goto out; -	if ((status = decode_savefh(&xdr)) != 0) -		goto out; -	if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) -		goto out; -	if ((status = decode_getfh(&xdr, res->fh)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	if (decode_getfattr(&xdr, res->fattr, res->server, -				!RPC_IS_ASYNC(rqstp->rq_task)) != 0) +	status = decode_create(xdr, &res->dir_cinfo); +	if (status)  		goto out; -	if ((status = decode_restorefh(&xdr)) != 0) +	status = decode_getfh(xdr, res->fh); +	if (status)  		goto out; -	decode_getfattr(&xdr, res->dir_fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	decode_getfattr_label(xdr, res->fattr, res->label, res->server);  out:  	return status;  } @@ -5335,32 +6268,31 @@ out:  /*   * Decode SYMLINK response   */ -static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) +static int 
nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +				struct nfs4_create_res *res)  { -	return nfs4_xdr_dec_create(rqstp, p, res); +	return nfs4_xdr_dec_create(rqstp, xdr, res);  }  /*   * Decode GETATTR response   */ -static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res) +static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +				struct nfs4_getattr_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_getfattr(&xdr, res->fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);  out:  	return status;  } @@ -5368,46 +6300,40 @@ out:  /*   * Encode an SETACL request   */ -static int -nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) +static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr, +				struct nfs_setaclargs *args)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr = {  		.minorversion = nfs4_xdr_minorversion(&args->seq_args),  	}; -	int status; -	xdr_init_encode(&xdr, &req->rq_snd_buf, p); -	encode_compound_hdr(&xdr, req, &hdr); -	encode_sequence(&xdr, &args->seq_args, &hdr); -	encode_putfh(&xdr, args->fh, &hdr); -	status = encode_setacl(&xdr, args, &hdr); +	encode_compound_hdr(xdr, req, &hdr); +	encode_sequence(xdr, &args->seq_args, &hdr); +	encode_putfh(xdr, args->fh, &hdr); +	encode_setacl(xdr, args, &hdr);  	encode_nops(&hdr); -	return status;  }  /*   * Decode SETACL response   */  static int 
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,  		    struct nfs_setaclres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_setattr(&xdr); +	status = decode_setattr(xdr);  out:  	return status;  } @@ -5416,24 +6342,26 @@ out:   * Decode GETACL response   */  static int -nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,  		    struct nfs_getaclres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	if (res->acl_scratch != NULL) { +		void *p = page_address(res->acl_scratch); +		xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); +	} +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_getacl(&xdr, rqstp, &res->acl_len); +	status = decode_getacl(xdr, rqstp, res);  out:  	return status; @@ -5442,23 +6370,22 @@ out:  /*   * Decode CLOSE response   */ -static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) +static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			      struct nfs_closeres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_close(&xdr, res); +	status = decode_close(xdr, res);  	if (status != 0)  		goto out;  	/* @@ -5467,8 +6394,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos  	 * 	an ESTALE error. Shouldn't be a problem,  	 * 	though, since fattr->valid will remain unset.  	 */ -	decode_getfattr(&xdr, res->fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	decode_getfattr(xdr, res->fattr, res->server);  out:  	return status;  } @@ -5476,37 +6402,30 @@ out:  /*   * Decode OPEN response   */ -static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) +static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			     struct nfs_openres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_savefh(&xdr); +	status = decode_open(xdr, res);  	if (status)  		goto out; -	status = decode_open(&xdr, res); +	status = decode_getfh(xdr, &res->fh);  	if (status)  		goto out; -	if (decode_getfh(&xdr, &res->fh) != 0) -		goto out; -	if (decode_getfattr(&xdr, res->f_attr, res->server, -				!RPC_IS_ASYNC(rqstp->rq_task)) != 0) -		goto out; -	if (decode_restorefh(&xdr) != 0) -		goto out; -	
decode_getfattr(&xdr, res->dir_attr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	if (res->access_request) +		decode_access(xdr, &res->access_supported, &res->access_result); +	decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);  out:  	return status;  } @@ -5514,20 +6433,20 @@ out:  /*   * Decode OPEN_CONFIRM response   */ -static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) +static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs_open_confirmres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_open_confirm(&xdr, res); +	status = decode_open_confirm(xdr, res);  out:  	return status;  } @@ -5535,27 +6454,28 @@ out:  /*   * Decode OPEN response   */ -static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) +static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, +				    struct xdr_stream *xdr, +				    struct nfs_openres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_open(&xdr, res); +	status = decode_open(xdr, res);  	if (status)  		goto out; -	decode_getfattr(&xdr, res->f_attr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	if (res->access_request) +		decode_access(xdr, 
&res->access_supported, &res->access_result); +	decode_getfattr(xdr, res->f_attr, res->server);  out:  	return status;  } @@ -5563,27 +6483,26 @@ out:  /*   * Decode SETATTR response   */ -static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) +static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, +				struct xdr_stream *xdr, +				struct nfs_setattrres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_setattr(&xdr); +	status = decode_setattr(xdr);  	if (status)  		goto out; -	decode_getfattr(&xdr, res->fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	decode_getfattr_label(xdr, res->fattr, res->label, res->server);  out:  	return status;  } @@ -5591,23 +6510,22 @@ out:  /*   * Decode LOCK response   */ -static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res) +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			     struct nfs_lock_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_lock(&xdr, res); +	status = decode_lock(xdr, res);  out:  	return status;  } @@ -5615,23 +6533,22 @@ out:  /*   * Decode LOCKT 
response   */ -static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res) +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			      struct nfs_lockt_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_lockt(&xdr, res); +	status = decode_lockt(xdr, res);  out:  	return status;  } @@ -5639,61 +6556,58 @@ out:  /*   * Decode LOCKU response   */ -static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res) +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			      struct nfs_locku_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_locku(&xdr, res); +	status = decode_locku(xdr, res);  out:  	return status;  } -static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, +					  struct xdr_stream *xdr, void *dummy)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = 
decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_release_lockowner(&xdr); +		status = decode_release_lockowner(xdr);  	return status;  }  /*   * Decode READLINK response   */ -static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, +static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, +				 struct xdr_stream *xdr,  				 struct nfs4_readlink_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_readlink(&xdr, rqstp); +	status = decode_readlink(xdr, rqstp);  out:  	return status;  } @@ -5701,23 +6615,22 @@ out:  /*   * Decode READDIR response   */ -static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res) +static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +				struct nfs4_readdir_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_readdir(&xdr, rqstp, res); +	status = decode_readdir(xdr, rqstp, res);  out:  	return status;  } @@ -5725,23 +6638,22 @@ out:  /*   * Decode Read response   */ -static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res) +static int nfs4_xdr_dec_read(struct 
rpc_rqst *rqstp, struct xdr_stream *xdr, +			     struct nfs_pgio_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_read(&xdr, rqstp, res); +	status = decode_read(xdr, rqstp, res);  	if (!status)  		status = res->count;  out: @@ -5751,27 +6663,26 @@ out:  /*   * Decode WRITE response   */ -static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) +static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			      struct nfs_pgio_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_write(&xdr, res); +	status = decode_write(xdr, res);  	if (status)  		goto out; -	decode_getfattr(&xdr, res->fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	if (res->fattr) +		decode_getfattr(xdr, res->fattr, res->server);  	if (!status)  		status = res->count;  out: @@ -5781,27 +6692,22 @@ out:  /*   * Decode COMMIT response   */ -static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) +static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, +			       struct nfs_commitres *res)  { -	struct xdr_stream xdr;  	struct 
compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_commit(&xdr, res); -	if (status) -		goto out; -	decode_getfattr(&xdr, res->fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	status = decode_commit(xdr, res);  out:  	return status;  } @@ -5809,85 +6715,80 @@ out:  /*   * Decode FSINFO response   */ -static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr,  			       struct nfs4_fsinfo_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_sequence(&xdr, &res->seq_res, req); +		status = decode_sequence(xdr, &res->seq_res, req);  	if (!status) -		status = decode_putfh(&xdr); +		status = decode_putfh(xdr);  	if (!status) -		status = decode_fsinfo(&xdr, res->fsinfo); +		status = decode_fsinfo(xdr, res->fsinfo);  	return status;  }  /*   * Decode PATHCONF response   */ -static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr,  				 struct nfs4_pathconf_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_sequence(&xdr, &res->seq_res, req); +		status = decode_sequence(xdr, &res->seq_res, req);  	if (!status) -		status = 
decode_putfh(&xdr); +		status = decode_putfh(xdr);  	if (!status) -		status = decode_pathconf(&xdr, res->pathconf); +		status = decode_pathconf(xdr, res->pathconf);  	return status;  }  /*   * Decode STATFS response   */ -static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr,  			       struct nfs4_statfs_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_sequence(&xdr, &res->seq_res, req); +		status = decode_sequence(xdr, &res->seq_res, req);  	if (!status) -		status = decode_putfh(&xdr); +		status = decode_putfh(xdr);  	if (!status) -		status = decode_statfs(&xdr, res->fsstat); +		status = decode_statfs(xdr, res->fsstat);  	return status;  }  /*   * Decode GETATTR_BITMAP response   */ -static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) +static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    struct nfs4_server_caps_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, req); +	status = decode_sequence(xdr, &res->seq_res, req);  	if (status)  		goto out; -	if ((status = decode_putfh(&xdr)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	status = decode_server_caps(&xdr, res); +	status = decode_server_caps(xdr, res);  out:  	return status;  } @@ -5895,80 +6796,72 @@ out:  /*   * Decode RENEW response   */ -static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct 
xdr_stream *xdr, +			      void *__unused)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_renew(&xdr); +		status = decode_renew(xdr);  	return status;  }  /*   * Decode SETCLIENTID response   */ -static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, -		struct nfs4_setclientid_res *res) +static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, +				    struct xdr_stream *xdr, +				    struct nfs4_setclientid_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_setclientid(&xdr, res); +		status = decode_setclientid(xdr, res);  	return status;  }  /*   * Decode SETCLIENTID_CONFIRM response   */ -static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) +static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, +					    struct xdr_stream *xdr)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); -	if (!status) -		status = decode_setclientid_confirm(&xdr); -	if (!status) -		status = decode_putrootfh(&xdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_fsinfo(&xdr, fsinfo); +		status = decode_setclientid_confirm(xdr);  	return status;  }  /*   * Decode DELEGRETURN response   */ -static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, +				    struct xdr_stream *xdr, +				    struct nfs4_delegreturnres *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status != 0)  		goto out; -	status = decode_delegreturn(&xdr); +	status = decode_getfattr(xdr, res->fattr, res->server);  	if (status != 0)  		goto out; -	decode_getfattr(&xdr, res->fattr, res->server, -			!RPC_IS_ASYNC(rqstp->rq_task)); +	status = decode_delegreturn(xdr);  out:  	return status;  } @@ -5976,159 +6869,276 @@ out:  /*   * Decode FS_LOCATIONS response   */ -static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, +				     struct xdr_stream *xdr,  				     struct nfs4_fs_locations_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &req->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, req); +	status = decode_sequence(xdr, &res->seq_res, req);  	if (status)  		goto out; -	if ((status = decode_putfh(&xdr)) != 0) +	status = decode_putfh(xdr); +	if (status)  		goto out; -	if ((status = decode_lookup(&xdr)) != 0) +	if (res->migration) { +		xdr_enter_page(xdr, PAGE_SIZE); +		status = decode_getfattr_generic(xdr, +					&res->fs_locations->fattr, +					 NULL, res->fs_locations, +					 NULL, res->fs_locations->server); +		if (status) +			goto out; +		if (res->renew) +			status = decode_renew(xdr); +	} else { +		status = decode_lookup(xdr); +		if (status) +			goto out; +		xdr_enter_page(xdr, PAGE_SIZE); +		status = decode_getfattr_generic(xdr, +					&res->fs_locations->fattr, +					 NULL, res->fs_locations, +					 NULL, res->fs_locations->server); +	} 
+out: +	return status; +} + +/* + * Decode SECINFO response + */ +static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, +				struct xdr_stream *xdr, +				struct nfs4_secinfo_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status)  		goto out; -	xdr_enter_page(&xdr, PAGE_SIZE); -	status = decode_getfattr(&xdr, &res->fs_locations->fattr, -				 res->fs_locations->server, -				 !RPC_IS_ASYNC(req->rq_task)); +	status = decode_putfh(xdr); +	if (status) +		goto out; +	status = decode_secinfo(xdr, res); +out: +	return status; +} + +/* + * Decode FSID_PRESENT response + */ +static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs4_fsid_present_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_putfh(xdr); +	if (status) +		goto out; +	status = decode_getfh(xdr, res->fh); +	if (status) +		goto out; +	if (res->renew) +		status = decode_renew(xdr);  out:  	return status;  }  #if defined(CONFIG_NFS_V4_1)  /* + * Decode BIND_CONN_TO_SESSION response + */ +static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp, +					struct xdr_stream *xdr, +					void *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (!status) +		status = decode_bind_conn_to_session(xdr, res); +	return status; +} + +/*   * Decode EXCHANGE_ID response   */ -static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, +				    struct xdr_stream *xdr,  				    void *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = 
decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_exchange_id(&xdr, res); +		status = decode_exchange_id(xdr, res);  	return status;  }  /*   * Decode CREATE_SESSION response   */ -static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, +				       struct xdr_stream *xdr,  				       struct nfs41_create_session_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_create_session(&xdr, res); +		status = decode_create_session(xdr, res);  	return status;  }  /*   * Decode DESTROY_SESSION response   */ -static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, uint32_t *p, -					void *dummy) +static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, +					struct xdr_stream *xdr, +					void *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (!status) +		status = decode_destroy_session(xdr, res); +	return status; +} + +/* + * Decode DESTROY_CLIENTID response + */ +static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp, +					struct xdr_stream *xdr, +					void *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_destroy_session(&xdr, dummy); +		status = decode_destroy_clientid(xdr, res);  	return status;  }  /*   * Decode SEQUENCE response   */ -static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, +				 struct xdr_stream *xdr,  				 struct nfs4_sequence_res *res)  { -	struct xdr_stream xdr;  	struct 
compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_sequence(&xdr, res, rqstp); +		status = decode_sequence(xdr, res, rqstp);  	return status;  }  /*   * Decode GET_LEASE_TIME response   */ -static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, +				       struct xdr_stream *xdr,  				       struct nfs4_get_lease_time_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_sequence(&xdr, &res->lr_seq_res, rqstp); +		status = decode_sequence(xdr, &res->lr_seq_res, rqstp);  	if (!status) -		status = decode_putrootfh(&xdr); +		status = decode_putrootfh(xdr);  	if (!status) -		status = decode_fsinfo(&xdr, res->lr_fsinfo); +		status = decode_fsinfo(xdr, res->lr_fsinfo);  	return status;  }  /*   * Decode RECLAIM_COMPLETE response   */ -static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, +					 struct xdr_stream *xdr,  					 struct nfs41_reclaim_complete_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (!status) -		status = decode_sequence(&xdr, &res->seq_res, rqstp); +		status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (!status) -		status = decode_reclaim_complete(&xdr, (void *)NULL); +		status = decode_reclaim_complete(xdr, (void *)NULL); +	return status; +} + +/* + * Decode GETDEVICELIST response + */ +static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, +				      struct 
xdr_stream *xdr, +				      struct nfs4_getdevicelist_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	dprintk("encoding getdevicelist!\n"); + +	status = decode_compound_hdr(xdr, &hdr); +	if (status != 0) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status != 0) +		goto out; +	status = decode_putfh(xdr); +	if (status != 0) +		goto out; +	status = decode_getdevicelist(xdr, res->devlist); +out:  	return status;  }  /*   * Decode GETDEVINFO response   */ -static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, +				      struct xdr_stream *xdr,  				      struct nfs4_getdeviceinfo_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status != 0)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status != 0)  		goto out; -	status = decode_getdeviceinfo(&xdr, res->pdev); +	status = decode_getdeviceinfo(xdr, res->pdev);  out:  	return status;  } @@ -6136,45 +7146,176 @@ out:  /*   * Decode LAYOUTGET response   */ -static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, +				  struct xdr_stream *xdr,  				  struct nfs4_layoutget_res *res)  { -	struct xdr_stream xdr;  	struct compound_hdr hdr;  	int status; -	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); -	status = decode_compound_hdr(&xdr, &hdr); +	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; -	status = decode_sequence(&xdr, &res->seq_res, rqstp); +	status = decode_sequence(xdr, &res->seq_res, rqstp);  	if (status)  		goto out; -	status = decode_putfh(&xdr); +	status = decode_putfh(xdr);  	if (status)  		goto out; -	status = decode_layoutget(&xdr, rqstp, res); +	
status = decode_layoutget(xdr, rqstp, res); +out: +	return status; +} + +/* + * Decode LAYOUTRETURN response + */ +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs4_layoutreturn_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_putfh(xdr); +	if (status) +		goto out; +	status = decode_layoutreturn(xdr, res); +out: +	return status; +} + +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs4_layoutcommit_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_putfh(xdr); +	if (status) +		goto out; +	status = decode_layoutcommit(xdr, rqstp, res); +	if (status) +		goto out; +	decode_getfattr(xdr, res->fattr, res->server); +out: +	return status; +} + +/* + * Decode SECINFO_NO_NAME response + */ +static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp, +					struct xdr_stream *xdr, +					struct nfs4_secinfo_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_putrootfh(xdr); +	if (status) +		goto out; +	status = decode_secinfo_no_name(xdr, res); +out: +	return status; +} + +/* + * Decode TEST_STATEID response + */ +static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs41_test_stateid_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = 
decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_test_stateid(xdr, res); +out: +	return status; +} + +/* + * Decode FREE_STATEID response + */ +static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp, +				     struct xdr_stream *xdr, +				     struct nfs41_free_stateid_res *res) +{ +	struct compound_hdr hdr; +	int status; + +	status = decode_compound_hdr(xdr, &hdr); +	if (status) +		goto out; +	status = decode_sequence(xdr, &res->seq_res, rqstp); +	if (status) +		goto out; +	status = decode_free_stateid(xdr, res);  out:  	return status;  }  #endif /* CONFIG_NFS_V4_1 */ -__be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, -			   struct nfs_server *server, int plus) +/** + * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in + *                      the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. 
+ */ +int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, +		       int plus)  { -	uint32_t bitmap[2] = {0}; +	unsigned int savep; +	uint32_t bitmap[3] = {0};  	uint32_t len;  	__be32 *p = xdr_inline_decode(xdr, 4);  	if (unlikely(!p))  		goto out_overflow; -	if (!ntohl(*p++)) { +	if (*p == xdr_zero) {  		p = xdr_inline_decode(xdr, 4);  		if (unlikely(!p))  			goto out_overflow; -		if (!ntohl(*p++)) -			return ERR_PTR(-EAGAIN); +		if (*p == xdr_zero) +			return -EAGAIN;  		entry->eof = 1; -		return ERR_PTR(-EBADCOOKIE); +		return -EBADCOOKIE;  	}  	p = xdr_inline_decode(xdr, 12); @@ -6182,7 +7323,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,  		goto out_overflow;  	entry->prev_cookie = entry->cookie;  	p = xdr_decode_hyper(p, &entry->cookie); -	entry->len = ntohl(*p++); +	entry->len = be32_to_cpup(p);  	p = xdr_inline_decode(xdr, entry->len);  	if (unlikely(!p)) @@ -6200,28 +7341,26 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,  	if (decode_attr_bitmap(xdr, bitmap) < 0)  		goto out_overflow; -	if (decode_attr_length(xdr, &len, &p) < 0) +	if (decode_attr_length(xdr, &len, &savep) < 0)  		goto out_overflow; -	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, server, 1) < 0) +	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, +			NULL, entry->label, entry->server) < 0)  		goto out_overflow; -	if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) +	if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) +		entry->ino = entry->fattr->mounted_on_fileid; +	else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID)  		entry->ino = entry->fattr->fileid; -	if (verify_attr_len(xdr, p, len) < 0) -		goto out_overflow; - -	p = xdr_inline_peek(xdr, 8); -	if (p != NULL) -		entry->eof = !p[0] && p[1]; -	else -		entry->eof = 0; +	entry->d_type = DT_UNKNOWN; +	if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) +		entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); -	return p; 
+	return 0;  out_overflow:  	print_overflow_msg(__func__, xdr); -	return ERR_PTR(-EIO); +	return -EAGAIN;  }  /* @@ -6252,8 +7391,6 @@ static struct {  	{ NFS4ERR_DQUOT,	-EDQUOT		},  	{ NFS4ERR_STALE,	-ESTALE		},  	{ NFS4ERR_BADHANDLE,	-EBADHANDLE	}, -	{ NFS4ERR_BADOWNER,	-EINVAL		}, -	{ NFS4ERR_BADNAME,	-EINVAL		},  	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},  	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},  	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	}, @@ -6263,10 +7400,6 @@ static struct {  	{ NFS4ERR_SYMLINK,	-ELOOP		},  	{ NFS4ERR_OP_ILLEGAL,	-EOPNOTSUPP	},  	{ NFS4ERR_DEADLOCK,	-EDEADLK	}, -	{ NFS4ERR_WRONGSEC,	-EPERM		}, /* FIXME: this needs -						    * to be handled by a -						    * middle-layer. -						    */  	{ -1,			-EIO		}  }; @@ -6297,8 +7430,8 @@ nfs4_stat_to_errno(int stat)  #define PROC(proc, argtype, restype)				\  [NFSPROC4_CLNT_##proc] = {					\  	.p_proc   = NFSPROC4_COMPOUND,				\ -	.p_encode = (kxdrproc_t) nfs4_xdr_##argtype,		\ -	.p_decode = (kxdrproc_t) nfs4_xdr_##restype,		\ +	.p_encode = (kxdreproc_t)nfs4_xdr_##argtype,		\ +	.p_decode = (kxdrdproc_t)nfs4_xdr_##restype,		\  	.p_arglen = NFS4_##argtype##_sz,			\  	.p_replen = NFS4_##restype##_sz,			\  	.p_statidx = NFSPROC4_CLNT_##proc,			\ @@ -6306,54 +7439,65 @@ nfs4_stat_to_errno(int stat)  }  struct rpc_procinfo	nfs4_procedures[] = { -  PROC(READ,		enc_read,	dec_read), -  PROC(WRITE,		enc_write,	dec_write), -  PROC(COMMIT,		enc_commit,	dec_commit), -  PROC(OPEN,		enc_open,	dec_open), -  PROC(OPEN_CONFIRM,	enc_open_confirm,	dec_open_confirm), -  PROC(OPEN_NOATTR,	enc_open_noattr,	dec_open_noattr), -  PROC(OPEN_DOWNGRADE,	enc_open_downgrade,	dec_open_downgrade), -  PROC(CLOSE,		enc_close,	dec_close), -  PROC(SETATTR,		enc_setattr,	dec_setattr), -  PROC(FSINFO,		enc_fsinfo,	dec_fsinfo), -  PROC(RENEW,		enc_renew,	dec_renew), -  PROC(SETCLIENTID,	enc_setclientid,	dec_setclientid), -  PROC(SETCLIENTID_CONFIRM,	enc_setclientid_confirm,	dec_setclientid_confirm), -  PROC(LOCK,            enc_lock,       dec_lock), -  
PROC(LOCKT,           enc_lockt,      dec_lockt), -  PROC(LOCKU,           enc_locku,      dec_locku), -  PROC(ACCESS,		enc_access,	dec_access), -  PROC(GETATTR,		enc_getattr,	dec_getattr), -  PROC(LOOKUP,		enc_lookup,	dec_lookup), -  PROC(LOOKUP_ROOT,	enc_lookup_root,	dec_lookup_root), -  PROC(REMOVE,		enc_remove,	dec_remove), -  PROC(RENAME,		enc_rename,	dec_rename), -  PROC(LINK,		enc_link,	dec_link), -  PROC(SYMLINK,		enc_symlink,	dec_symlink), -  PROC(CREATE,		enc_create,	dec_create), -  PROC(PATHCONF,	enc_pathconf,	dec_pathconf), -  PROC(STATFS,		enc_statfs,	dec_statfs), -  PROC(READLINK,	enc_readlink,	dec_readlink), -  PROC(READDIR,		enc_readdir,	dec_readdir), -  PROC(SERVER_CAPS,	enc_server_caps, dec_server_caps), -  PROC(DELEGRETURN,	enc_delegreturn, dec_delegreturn), -  PROC(GETACL,		enc_getacl,	dec_getacl), -  PROC(SETACL,		enc_setacl,	dec_setacl), -  PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations), -  PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), +	PROC(READ,		enc_read,		dec_read), +	PROC(WRITE,		enc_write,		dec_write), +	PROC(COMMIT,		enc_commit,		dec_commit), +	PROC(OPEN,		enc_open,		dec_open), +	PROC(OPEN_CONFIRM,	enc_open_confirm,	dec_open_confirm), +	PROC(OPEN_NOATTR,	enc_open_noattr,	dec_open_noattr), +	PROC(OPEN_DOWNGRADE,	enc_open_downgrade,	dec_open_downgrade), +	PROC(CLOSE,		enc_close,		dec_close), +	PROC(SETATTR,		enc_setattr,		dec_setattr), +	PROC(FSINFO,		enc_fsinfo,		dec_fsinfo), +	PROC(RENEW,		enc_renew,		dec_renew), +	PROC(SETCLIENTID,	enc_setclientid,	dec_setclientid), +	PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), +	PROC(LOCK,		enc_lock,		dec_lock), +	PROC(LOCKT,		enc_lockt,		dec_lockt), +	PROC(LOCKU,		enc_locku,		dec_locku), +	PROC(ACCESS,		enc_access,		dec_access), +	PROC(GETATTR,		enc_getattr,		dec_getattr), +	PROC(LOOKUP,		enc_lookup,		dec_lookup), +	PROC(LOOKUP_ROOT,	enc_lookup_root,	dec_lookup_root), +	PROC(REMOVE,		enc_remove,		dec_remove), +	PROC(RENAME,		
enc_rename,		dec_rename), +	PROC(LINK,		enc_link,		dec_link), +	PROC(SYMLINK,		enc_symlink,		dec_symlink), +	PROC(CREATE,		enc_create,		dec_create), +	PROC(PATHCONF,		enc_pathconf,		dec_pathconf), +	PROC(STATFS,		enc_statfs,		dec_statfs), +	PROC(READLINK,		enc_readlink,		dec_readlink), +	PROC(READDIR,		enc_readdir,		dec_readdir), +	PROC(SERVER_CAPS,	enc_server_caps,	dec_server_caps), +	PROC(DELEGRETURN,	enc_delegreturn,	dec_delegreturn), +	PROC(GETACL,		enc_getacl,		dec_getacl), +	PROC(SETACL,		enc_setacl,		dec_setacl), +	PROC(FS_LOCATIONS,	enc_fs_locations,	dec_fs_locations), +	PROC(RELEASE_LOCKOWNER,	enc_release_lockowner,	dec_release_lockowner), +	PROC(SECINFO,		enc_secinfo,		dec_secinfo), +	PROC(FSID_PRESENT,	enc_fsid_present,	dec_fsid_present),  #if defined(CONFIG_NFS_V4_1) -  PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id), -  PROC(CREATE_SESSION,	enc_create_session,	dec_create_session), -  PROC(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session), -  PROC(SEQUENCE,	enc_sequence,	dec_sequence), -  PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time), -  PROC(RECLAIM_COMPLETE, enc_reclaim_complete,  dec_reclaim_complete), -  PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), -  PROC(LAYOUTGET,  enc_layoutget,     dec_layoutget), +	PROC(EXCHANGE_ID,	enc_exchange_id,	dec_exchange_id), +	PROC(CREATE_SESSION,	enc_create_session,	dec_create_session), +	PROC(DESTROY_SESSION,	enc_destroy_session,	dec_destroy_session), +	PROC(SEQUENCE,		enc_sequence,		dec_sequence), +	PROC(GET_LEASE_TIME,	enc_get_lease_time,	dec_get_lease_time), +	PROC(RECLAIM_COMPLETE,	enc_reclaim_complete,	dec_reclaim_complete), +	PROC(GETDEVICEINFO,	enc_getdeviceinfo,	dec_getdeviceinfo), +	PROC(LAYOUTGET,		enc_layoutget,		dec_layoutget), +	PROC(LAYOUTCOMMIT,	enc_layoutcommit,	dec_layoutcommit), +	PROC(LAYOUTRETURN,	enc_layoutreturn,	dec_layoutreturn), +	PROC(SECINFO_NO_NAME,	enc_secinfo_no_name,	dec_secinfo_no_name), +	PROC(TEST_STATEID,	enc_test_stateid,	
dec_test_stateid), +	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid), +	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist), +	PROC(BIND_CONN_TO_SESSION, +			enc_bind_conn_to_session, dec_bind_conn_to_session), +	PROC(DESTROY_CLIENTID,	enc_destroy_clientid,	dec_destroy_clientid),  #endif /* CONFIG_NFS_V4_1 */  }; -struct rpc_version		nfs_version4 = { +const struct rpc_version nfs_version4 = {  	.number			= 4,  	.nrprocs		= ARRAY_SIZE(nfs4_procedures),  	.procs			= nfs4_procedures diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 903908a2002..cd3c910d2d1 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -86,11 +86,14 @@  /* Default path we try to mount. "%s" gets replaced by our IP address */  #define NFS_ROOT		"/tftpboot/%s" +/* Default NFSROOT mount options. */ +#define NFS_DEF_OPTIONS		"vers=2,udp,rsize=4096,wsize=4096" +  /* Parameters passed from the kernel command line */  static char nfs_root_parms[256] __initdata = "";  /* Text-based mount options passed to super.c */ -static char nfs_root_options[256] __initdata = ""; +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;  /* Address of NFS server */  static __be32 servaddr __initdata = htonl(INADDR_NONE); @@ -101,7 +104,7 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";  /* server:export path string passed to super.c */  static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; -#ifdef RPC_DEBUG +#ifdef NFS_DEBUG  /*   * When the "nfsrootdebug" kernel command line option is specified,   * enable debugging messages for NFSROOT. 
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,  }  static int __init root_nfs_cat(char *dest, const char *src, -				  const size_t destlen) +			       const size_t destlen)  { +	size_t len = strlen(dest); + +	if (len && dest[len - 1] != ',') +		if (strlcat(dest, ",", destlen) > destlen) +			return -1; +  	if (strlcat(dest, src, destlen) > destlen)  		return -1;  	return 0; @@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,  		if (root_nfs_cat(nfs_root_options, incoming,  						sizeof(nfs_root_options)))  			return -1; - -	/* -	 * Possibly prepare for more options to be appended -	 */ -	if (nfs_root_options[0] != '\0' && -	    nfs_root_options[strlen(nfs_root_options)] != ',') -		if (root_nfs_cat(nfs_root_options, ",", -						sizeof(nfs_root_options))) -			return -1; -  	return 0;  } @@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,   */  static int __init root_nfs_data(char *cmdline)  { -	char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; +	char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];  	int len, retval = -1;  	char *tmp = NULL;  	const size_t tmplen = sizeof(nfs_export_path); @@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)  	 * Append mandatory options for nfsroot so they override  	 * what has come before  	 */ -	snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", +	snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",  			&servaddr); -	if (root_nfs_cat(nfs_root_options, addr_option, +	if (root_nfs_cat(nfs_root_options, mand_options,  						sizeof(nfs_root_options)))  		goto out_optionstoolong; diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c new file mode 100644 index 00000000000..4eb0aead69b --- /dev/null +++ b/fs/nfs/nfstrace.c @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/nfs_fs.h> +#include 
<linux/namei.h> +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include "nfstrace.h" diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h new file mode 100644 index 00000000000..59f838cdc00 --- /dev/null +++ b/fs/nfs/nfstrace.h @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs + +#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS_H + +#include <linux/tracepoint.h> + +#define nfs_show_file_type(ftype) \ +	__print_symbolic(ftype, \ +			{ DT_UNKNOWN, "UNKNOWN" }, \ +			{ DT_FIFO, "FIFO" }, \ +			{ DT_CHR, "CHR" }, \ +			{ DT_DIR, "DIR" }, \ +			{ DT_BLK, "BLK" }, \ +			{ DT_REG, "REG" }, \ +			{ DT_LNK, "LNK" }, \ +			{ DT_SOCK, "SOCK" }, \ +			{ DT_WHT, "WHT" }) + +#define nfs_show_cache_validity(v) \ +	__print_flags(v, "|", \ +			{ NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \ +			{ NFS_INO_INVALID_DATA, "INVALID_DATA" }, \ +			{ NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ +			{ NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ +			{ NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ +			{ NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ +			{ NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ +			{ NFS_INO_INVALID_LABEL, "INVALID_LABEL" }) + +#define nfs_show_nfsi_flags(v) \ +	__print_flags(v, "|", \ +			{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ +			{ 1 << NFS_INO_STALE, "STALE" }, \ +			{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ +			{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ +			{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ +			{ 1 << NFS_INO_COMMIT, "COMMIT" }, \ +			{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ +			{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) + +DECLARE_EVENT_CLASS(nfs_inode_event, +		TP_PROTO( +			const struct inode *inode +		), + +		TP_ARGS(inode), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(u64, fileid) +			__field(u64, version) +		), + +		TP_fast_assign( +			const struct nfs_inode 
*nfsi = NFS_I(inode); +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = nfsi->fileid; +			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh); +			__entry->version = inode->i_version; +		), + +		TP_printk( +			"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ", +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle, +			(unsigned long long)__entry->version +		) +); + +DECLARE_EVENT_CLASS(nfs_inode_event_done, +		TP_PROTO( +			const struct inode *inode, +			int error +		), + +		TP_ARGS(inode, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(dev_t, dev) +			__field(u32, fhandle) +			__field(unsigned char, type) +			__field(u64, fileid) +			__field(u64, version) +			__field(loff_t, size) +			__field(unsigned long, nfsi_flags) +			__field(unsigned long, cache_validity) +		), + +		TP_fast_assign( +			const struct nfs_inode *nfsi = NFS_I(inode); +			__entry->error = error; +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = nfsi->fileid; +			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh); +			__entry->type = nfs_umode_to_dtype(inode->i_mode); +			__entry->version = inode->i_version; +			__entry->size = i_size_read(inode); +			__entry->nfsi_flags = nfsi->flags; +			__entry->cache_validity = nfsi->cache_validity; +		), + +		TP_printk( +			"error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " +			"type=%u (%s) version=%llu size=%lld " +			"cache_validity=%lu (%s) nfs_flags=%ld (%s)", +			__entry->error, +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->fileid, +			__entry->fhandle, +			__entry->type, +			nfs_show_file_type(__entry->type), +			(unsigned long long)__entry->version, +			(long long)__entry->size, +			__entry->cache_validity, +			nfs_show_cache_validity(__entry->cache_validity), +			__entry->nfsi_flags, +			nfs_show_nfsi_flags(__entry->nfsi_flags) +		) +); + +#define DEFINE_NFS_INODE_EVENT(name) \ +	DEFINE_EVENT(nfs_inode_event, name, \ +			TP_PROTO( \ 
+				const struct inode *inode \ +			), \ +			TP_ARGS(inode)) +#define DEFINE_NFS_INODE_EVENT_DONE(name) \ +	DEFINE_EVENT(nfs_inode_event_done, name, \ +			TP_PROTO( \ +				const struct inode *inode, \ +				int error \ +			), \ +			TP_ARGS(inode, error)) +DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit); +DEFINE_NFS_INODE_EVENT(nfs_getattr_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit); +DEFINE_NFS_INODE_EVENT(nfs_setattr_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit); +DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit); +DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); +DEFINE_NFS_INODE_EVENT(nfs_access_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit); + +#define show_lookup_flags(flags) \ +	__print_flags((unsigned long)flags, "|", \ +			{ LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \ +			{ LOOKUP_DIRECTORY, "DIRECTORY" }, \ +			{ LOOKUP_OPEN, "OPEN" }, \ +			{ LOOKUP_CREATE, "CREATE" }, \ +			{ LOOKUP_EXCL, "EXCL" }) + +DECLARE_EVENT_CLASS(nfs_lookup_event, +		TP_PROTO( +			const struct inode *dir, +			const struct dentry *dentry, +			unsigned int flags +		), + +		TP_ARGS(dir, dentry, flags), + +		TP_STRUCT__entry( +			__field(unsigned int, flags) +			__field(dev_t, dev) +			__field(u64, dir) +			__string(name, dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->flags = flags; +			__assign_str(name, dentry->d_name.name); +		), + +		TP_printk( +			"flags=%u (%s) name=%02x:%02x:%llu/%s", +			__entry->flags, 
+			show_lookup_flags(__entry->flags), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +#define DEFINE_NFS_LOOKUP_EVENT(name) \ +	DEFINE_EVENT(nfs_lookup_event, name, \ +			TP_PROTO( \ +				const struct inode *dir, \ +				const struct dentry *dentry, \ +				unsigned int flags \ +			), \ +			TP_ARGS(dir, dentry, flags)) + +DECLARE_EVENT_CLASS(nfs_lookup_event_done, +		TP_PROTO( +			const struct inode *dir, +			const struct dentry *dentry, +			unsigned int flags, +			int error +		), + +		TP_ARGS(dir, dentry, flags, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(unsigned int, flags) +			__field(dev_t, dev) +			__field(u64, dir) +			__string(name, dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->error = error; +			__entry->flags = flags; +			__assign_str(name, dentry->d_name.name); +		), + +		TP_printk( +			"error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", +			__entry->error, +			__entry->flags, +			show_lookup_flags(__entry->flags), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \ +	DEFINE_EVENT(nfs_lookup_event_done, name, \ +			TP_PROTO( \ +				const struct inode *dir, \ +				const struct dentry *dentry, \ +				unsigned int flags, \ +				int error \ +			), \ +			TP_ARGS(dir, dentry, flags, error)) + +DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit); +DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit); + +#define show_open_flags(flags) \ +	__print_flags((unsigned long)flags, "|", \ +		{ O_CREAT, "O_CREAT" }, \ +		{ O_EXCL, "O_EXCL" }, \ +		{ O_TRUNC, "O_TRUNC" }, \ +		{ O_APPEND, "O_APPEND" }, \ +		{ O_DSYNC, "O_DSYNC" }, \ +		{ O_DIRECT, "O_DIRECT" }, \ +		{ O_DIRECTORY, "O_DIRECTORY" }) 
+ +#define show_fmode_flags(mode) \ +	__print_flags(mode, "|", \ +		{ ((__force unsigned long)FMODE_READ), "READ" }, \ +		{ ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ +		{ ((__force unsigned long)FMODE_EXEC), "EXEC" }) + +TRACE_EVENT(nfs_atomic_open_enter, +		TP_PROTO( +			const struct inode *dir, +			const struct nfs_open_context *ctx, +			unsigned int flags +		), + +		TP_ARGS(dir, ctx, flags), + +		TP_STRUCT__entry( +			__field(unsigned int, flags) +			__field(unsigned int, fmode) +			__field(dev_t, dev) +			__field(u64, dir) +			__string(name, ctx->dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->flags = flags; +			__entry->fmode = (__force unsigned int)ctx->mode; +			__assign_str(name, ctx->dentry->d_name.name); +		), + +		TP_printk( +			"flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s", +			__entry->flags, +			show_open_flags(__entry->flags), +			show_fmode_flags(__entry->fmode), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +TRACE_EVENT(nfs_atomic_open_exit, +		TP_PROTO( +			const struct inode *dir, +			const struct nfs_open_context *ctx, +			unsigned int flags, +			int error +		), + +		TP_ARGS(dir, ctx, flags, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(unsigned int, flags) +			__field(unsigned int, fmode) +			__field(dev_t, dev) +			__field(u64, dir) +			__string(name, ctx->dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->error = error; +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->flags = flags; +			__entry->fmode = (__force unsigned int)ctx->mode; +			__assign_str(name, ctx->dentry->d_name.name); +		), + +		TP_printk( +			"error=%d flags=%u (%s) fmode=%s " +			"name=%02x:%02x:%llu/%s", +			__entry->error, +			__entry->flags, +			show_open_flags(__entry->flags), +			show_fmode_flags(__entry->fmode), +			MAJOR(__entry->dev), 
MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +TRACE_EVENT(nfs_create_enter, +		TP_PROTO( +			const struct inode *dir, +			const struct dentry *dentry, +			unsigned int flags +		), + +		TP_ARGS(dir, dentry, flags), + +		TP_STRUCT__entry( +			__field(unsigned int, flags) +			__field(dev_t, dev) +			__field(u64, dir) +			__string(name, dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->flags = flags; +			__assign_str(name, dentry->d_name.name); +		), + +		TP_printk( +			"flags=%u (%s) name=%02x:%02x:%llu/%s", +			__entry->flags, +			show_open_flags(__entry->flags), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +TRACE_EVENT(nfs_create_exit, +		TP_PROTO( +			const struct inode *dir, +			const struct dentry *dentry, +			unsigned int flags, +			int error +		), + +		TP_ARGS(dir, dentry, flags, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(unsigned int, flags) +			__field(dev_t, dev) +			__field(u64, dir) +			__string(name, dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->error = error; +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->flags = flags; +			__assign_str(name, dentry->d_name.name); +		), + +		TP_printk( +			"error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", +			__entry->error, +			__entry->flags, +			show_open_flags(__entry->flags), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +DECLARE_EVENT_CLASS(nfs_directory_event, +		TP_PROTO( +			const struct inode *dir, +			const struct dentry *dentry +		), + +		TP_ARGS(dir, dentry), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u64, dir) +			__string(name, dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			
__assign_str(name, dentry->d_name.name); +		), + +		TP_printk( +			"name=%02x:%02x:%llu/%s", +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +#define DEFINE_NFS_DIRECTORY_EVENT(name) \ +	DEFINE_EVENT(nfs_directory_event, name, \ +			TP_PROTO( \ +				const struct inode *dir, \ +				const struct dentry *dentry \ +			), \ +			TP_ARGS(dir, dentry)) + +DECLARE_EVENT_CLASS(nfs_directory_event_done, +		TP_PROTO( +			const struct inode *dir, +			const struct dentry *dentry, +			int error +		), + +		TP_ARGS(dir, dentry, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(dev_t, dev) +			__field(u64, dir) +			__string(name, dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->error = error; +			__assign_str(name, dentry->d_name.name); +		), + +		TP_printk( +			"error=%d name=%02x:%02x:%llu/%s", +			__entry->error, +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \ +	DEFINE_EVENT(nfs_directory_event_done, name, \ +			TP_PROTO( \ +				const struct inode *dir, \ +				const struct dentry *dentry, \ +				int error \ +			), \ +			TP_ARGS(dir, dentry, error)) + +DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit); + +TRACE_EVENT(nfs_link_enter, +		TP_PROTO( +			const struct inode *inode, +			
const struct inode *dir, +			const struct dentry *dentry +		), + +		TP_ARGS(inode, dir, dentry), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u64, fileid) +			__field(u64, dir) +			__string(name, dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->dir = NFS_FILEID(dir); +			__assign_str(name, dentry->d_name.name); +		), + +		TP_printk( +			"fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", +			MAJOR(__entry->dev), MINOR(__entry->dev), +			__entry->fileid, +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +TRACE_EVENT(nfs_link_exit, +		TP_PROTO( +			const struct inode *inode, +			const struct inode *dir, +			const struct dentry *dentry, +			int error +		), + +		TP_ARGS(inode, dir, dentry, error), + +		TP_STRUCT__entry( +			__field(int, error) +			__field(dev_t, dev) +			__field(u64, fileid) +			__field(u64, dir) +			__string(name, dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = inode->i_sb->s_dev; +			__entry->fileid = NFS_FILEID(inode); +			__entry->dir = NFS_FILEID(dir); +			__entry->error = error; +			__assign_str(name, dentry->d_name.name); +		), + +		TP_printk( +			"error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", +			__entry->error, +			MAJOR(__entry->dev), MINOR(__entry->dev), +			__entry->fileid, +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); + +DECLARE_EVENT_CLASS(nfs_rename_event, +		TP_PROTO( +			const struct inode *old_dir, +			const struct dentry *old_dentry, +			const struct inode *new_dir, +			const struct dentry *new_dentry +		), + +		TP_ARGS(old_dir, old_dentry, new_dir, new_dentry), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(u64, old_dir) +			__field(u64, new_dir) +			__string(old_name, old_dentry->d_name.name) +			__string(new_name, new_dentry->d_name.name) +		), + +		
TP_fast_assign( +			__entry->dev = old_dir->i_sb->s_dev; +			__entry->old_dir = NFS_FILEID(old_dir); +			__entry->new_dir = NFS_FILEID(new_dir); +			__assign_str(old_name, old_dentry->d_name.name); +			__assign_str(new_name, new_dentry->d_name.name); +		), + +		TP_printk( +			"old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s", +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->old_dir, +			__get_str(old_name), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->new_dir, +			__get_str(new_name) +		) +); +#define DEFINE_NFS_RENAME_EVENT(name) \ +	DEFINE_EVENT(nfs_rename_event, name, \ +			TP_PROTO( \ +				const struct inode *old_dir, \ +				const struct dentry *old_dentry, \ +				const struct inode *new_dir, \ +				const struct dentry *new_dentry \ +			), \ +			TP_ARGS(old_dir, old_dentry, new_dir, new_dentry)) + +DECLARE_EVENT_CLASS(nfs_rename_event_done, +		TP_PROTO( +			const struct inode *old_dir, +			const struct dentry *old_dentry, +			const struct inode *new_dir, +			const struct dentry *new_dentry, +			int error +		), + +		TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(int, error) +			__field(u64, old_dir) +			__string(old_name, old_dentry->d_name.name) +			__field(u64, new_dir) +			__string(new_name, new_dentry->d_name.name) +		), + +		TP_fast_assign( +			__entry->dev = old_dir->i_sb->s_dev; +			__entry->old_dir = NFS_FILEID(old_dir); +			__entry->new_dir = NFS_FILEID(new_dir); +			__entry->error = error; +			__assign_str(old_name, old_dentry->d_name.name); +			__assign_str(new_name, new_dentry->d_name.name); +		), + +		TP_printk( +			"error=%d old_name=%02x:%02x:%llu/%s " +			"new_name=%02x:%02x:%llu/%s", +			__entry->error, +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->old_dir, +			__get_str(old_name), +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->new_dir, +			
__get_str(new_name) +		) +); +#define DEFINE_NFS_RENAME_EVENT_DONE(name) \ +	DEFINE_EVENT(nfs_rename_event_done, name, \ +			TP_PROTO( \ +				const struct inode *old_dir, \ +				const struct dentry *old_dentry, \ +				const struct inode *new_dir, \ +				const struct dentry *new_dentry, \ +				int error \ +			), \ +			TP_ARGS(old_dir, old_dentry, new_dir, \ +				new_dentry, error)) + +DEFINE_NFS_RENAME_EVENT(nfs_rename_enter); +DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit); + +DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename); + +TRACE_EVENT(nfs_sillyrename_unlink, +		TP_PROTO( +			const struct nfs_unlinkdata *data, +			int error +		), + +		TP_ARGS(data, error), + +		TP_STRUCT__entry( +			__field(dev_t, dev) +			__field(int, error) +			__field(u64, dir) +			__dynamic_array(char, name, data->args.name.len + 1) +		), + +		TP_fast_assign( +			struct inode *dir = data->dir; +			size_t len = data->args.name.len; +			__entry->dev = dir->i_sb->s_dev; +			__entry->dir = NFS_FILEID(dir); +			__entry->error = error; +			memcpy(__get_dynamic_array(name), +				data->args.name.name, len); +			((char *)__get_dynamic_array(name))[len] = 0; +		), + +		TP_printk( +			"error=%d name=%02x:%02x:%llu/%s", +			__entry->error, +			MAJOR(__entry->dev), MINOR(__entry->dev), +			(unsigned long long)__entry->dir, +			__get_str(name) +		) +); +#endif /* _TRACE_NFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE nfstrace +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild new file mode 100644 index 00000000000..ed30ea072bb --- /dev/null +++ b/fs/nfs/objlayout/Kbuild @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module +# +objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c new file mode 100644 index 00000000000..611320753db --- /dev/null +++ b/fs/nfs/objlayout/objio_osd.c @@ -0,0 +1,706 @@ +/* + *  pNFS Objects layout implementation over open-osd initiator library + * + *  Copyright (C) 2009 Panasas Inc. [year of first publication] + *  All rights reserved. + * + *  Benny Halevy <bhalevy@panasas.com> + *  Boaz Harrosh <bharrosh@panasas.com> + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License version 2 + *  See the file COPYING included with this distribution for more details. + * + *  Redistribution and use in source and binary forms, with or without + *  modification, are permitted provided that the following conditions + *  are met: + * + *  1. Redistributions of source code must retain the above copyright + *     notice, this list of conditions and the following disclaimer. + *  2. Redistributions in binary form must reproduce the above copyright + *     notice, this list of conditions and the following disclaimer in the + *     documentation and/or other materials provided with the distribution. + *  3. Neither the name of the Panasas company nor the names of its + *     contributors may be used to endorse or promote products derived + *     from this software without specific prior written permission. 
+ * + *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <scsi/osd_ore.h> + +#include "objlayout.h" +#include "../internal.h" + +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD + +struct objio_dev_ent { +	struct nfs4_deviceid_node id_node; +	struct ore_dev od; +}; + +static void +objio_free_deviceid_node(struct nfs4_deviceid_node *d) +{ +	struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); + +	dprintk("%s: free od=%p\n", __func__, de->od.od); +	osduld_put_device(de->od.od); +	kfree(de); +} + +static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, +	const struct nfs4_deviceid *d_id) +{ +	struct nfs4_deviceid_node *d; +	struct objio_dev_ent *de; + +	d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); +	if (!d) +		return NULL; + +	de = container_of(d, struct objio_dev_ent, id_node); +	return de; +} + +static struct objio_dev_ent * +_dev_list_add(const struct nfs_server *nfss, +	const struct nfs4_deviceid *d_id, struct osd_dev *od, +	gfp_t gfp_flags) +{ +	struct nfs4_deviceid_node *d; +	struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); +	struct objio_dev_ent *n; + +	if (!de) { +		dprintk("%s: -ENOMEM od=%p\n", __func__, od); +		return NULL; +	} + +	
dprintk("%s: Adding od=%p\n", __func__, od); +	nfs4_init_deviceid_node(&de->id_node, +				nfss->pnfs_curr_ld, +				nfss->nfs_client, +				d_id); +	de->od.od = od; + +	d = nfs4_insert_deviceid_node(&de->id_node); +	n = container_of(d, struct objio_dev_ent, id_node); +	if (n != de) { +		dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); +		objio_free_deviceid_node(&de->id_node); +		de = n; +	} + +	return de; +} + +struct objio_segment { +	struct pnfs_layout_segment lseg; + +	struct ore_layout layout; +	struct ore_components oc; +}; + +static inline struct objio_segment * +OBJIO_LSEG(struct pnfs_layout_segment *lseg) +{ +	return container_of(lseg, struct objio_segment, lseg); +} + +struct objio_state { +	/* Generic layer */ +	struct objlayout_io_res oir; + +	bool sync; +	/*FIXME: Support for extra_bytes at ore_get_rw_state() */ +	struct ore_io_state *ios; +}; + +/* Send and wait for a get_device_info of devices in the layout, +   then look them up with the osd_initiator library */ +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, +	struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, +	gfp_t gfp_flags) +{ +	struct pnfs_osd_deviceaddr *deviceaddr; +	struct objio_dev_ent *ode; +	struct osd_dev *od; +	struct osd_dev_info odi; +	bool retry_flag = true; +	int err; + +	ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); +	if (ode) { +		objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ +		return 0; +	} + +	err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); +	if (unlikely(err)) { +		dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", +			__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); +		return err; +	} + +	odi.systemid_len = deviceaddr->oda_systemid.len; +	if (odi.systemid_len > sizeof(odi.systemid)) { +		dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", +			__func__, sizeof(odi.systemid)); +		err = -EINVAL; +		goto out; +	} else if (odi.systemid_len) +		
memcpy(odi.systemid, deviceaddr->oda_systemid.data, +		       odi.systemid_len); +	odi.osdname_len	 = deviceaddr->oda_osdname.len; +	odi.osdname	 = (u8 *)deviceaddr->oda_osdname.data; + +	if (!odi.osdname_len && !odi.systemid_len) { +		dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", +			__func__); +		err = -ENODEV; +		goto out; +	} + +retry_lookup: +	od = osduld_info_lookup(&odi); +	if (unlikely(IS_ERR(od))) { +		err = PTR_ERR(od); +		dprintk("%s: osduld_info_lookup => %d\n", __func__, err); +		if (err == -ENODEV && retry_flag) { +			err = objlayout_autologin(deviceaddr); +			if (likely(!err)) { +				retry_flag = false; +				goto retry_lookup; +			} +		} +		goto out; +	} + +	ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, +			    gfp_flags); +	objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ +	dprintk("Adding new dev_id(%llx:%llx)\n", +		_DEVID_LO(d_id), _DEVID_HI(d_id)); +out: +	objlayout_put_deviceinfo(deviceaddr); +	return err; +} + +static void copy_single_comp(struct ore_components *oc, unsigned c, +			     struct pnfs_osd_object_cred *src_comp) +{ +	struct ore_comp *ocomp = &oc->comps[c]; + +	WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ +	WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); + +	ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; +	ocomp->obj.id = src_comp->oc_object_id.oid_object_id; + +	memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); +} + +static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, +		       struct objio_segment **pseg) +{ +/*	This is the in memory structure of the objio_segment + * + *	struct __alloc_objio_segment { + *		struct objio_segment olseg; + *		struct ore_dev *ods[numdevs]; + *		struct ore_comp	comps[numdevs]; + *	} *aolseg; + *	NOTE: The code as above compiles and runs perfectly. It is elegant, + *	type safe and compact. 
At some Past time Linus has decided he does not + *	like variable length arrays, For the sake of this principal we uglify + *	the code as below. + */ +	struct objio_segment *lseg; +	size_t lseg_size = sizeof(*lseg) + +			numdevs * sizeof(lseg->oc.ods[0]) + +			numdevs * sizeof(*lseg->oc.comps); + +	lseg = kzalloc(lseg_size, gfp_flags); +	if (unlikely(!lseg)) { +		dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, +			numdevs, lseg_size); +		return -ENOMEM; +	} + +	lseg->oc.numdevs = numdevs; +	lseg->oc.single_comp = EC_MULTPLE_COMPS; +	lseg->oc.ods = (void *)(lseg + 1); +	lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); + +	*pseg = lseg; +	return 0; +} + +int objio_alloc_lseg(struct pnfs_layout_segment **outp, +	struct pnfs_layout_hdr *pnfslay, +	struct pnfs_layout_range *range, +	struct xdr_stream *xdr, +	gfp_t gfp_flags) +{ +	struct objio_segment *objio_seg; +	struct pnfs_osd_xdr_decode_layout_iter iter; +	struct pnfs_osd_layout layout; +	struct pnfs_osd_object_cred src_comp; +	unsigned cur_comp; +	int err; + +	err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); +	if (unlikely(err)) +		return err; + +	err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); +	if (unlikely(err)) +		return err; + +	objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; +	objio_seg->layout.group_width = layout.olo_map.odm_group_width; +	objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; +	objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; +	objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; + +	err = ore_verify_layout(layout.olo_map.odm_num_comps, +					  &objio_seg->layout); +	if (unlikely(err)) +		goto err; + +	objio_seg->oc.first_dev = layout.olo_comps_index; +	cur_comp = 0; +	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { +		copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); +		err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, +					   
&src_comp.oc_object_id.oid_device_id, +					   gfp_flags); +		if (err) +			goto err; +		++cur_comp; +	} +	/* pnfs_osd_xdr_decode_layout_comp returns false on error */ +	if (unlikely(err)) +		goto err; + +	*outp = &objio_seg->lseg; +	return 0; + +err: +	kfree(objio_seg); +	dprintk("%s: Error: return %d\n", __func__, err); +	*outp = NULL; +	return err; +} + +void objio_free_lseg(struct pnfs_layout_segment *lseg) +{ +	int i; +	struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + +	for (i = 0; i < objio_seg->oc.numdevs; i++) { +		struct ore_dev *od = objio_seg->oc.ods[i]; +		struct objio_dev_ent *ode; + +		if (!od) +			break; +		ode = container_of(od, typeof(*ode), od); +		nfs4_put_deviceid_node(&ode->id_node); +	} +	kfree(objio_seg); +} + +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, +	struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, +	loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, +	struct objio_state **outp) +{ +	struct objio_segment *objio_seg = OBJIO_LSEG(lseg); +	struct ore_io_state *ios; +	int ret; +	struct __alloc_objio_state { +		struct objio_state objios; +		struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; +	} *aos; + +	aos = kzalloc(sizeof(*aos), gfp_flags); +	if (unlikely(!aos)) +		return -ENOMEM; + +	objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, +			aos->ioerrs, rpcdata, pnfs_layout_type); + +	ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, +			       offset, count, &ios); +	if (unlikely(ret)) { +		kfree(aos); +		return ret; +	} + +	ios->pages = pages; +	ios->pgbase = pgbase; +	ios->private = aos; +	BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + +	aos->objios.sync = 0; +	aos->objios.ios = ios; +	*outp = &aos->objios; +	return 0; +} + +void objio_free_result(struct objlayout_io_res *oir) +{ +	struct objio_state *objios = container_of(oir, struct objio_state, oir); + +	ore_put_io_state(objios->ios); +	
kfree(objios); +} + +static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) +{ +	switch (oep) { +	case OSD_ERR_PRI_NO_ERROR: +		return (enum pnfs_osd_errno)0; + +	case OSD_ERR_PRI_CLEAR_PAGES: +		BUG_ON(1); +		return 0; + +	case OSD_ERR_PRI_RESOURCE: +		return PNFS_OSD_ERR_RESOURCE; +	case OSD_ERR_PRI_BAD_CRED: +		return PNFS_OSD_ERR_BAD_CRED; +	case OSD_ERR_PRI_NO_ACCESS: +		return PNFS_OSD_ERR_NO_ACCESS; +	case OSD_ERR_PRI_UNREACHABLE: +		return PNFS_OSD_ERR_UNREACHABLE; +	case OSD_ERR_PRI_NOT_FOUND: +		return PNFS_OSD_ERR_NOT_FOUND; +	case OSD_ERR_PRI_NO_SPACE: +		return PNFS_OSD_ERR_NO_SPACE; +	default: +		WARN_ON(1); +		/* fallthrough */ +	case OSD_ERR_PRI_EIO: +		return PNFS_OSD_ERR_EIO; +	} +} + +static void __on_dev_error(struct ore_io_state *ios, +	struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, +	u64 dev_offset, u64  dev_len) +{ +	struct objio_state *objios = ios->private; +	struct pnfs_osd_objid pooid; +	struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); +	/* FIXME: what to do with more-then-one-group layouts. 
We need to +	 * translate from ore_io_state index to oc->comps index +	 */ +	unsigned comp = dev_index; + +	pooid.oid_device_id = ode->id_node.deviceid; +	pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; +	pooid.oid_object_id = ios->oc->comps[comp].obj.id; + +	objlayout_io_set_result(&objios->oir, comp, +				&pooid, osd_pri_2_pnfs_err(oep), +				dev_offset, dev_len, !ios->reading); +} + +/* + * read + */ +static void _read_done(struct ore_io_state *ios, void *private) +{ +	struct objio_state *objios = private; +	ssize_t status; +	int ret = ore_check_io(ios, &__on_dev_error); + +	/* FIXME: _io_free(ios) can we dealocate the libosd resources; */ + +	if (likely(!ret)) +		status = ios->length; +	else +		status = ret; + +	objlayout_read_done(&objios->oir, status, objios->sync); +} + +int objio_read_pagelist(struct nfs_pgio_data *rdata) +{ +	struct nfs_pgio_header *hdr = rdata->header; +	struct objio_state *objios; +	int ret; + +	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, +			hdr->lseg, rdata->args.pages, rdata->args.pgbase, +			rdata->args.offset, rdata->args.count, rdata, +			GFP_KERNEL, &objios); +	if (unlikely(ret)) +		return ret; + +	objios->ios->done = _read_done; +	dprintk("%s: offset=0x%llx length=0x%x\n", __func__, +		rdata->args.offset, rdata->args.count); +	ret = ore_read(objios->ios); +	if (unlikely(ret)) +		objio_free_result(&objios->oir); +	return ret; +} + +/* + * write + */ +static void _write_done(struct ore_io_state *ios, void *private) +{ +	struct objio_state *objios = private; +	ssize_t status; +	int ret = ore_check_io(ios, &__on_dev_error); + +	/* FIXME: _io_free(ios) can we dealocate the libosd resources; */ + +	if (likely(!ret)) { +		/* FIXME: should be based on the OSD's persistence model +		 * See OSD2r05 Section 4.13 Data persistence model */ +		objios->oir.committed = NFS_FILE_SYNC; +		status = ios->length; +	} else { +		status = ret; +	} + +	objlayout_write_done(&objios->oir, status, objios->sync); +} + +static 
struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ +	struct objio_state *objios = priv; +	struct nfs_pgio_data *wdata = objios->oir.rpcdata; +	struct address_space *mapping = wdata->header->inode->i_mapping; +	pgoff_t index = offset / PAGE_SIZE; +	struct page *page; +	loff_t i_size = i_size_read(wdata->header->inode); + +	if (offset >= i_size) { +		*uptodate = true; +		dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); +		return ZERO_PAGE(0); +	} + +	page = find_get_page(mapping, index); +	if (!page) { +		page = find_or_create_page(mapping, index, GFP_NOFS); +		if (unlikely(!page)) { +			dprintk("%s: grab_cache_page Failed index=0x%lx\n", +				__func__, index); +			return NULL; +		} +		unlock_page(page); +	} +	if (PageDirty(page) || PageWriteback(page)) +		*uptodate = true; +	else +		*uptodate = PageUptodate(page); +	dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); +	return page; +} + +static void __r4w_put_page(void *priv, struct page *page) +{ +	dprintk("%s: index=0x%lx\n", __func__, +		(page == ZERO_PAGE(0)) ? 
-1UL : page->index); +	if (ZERO_PAGE(0) != page) +		page_cache_release(page); +	return; +} + +static const struct _ore_r4w_op _r4w_op = { +	.get_page = &__r4w_get_page, +	.put_page = &__r4w_put_page, +}; + +int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) +{ +	struct nfs_pgio_header *hdr = wdata->header; +	struct objio_state *objios; +	int ret; + +	ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, +			hdr->lseg, wdata->args.pages, wdata->args.pgbase, +			wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, +			&objios); +	if (unlikely(ret)) +		return ret; + +	objios->sync = 0 != (how & FLUSH_SYNC); +	objios->ios->r4w = &_r4w_op; + +	if (!objios->sync) +		objios->ios->done = _write_done; + +	dprintk("%s: offset=0x%llx length=0x%x\n", __func__, +		wdata->args.offset, wdata->args.count); +	ret = ore_write(objios->ios); +	if (unlikely(ret)) { +		objio_free_result(&objios->oir); +		return ret; +	} + +	if (objios->sync) +		_write_done(objios->ios, objios); + +	return 0; +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
+ */ +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, +			  struct nfs_page *prev, struct nfs_page *req) +{ +	unsigned int size; + +	size = pnfs_generic_pg_test(pgio, prev, req); + +	if (!size || pgio->pg_count + req->wb_bytes > +	    (unsigned long)pgio->pg_layout_private) +		return 0; + +	return min(size, req->wb_bytes); +} + +static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ +	pnfs_generic_pg_init_read(pgio, req); +	if (unlikely(pgio->pg_lseg == NULL)) +		return; /* Not pNFS */ + +	pgio->pg_layout_private = (void *) +				OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; +} + +static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, +				   unsigned long *stripe_end) +{ +	u32 stripe_off; +	unsigned stripe_size; + +	if (layout->raid_algorithm == PNFS_OSD_RAID_0) +		return true; + +	stripe_size = layout->stripe_unit * +				(layout->group_width - layout->parity); + +	div_u64_rem(offset, stripe_size, &stripe_off); +	if (!stripe_off) +		return true; + +	*stripe_end = stripe_size - stripe_off; +	return false; +} + +static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ +	unsigned long stripe_end = 0; +	u64 wb_size; + +	if (pgio->pg_dreq == NULL) +		wb_size = i_size_read(pgio->pg_inode) - req_offset(req); +	else +		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + +	pnfs_generic_pg_init_write(pgio, req, wb_size); +	if (unlikely(pgio->pg_lseg == NULL)) +		return; /* Not pNFS */ + +	if (req->wb_offset || +	    !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE, +			       &OBJIO_LSEG(pgio->pg_lseg)->layout, +			       &stripe_end)) { +		pgio->pg_layout_private = (void *)stripe_end; +	} else { +		pgio->pg_layout_private = (void *) +				OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; +	} +} + +static const struct nfs_pageio_ops objio_pg_read_ops = { +	.pg_init = objio_init_read, +	.pg_test = objio_pg_test, +	.pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct 
nfs_pageio_ops objio_pg_write_ops = { +	.pg_init = objio_init_write, +	.pg_test = objio_pg_test, +	.pg_doio = pnfs_generic_pg_writepages, +}; + +static struct pnfs_layoutdriver_type objlayout_type = { +	.id = LAYOUT_OSD2_OBJECTS, +	.name = "LAYOUT_OSD2_OBJECTS", +	.flags                   = PNFS_LAYOUTRET_ON_SETATTR | +				   PNFS_LAYOUTRET_ON_ERROR, + +	.owner		       	 = THIS_MODULE, +	.alloc_layout_hdr        = objlayout_alloc_layout_hdr, +	.free_layout_hdr         = objlayout_free_layout_hdr, + +	.alloc_lseg              = objlayout_alloc_lseg, +	.free_lseg               = objlayout_free_lseg, + +	.read_pagelist           = objlayout_read_pagelist, +	.write_pagelist          = objlayout_write_pagelist, +	.pg_read_ops             = &objio_pg_read_ops, +	.pg_write_ops            = &objio_pg_write_ops, + +	.free_deviceid_node	 = objio_free_deviceid_node, + +	.encode_layoutcommit	 = objlayout_encode_layoutcommit, +	.encode_layoutreturn     = objlayout_encode_layoutreturn, +}; + +MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); +MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>"); +MODULE_LICENSE("GPL"); + +static int __init +objlayout_init(void) +{ +	int ret = pnfs_register_layoutdriver(&objlayout_type); + +	if (ret) +		printk(KERN_INFO +			"NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n", +			__func__, ret); +	else +		printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n", +			__func__); +	return ret; +} + +static void __exit +objlayout_exit(void) +{ +	pnfs_unregister_layoutdriver(&objlayout_type); +	printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n", +	       __func__); +} + +MODULE_ALIAS("nfs-layouttype4-2"); + +module_init(objlayout_init); +module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c new file mode 100644 index 00000000000..765d3f54e98 --- /dev/null +++ b/fs/nfs/objlayout/objlayout.c @@ -0,0 +1,779 @@ +/* + *  pNFS Objects layout driver high level 
definitions + * + *  Copyright (C) 2007 Panasas Inc. [year of first publication] + *  All rights reserved. + * + *  Benny Halevy <bhalevy@panasas.com> + *  Boaz Harrosh <bharrosh@panasas.com> + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License version 2 + *  See the file COPYING included with this distribution for more details. + * + *  Redistribution and use in source and binary forms, with or without + *  modification, are permitted provided that the following conditions + *  are met: + * + *  1. Redistributions of source code must retain the above copyright + *     notice, this list of conditions and the following disclaimer. + *  2. Redistributions in binary form must reproduce the above copyright + *     notice, this list of conditions and the following disclaimer in the + *     documentation and/or other materials provided with the distribution. + *  3. Neither the name of the Panasas company nor the names of its + *     contributors may be used to endorse or promote products derived + *     from this software without specific prior written permission. + * + *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/kmod.h> +#include <linux/moduleparam.h> +#include <linux/ratelimit.h> +#include <scsi/osd_initiator.h> +#include "objlayout.h" + +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD +/* + * Create a objlayout layout structure for the given inode and return it. + */ +struct pnfs_layout_hdr * +objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ +	struct objlayout *objlay; + +	objlay = kzalloc(sizeof(struct objlayout), gfp_flags); +	if (!objlay) +		return NULL; +	spin_lock_init(&objlay->lock); +	INIT_LIST_HEAD(&objlay->err_list); +	dprintk("%s: Return %p\n", __func__, objlay); +	return &objlay->pnfs_layout; +} + +/* + * Free an objlayout layout structure + */ +void +objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ +	struct objlayout *objlay = OBJLAYOUT(lo); + +	dprintk("%s: objlay %p\n", __func__, objlay); + +	WARN_ON(!list_empty(&objlay->err_list)); +	kfree(objlay); +} + +/* + * Unmarshall layout and store it in pnfslay. + */ +struct pnfs_layout_segment * +objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, +		     struct nfs4_layoutget_res *lgr, +		     gfp_t gfp_flags) +{ +	int status = -ENOMEM; +	struct xdr_stream stream; +	struct xdr_buf buf = { +		.pages =  lgr->layoutp->pages, +		.page_len =  lgr->layoutp->len, +		.buflen =  lgr->layoutp->len, +		.len = lgr->layoutp->len, +	}; +	struct page *scratch; +	struct pnfs_layout_segment *lseg; + +	dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); + +	scratch = alloc_page(gfp_flags); +	if (!scratch) +		goto err_nofree; + +	xdr_init_decode(&stream, &buf, NULL); +	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + +	status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); +	if (unlikely(status)) { +		dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, +			status); +		goto err; +	} + +	__free_page(scratch); + +	dprintk("%s: Return %p\n", __func__, lseg); +	return lseg; + +err: +	__free_page(scratch); +err_nofree: +	dprintk("%s: Err 
Return=>%d\n", __func__, status); +	return ERR_PTR(status); +} + +/* + * Free a layout segement + */ +void +objlayout_free_lseg(struct pnfs_layout_segment *lseg) +{ +	dprintk("%s: freeing layout segment %p\n", __func__, lseg); + +	if (unlikely(!lseg)) +		return; + +	objio_free_lseg(lseg); +} + +/* + * I/O Operations + */ +static inline u64 +end_offset(u64 start, u64 len) +{ +	u64 end; + +	end = start + len; +	return end >= start ? end : NFS4_MAX_UINT64; +} + +static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, +			   struct page ***p_pages, unsigned *p_pgbase, +			   u64 offset, unsigned long count) +{ +	u64 lseg_end_offset; + +	BUG_ON(offset < lseg->pls_range.offset); +	lseg_end_offset = end_offset(lseg->pls_range.offset, +				     lseg->pls_range.length); +	BUG_ON(offset >= lseg_end_offset); +	WARN_ON(offset + count > lseg_end_offset); + +	if (*p_pgbase > PAGE_SIZE) { +		dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); +		*p_pages += *p_pgbase >> PAGE_SHIFT; +		*p_pgbase &= ~PAGE_MASK; +	} +} + +/* + * I/O done common code + */ +static void +objlayout_iodone(struct objlayout_io_res *oir) +{ +	if (likely(oir->status >= 0)) { +		objio_free_result(oir); +	} else { +		struct objlayout *objlay = oir->objlay; + +		spin_lock(&objlay->lock); +		objlay->delta_space_valid = OBJ_DSU_INVALID; +		list_add(&objlay->err_list, &oir->err_list); +		spin_unlock(&objlay->lock); +	} +} + +/* + * objlayout_io_set_result - Set an osd_error code on a specific osd comp. + * + * The @index component IO failed (error returned from target). Register + * the error for later reporting at layout-return. 
+ */ +void +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, +			struct pnfs_osd_objid *pooid, int osd_error, +			u64 offset, u64 length, bool is_write) +{ +	struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; + +	BUG_ON(index >= oir->num_comps); +	if (osd_error) { +		ioerr->oer_component = *pooid; +		ioerr->oer_comp_offset = offset; +		ioerr->oer_comp_length = length; +		ioerr->oer_iswrite = is_write; +		ioerr->oer_errno = osd_error; + +		dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " +			"par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", +			__func__, index, ioerr->oer_errno, +			ioerr->oer_iswrite, +			_DEVID_LO(&ioerr->oer_component.oid_device_id), +			_DEVID_HI(&ioerr->oer_component.oid_device_id), +			ioerr->oer_component.oid_partition_id, +			ioerr->oer_component.oid_object_id, +			ioerr->oer_comp_offset, +			ioerr->oer_comp_length); +	} else { +		/* User need not call if no error is reported */ +		ioerr->oer_errno = 0; +	} +} + +/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). 
+ * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_read_complete(struct work_struct *work) +{ +	struct rpc_task *task; +	struct nfs_pgio_data *rdata; + +	dprintk("%s enter\n", __func__); +	task = container_of(work, struct rpc_task, u.tk_work); +	rdata = container_of(task, struct nfs_pgio_data, task); + +	pnfs_ld_read_done(rdata); +} + +void +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) +{ +	struct nfs_pgio_data *rdata = oir->rpcdata; + +	oir->status = rdata->task.tk_status = status; +	if (status >= 0) +		rdata->res.count = status; +	else +		rdata->header->pnfs_error = status; +	objlayout_iodone(oir); +	/* must not use oir after this point */ + +	dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, +		status, rdata->res.eof, sync); + +	if (sync) +		pnfs_ld_read_done(rdata); +	else { +		INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); +		schedule_work(&rdata->task.u.tk_work); +	} +} + +/* + * Perform sync or async reads. 
+ */ +enum pnfs_try_status +objlayout_read_pagelist(struct nfs_pgio_data *rdata) +{ +	struct nfs_pgio_header *hdr = rdata->header; +	struct inode *inode = hdr->inode; +	loff_t offset = rdata->args.offset; +	size_t count = rdata->args.count; +	int err; +	loff_t eof; + +	eof = i_size_read(inode); +	if (unlikely(offset + count > eof)) { +		if (offset >= eof) { +			err = 0; +			rdata->res.count = 0; +			rdata->res.eof = 1; +			/*FIXME: do we need to call pnfs_ld_read_done() */ +			goto out; +		} +		count = eof - offset; +	} + +	rdata->res.eof = (offset + count) >= eof; +	_fix_verify_io_params(hdr->lseg, &rdata->args.pages, +			      &rdata->args.pgbase, +			      rdata->args.offset, rdata->args.count); + +	dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", +		__func__, inode->i_ino, offset, count, rdata->res.eof); + +	err = objio_read_pagelist(rdata); + out: +	if (unlikely(err)) { +		hdr->pnfs_error = err; +		dprintk("%s: Returned Error %d\n", __func__, err); +		return PNFS_NOT_ATTEMPTED; +	} +	return PNFS_ATTEMPTED; +} + +/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
+ * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_write_complete(struct work_struct *work) +{ +	struct rpc_task *task; +	struct nfs_pgio_data *wdata; + +	dprintk("%s enter\n", __func__); +	task = container_of(work, struct rpc_task, u.tk_work); +	wdata = container_of(task, struct nfs_pgio_data, task); + +	pnfs_ld_write_done(wdata); +} + +void +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) +{ +	struct nfs_pgio_data *wdata = oir->rpcdata; + +	oir->status = wdata->task.tk_status = status; +	if (status >= 0) { +		wdata->res.count = status; +		wdata->verf.committed = oir->committed; +	} else { +		wdata->header->pnfs_error = status; +	} +	objlayout_iodone(oir); +	/* must not use oir after this point */ + +	dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, +		status, wdata->verf.committed, sync); + +	if (sync) +		pnfs_ld_write_done(wdata); +	else { +		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); +		schedule_work(&wdata->task.u.tk_work); +	} +} + +/* + * Perform sync or async writes. 
+ */ +enum pnfs_try_status +objlayout_write_pagelist(struct nfs_pgio_data *wdata, +			 int how) +{ +	struct nfs_pgio_header *hdr = wdata->header; +	int err; + +	_fix_verify_io_params(hdr->lseg, &wdata->args.pages, +			      &wdata->args.pgbase, +			      wdata->args.offset, wdata->args.count); + +	err = objio_write_pagelist(wdata, how); +	if (unlikely(err)) { +		hdr->pnfs_error = err; +		dprintk("%s: Returned Error %d\n", __func__, err); +		return PNFS_NOT_ATTEMPTED; +	} +	return PNFS_ATTEMPTED; +} + +void +objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, +			      struct xdr_stream *xdr, +			      const struct nfs4_layoutcommit_args *args) +{ +	struct objlayout *objlay = OBJLAYOUT(pnfslay); +	struct pnfs_osd_layoutupdate lou; +	__be32 *start; + +	dprintk("%s: Begin\n", __func__); + +	spin_lock(&objlay->lock); +	lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); +	lou.dsu_delta = objlay->delta_space_used; +	objlay->delta_space_used = 0; +	objlay->delta_space_valid = OBJ_DSU_INIT; +	lou.olu_ioerr_flag = !list_empty(&objlay->err_list); +	spin_unlock(&objlay->lock); + +	start = xdr_reserve_space(xdr, 4); + +	BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); + +	*start = cpu_to_be32((xdr->p - start - 1) * 4); + +	dprintk("%s: Return delta_space_used %lld err %d\n", __func__, +		lou.dsu_delta, lou.olu_ioerr_flag); +} + +static int +err_prio(u32 oer_errno) +{ +	switch (oer_errno) { +	case 0: +		return 0; + +	case PNFS_OSD_ERR_RESOURCE: +		return OSD_ERR_PRI_RESOURCE; +	case PNFS_OSD_ERR_BAD_CRED: +		return OSD_ERR_PRI_BAD_CRED; +	case PNFS_OSD_ERR_NO_ACCESS: +		return OSD_ERR_PRI_NO_ACCESS; +	case PNFS_OSD_ERR_UNREACHABLE: +		return OSD_ERR_PRI_UNREACHABLE; +	case PNFS_OSD_ERR_NOT_FOUND: +		return OSD_ERR_PRI_NOT_FOUND; +	case PNFS_OSD_ERR_NO_SPACE: +		return OSD_ERR_PRI_NO_SPACE; +	default: +		WARN_ON(1); +		/* fallthrough */ +	case PNFS_OSD_ERR_EIO: +		return OSD_ERR_PRI_EIO; +	} +} + +static void +merge_ioerr(struct pnfs_osd_ioerr 
*dest_err, +	    const struct pnfs_osd_ioerr *src_err) +{ +	u64 dest_end, src_end; + +	if (!dest_err->oer_errno) { +		*dest_err = *src_err; +		/* accumulated device must be blank */ +		memset(&dest_err->oer_component.oid_device_id, 0, +			sizeof(dest_err->oer_component.oid_device_id)); + +		return; +	} + +	if (dest_err->oer_component.oid_partition_id != +				src_err->oer_component.oid_partition_id) +		dest_err->oer_component.oid_partition_id = 0; + +	if (dest_err->oer_component.oid_object_id != +				src_err->oer_component.oid_object_id) +		dest_err->oer_component.oid_object_id = 0; + +	if (dest_err->oer_comp_offset > src_err->oer_comp_offset) +		dest_err->oer_comp_offset = src_err->oer_comp_offset; + +	dest_end = end_offset(dest_err->oer_comp_offset, +			      dest_err->oer_comp_length); +	src_end =  end_offset(src_err->oer_comp_offset, +			      src_err->oer_comp_length); +	if (dest_end < src_end) +		dest_end = src_end; + +	dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; + +	if ((src_err->oer_iswrite == dest_err->oer_iswrite) && +	    (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { +			dest_err->oer_errno = src_err->oer_errno; +	} else if (src_err->oer_iswrite) { +		dest_err->oer_iswrite = true; +		dest_err->oer_errno = src_err->oer_errno; +	} +} + +static void +encode_accumulated_error(struct objlayout *objlay, __be32 *p) +{ +	struct objlayout_io_res *oir, *tmp; +	struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; + +	list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { +		unsigned i; + +		for (i = 0; i < oir->num_comps; i++) { +			struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; + +			if (!ioerr->oer_errno) +				continue; + +			printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " +				"is_write=%d dev(%llx:%llx) par=0x%llx " +				"obj=0x%llx offset=0x%llx length=0x%llx\n", +				__func__, i, ioerr->oer_errno, +				ioerr->oer_iswrite, +				_DEVID_LO(&ioerr->oer_component.oid_device_id), +				
_DEVID_HI(&ioerr->oer_component.oid_device_id), +				ioerr->oer_component.oid_partition_id, +				ioerr->oer_component.oid_object_id, +				ioerr->oer_comp_offset, +				ioerr->oer_comp_length); + +			merge_ioerr(&accumulated_err, ioerr); +		} +		list_del(&oir->err_list); +		objio_free_result(oir); +	} + +	pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); +} + +void +objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, +			      struct xdr_stream *xdr, +			      const struct nfs4_layoutreturn_args *args) +{ +	struct objlayout *objlay = OBJLAYOUT(pnfslay); +	struct objlayout_io_res *oir, *tmp; +	__be32 *start; + +	dprintk("%s: Begin\n", __func__); +	start = xdr_reserve_space(xdr, 4); +	BUG_ON(!start); + +	spin_lock(&objlay->lock); + +	list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { +		__be32 *last_xdr = NULL, *p; +		unsigned i; +		int res = 0; + +		for (i = 0; i < oir->num_comps; i++) { +			struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; + +			if (!ioerr->oer_errno) +				continue; + +			dprintk("%s: err[%d]: errno=%d is_write=%d " +				"dev(%llx:%llx) par=0x%llx obj=0x%llx " +				"offset=0x%llx length=0x%llx\n", +				__func__, i, ioerr->oer_errno, +				ioerr->oer_iswrite, +				_DEVID_LO(&ioerr->oer_component.oid_device_id), +				_DEVID_HI(&ioerr->oer_component.oid_device_id), +				ioerr->oer_component.oid_partition_id, +				ioerr->oer_component.oid_object_id, +				ioerr->oer_comp_offset, +				ioerr->oer_comp_length); + +			p = pnfs_osd_xdr_ioerr_reserve_space(xdr); +			if (unlikely(!p)) { +				res = -E2BIG; +				break; /* accumulated_error */ +			} + +			last_xdr = p; +			pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); +		} + +		/* TODO: use xdr_write_pages */ +		if (unlikely(res)) { +			/* no space for even one error descriptor */ +			BUG_ON(!last_xdr); + +			/* we've encountered a situation with lots and lots of +			 * errors and no space to encode them all. 
Use the last +			 * available slot to report the union of all the +			 * remaining errors. +			 */ +			encode_accumulated_error(objlay, last_xdr); +			goto loop_done; +		} +		list_del(&oir->err_list); +		objio_free_result(oir); +	} +loop_done: +	spin_unlock(&objlay->lock); + +	*start = cpu_to_be32((xdr->p - start - 1) * 4); +	dprintk("%s: Return\n", __func__); +} + + +/* + * Get Device Info API for io engines + */ +struct objlayout_deviceinfo { +	struct page *page; +	struct pnfs_osd_deviceaddr da; /* This must be last */ +}; + +/* Initialize and call nfs_getdeviceinfo, then decode and return a + * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() + * should be called. + */ +int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, +	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, +	gfp_t gfp_flags) +{ +	struct objlayout_deviceinfo *odi; +	struct pnfs_device pd; +	struct page *page, **pages; +	u32 *p; +	int err; + +	page = alloc_page(gfp_flags); +	if (!page) +		return -ENOMEM; + +	pages = &page; +	pd.pages = pages; + +	memcpy(&pd.dev_id, d_id, sizeof(*d_id)); +	pd.layout_type = LAYOUT_OSD2_OBJECTS; +	pd.pages = &page; +	pd.pgbase = 0; +	pd.pglen = PAGE_SIZE; +	pd.mincount = 0; +	pd.maxcount = PAGE_SIZE; + +	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd, +			pnfslay->plh_lc_cred); +	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); +	if (err) +		goto err_out; + +	p = page_address(page); +	odi = kzalloc(sizeof(*odi), gfp_flags); +	if (!odi) { +		err = -ENOMEM; +		goto err_out; +	} +	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); +	odi->page = page; +	*deviceaddr = &odi->da; +	return 0; + +err_out: +	__free_page(page); +	return err; +} + +void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) +{ +	struct objlayout_deviceinfo *odi = container_of(deviceaddr, +						struct objlayout_deviceinfo, +						da); + +	__free_page(odi->page); +	kfree(odi); +} + +enum { +	OBJLAYOUT_MAX_URI_LEN 
= 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, +	OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, +	OSD_LOGIN_UPCALL_PATHLEN  = 256 +}; + +static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; + +module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), +		    0600); +MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); + +struct __auto_login { +	char uri[OBJLAYOUT_MAX_URI_LEN]; +	char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; +	char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; +}; + +static int __objlayout_upcall(struct __auto_login *login) +{ +	static char *envp[] = { "HOME=/", +		"TERM=linux", +		"PATH=/sbin:/usr/sbin:/bin:/usr/bin", +		NULL +	}; +	char *argv[8]; +	int ret; + +	if (unlikely(!osd_login_prog[0])) { +		dprintk("%s: osd_login_prog is disabled\n", __func__); +		return -EACCES; +	} + +	dprintk("%s uri: %s\n", __func__, login->uri); +	dprintk("%s osdname %s\n", __func__, login->osdname); +	dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); + +	argv[0] = (char *)osd_login_prog; +	argv[1] = "-u"; +	argv[2] = login->uri; +	argv[3] = "-o"; +	argv[4] = login->osdname; +	argv[5] = "-s"; +	argv[6] = login->systemid_hex; +	argv[7] = NULL; + +	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); +	/* +	 * Disable the upcall mechanism if we're getting an ENOENT or +	 * EACCES error. The admin can re-enable it on the fly by using +	 * sysfs to set the objlayoutdriver.osd_login_prog module parameter once +	 * the problem has been fixed. 
+	 */ +	if (ret == -ENOENT || ret == -EACCES) { +		printk(KERN_ERR "PNFS-OBJ: %s was not found please set " +			"objlayoutdriver.osd_login_prog kernel parameter!\n", +			osd_login_prog); +		osd_login_prog[0] = '\0'; +	} +	dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); + +	return ret; +} + +/* Assume dest is all zeros */ +static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, +					   char *dest, int max_len, +					   const char *var_name) +{ +	if (!s.len) +		return; + +	if (s.len >= max_len) { +		pr_warn_ratelimited( +			"objlayout_autologin: %s: s.len(%d) >= max_len(%d)", +			var_name, s.len, max_len); +		s.len = max_len - 1; /* space for null terminator */ +	} + +	memcpy(dest, s.data, s.len); +} + +/* Assume sysid is all zeros */ +static void _sysid_2_hex(struct nfs4_string s, +		  char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) +{ +	int i; +	char *cur; + +	if (!s.len) +		return; + +	if (s.len != OSD_SYSTEMID_LEN) { +		pr_warn_ratelimited( +		    "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN", +		    s.len); +		if (s.len > OSD_SYSTEMID_LEN) +			s.len = OSD_SYSTEMID_LEN; +	} + +	cur = sysid; +	for (i = 0; i < s.len; i++) +		cur = hex_byte_pack(cur, s.data[i]); +} + +int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) +{ +	int rc; +	struct __auto_login login; + +	if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) +		return -ENODEV; + +	memset(&login, 0, sizeof(login)); +	__copy_nfsS_and_zero_terminate( +		deviceaddr->oda_targetaddr.ota_netaddr.r_addr, +		login.uri, sizeof(login.uri), "URI"); + +	__copy_nfsS_and_zero_terminate( +		deviceaddr->oda_osdname, +		login.osdname, sizeof(login.osdname), "OSDNAME"); + +	_sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); + +	rc = __objlayout_upcall(&login); +	if (rc > 0) /* script returns positive values */ +		rc = -ENODEV; + +	return rc; +} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h new file mode 100644 index 00000000000..01e041029a6 
--- /dev/null +++ b/fs/nfs/objlayout/objlayout.h @@ -0,0 +1,189 @@ +/* + *  Data types and function declerations for interfacing with the + *  pNFS standard object layout driver. + * + *  Copyright (C) 2007 Panasas Inc. [year of first publication] + *  All rights reserved. + * + *  Benny Halevy <bhalevy@panasas.com> + *  Boaz Harrosh <bharrosh@panasas.com> + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License version 2 + *  See the file COPYING included with this distribution for more details. + * + *  Redistribution and use in source and binary forms, with or without + *  modification, are permitted provided that the following conditions + *  are met: + * + *  1. Redistributions of source code must retain the above copyright + *     notice, this list of conditions and the following disclaimer. + *  2. Redistributions in binary form must reproduce the above copyright + *     notice, this list of conditions and the following disclaimer in the + *     documentation and/or other materials provided with the distribution. + *  3. Neither the name of the Panasas company nor the names of its + *     contributors may be used to endorse or promote products derived + *     from this software without specific prior written permission. + * + *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + *  DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _OBJLAYOUT_H +#define _OBJLAYOUT_H + +#include <linux/nfs_fs.h> +#include <linux/pnfs_osd_xdr.h> +#include "../pnfs.h" + +/* + * per-inode layout + */ +struct objlayout { +	struct pnfs_layout_hdr pnfs_layout; + +	 /* for layout_commit */ +	enum osd_delta_space_valid_enum { +		OBJ_DSU_INIT = 0, +		OBJ_DSU_VALID, +		OBJ_DSU_INVALID, +	} delta_space_valid; +	s64 delta_space_used;  /* consumed by write ops */ + +	 /* for layout_return */ +	spinlock_t lock; +	struct list_head err_list; +}; + +static inline struct objlayout * +OBJLAYOUT(struct pnfs_layout_hdr *lo) +{ +	return container_of(lo, struct objlayout, pnfs_layout); +} + +/* + * per-I/O operation state + * embedded in objects provider io_state data structure + */ +struct objlayout_io_res { +	struct objlayout *objlay; + +	void *rpcdata; +	int status;             /* res */ +	int committed;          /* res */ + +	/* Error reporting (layout_return) */ +	struct list_head err_list; +	unsigned num_comps; +	/* Pointer to array of error descriptors of size num_comps. +	 * It should contain as many entries as devices in the osd_layout +	 * that participate in the I/O. It is up to the io_engine to allocate +	 * needed space and set num_comps. 
+	 */ +	struct pnfs_osd_ioerr *ioerrs; +}; + +static inline +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, +			struct pnfs_osd_ioerr *ioerrs, void *rpcdata, +			struct pnfs_layout_hdr *pnfs_layout_type) +{ +	oir->objlay = OBJLAYOUT(pnfs_layout_type); +	oir->rpcdata = rpcdata; +	INIT_LIST_HEAD(&oir->err_list); +	oir->num_comps = num_comps; +	oir->ioerrs = ioerrs; +} + +/* + * Raid engine I/O API + */ +extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, +	struct pnfs_layout_hdr *pnfslay, +	struct pnfs_layout_range *range, +	struct xdr_stream *xdr, +	gfp_t gfp_flags); +extern void objio_free_lseg(struct pnfs_layout_segment *lseg); + +/* objio_free_result will free these @oir structs received from + * objlayout_{read,write}_done + */ +extern void objio_free_result(struct objlayout_io_res *oir); + +extern int objio_read_pagelist(struct nfs_pgio_data *rdata); +extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); + +/* + * callback API + */ +extern void objlayout_io_set_result(struct objlayout_io_res *oir, +			unsigned index, struct pnfs_osd_objid *pooid, +			int osd_error, u64 offset, u64 length, bool is_write); + +static inline void +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) +{ +	/* If one of the I/Os errored out and the delta_space_used was +	 * invalid we render the complete report as invalid. Protocol mandate +	 * the DSU be accurate or not reported. 
+	 */ +	spin_lock(&objlay->lock); +	if (objlay->delta_space_valid != OBJ_DSU_INVALID) { +		objlay->delta_space_valid = OBJ_DSU_VALID; +		objlay->delta_space_used += space_used; +	} +	spin_unlock(&objlay->lock); +} + +extern void objlayout_read_done(struct objlayout_io_res *oir, +				ssize_t status, bool sync); +extern void objlayout_write_done(struct objlayout_io_res *oir, +				 ssize_t status, bool sync); + +extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, +	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, +	gfp_t gfp_flags); +extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); + +/* + * exported generic objects function vectors + */ + +extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); +extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); + +extern struct pnfs_layout_segment *objlayout_alloc_lseg( +	struct pnfs_layout_hdr *, +	struct nfs4_layoutget_res *, +	gfp_t gfp_flags); +extern void objlayout_free_lseg(struct pnfs_layout_segment *); + +extern enum pnfs_try_status objlayout_read_pagelist( +	struct nfs_pgio_data *); + +extern enum pnfs_try_status objlayout_write_pagelist( +	struct nfs_pgio_data *, +	int how); + +extern void objlayout_encode_layoutcommit( +	struct pnfs_layout_hdr *, +	struct xdr_stream *, +	const struct nfs4_layoutcommit_args *); + +extern void objlayout_encode_layoutreturn( +	struct pnfs_layout_hdr *, +	struct xdr_stream *, +	const struct nfs4_layoutreturn_args *); + +extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); + +#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c new file mode 100644 index 00000000000..b3918f7ac34 --- /dev/null +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -0,0 +1,415 @@ +/* + *  Object-Based pNFS Layout XDR layer + * + *  Copyright (C) 2007 Panasas Inc. [year of first publication] + *  All rights reserved. 
+ * + *  Benny Halevy <bhalevy@panasas.com> + *  Boaz Harrosh <bharrosh@panasas.com> + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License version 2 + *  See the file COPYING included with this distribution for more details. + * + *  Redistribution and use in source and binary forms, with or without + *  modification, are permitted provided that the following conditions + *  are met: + * + *  1. Redistributions of source code must retain the above copyright + *     notice, this list of conditions and the following disclaimer. + *  2. Redistributions in binary form must reproduce the above copyright + *     notice, this list of conditions and the following disclaimer in the + *     documentation and/or other materials provided with the distribution. + *  3. Neither the name of the Panasas company nor the names of its + *     contributors may be used to endorse or promote products derived + *     from this software without specific prior written permission. + * + *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/pnfs_osd_xdr.h> + +#define NFSDBG_FACILITY         NFSDBG_PNFS_LD + +/* + * The following implementation is based on RFC5664 + */ + +/* + * struct pnfs_osd_objid { + *	struct nfs4_deviceid	oid_device_id; + *	u64			oid_partition_id; + *	u64			oid_object_id; + * }; // xdr size 32 bytes + */ +static __be32 * +_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) +{ +	p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, +				    sizeof(objid->oid_device_id.data)); + +	p = xdr_decode_hyper(p, &objid->oid_partition_id); +	p = xdr_decode_hyper(p, &objid->oid_object_id); +	return p; +} +/* + * struct pnfs_osd_opaque_cred { + *	u32 cred_len; + *	void *cred; + * }; // xdr size [variable] + * The return pointers are from the xdr buffer + */ +static int +_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, +			    struct xdr_stream *xdr) +{ +	__be32 *p = xdr_inline_decode(xdr, 1); + +	if (!p) +		return -EINVAL; + +	opaque_cred->cred_len = be32_to_cpu(*p++); + +	p = xdr_inline_decode(xdr, opaque_cred->cred_len); +	if (!p) +		return -EINVAL; + +	opaque_cred->cred = p; +	return 0; +} + +/* + * struct pnfs_osd_object_cred { + *	struct pnfs_osd_objid		oc_object_id; + *	u32				oc_osd_version; + *	u32				oc_cap_key_sec; + *	struct pnfs_osd_opaque_cred	oc_cap_key + *	struct pnfs_osd_opaque_cred	oc_cap; + * }; // xdr size 32 + 4 + 4 + [variable] + [variable] + */ +static int +_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, +			    struct xdr_stream *xdr) +{ +	__be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); +	int ret; + +	if (!p) +		return -EIO; + +	p = _osd_xdr_decode_objid(p, &comp->oc_object_id); +	comp->oc_osd_version = be32_to_cpup(p++); +	comp->oc_cap_key_sec = be32_to_cpup(p); + +	ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); +	if (unlikely(ret)) +		return ret; + +	ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); +	return ret; +} + +/* + * struct pnfs_osd_data_map { + *	u32	odm_num_comps; + *	
u64	odm_stripe_unit; + *	u32	odm_group_width; + *	u32	odm_group_depth; + *	u32	odm_mirror_cnt; + *	u32	odm_raid_algorithm; + * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 + */ +static inline int +_osd_data_map_xdr_sz(void) +{ +	return 4 + 8 + 4 + 4 + 4 + 4; +} + +static __be32 * +_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) +{ +	data_map->odm_num_comps = be32_to_cpup(p++); +	p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); +	data_map->odm_group_width = be32_to_cpup(p++); +	data_map->odm_group_depth = be32_to_cpup(p++); +	data_map->odm_mirror_cnt = be32_to_cpup(p++); +	data_map->odm_raid_algorithm = be32_to_cpup(p++); +	dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " +		"odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", +		__func__, +		data_map->odm_num_comps, +		(unsigned long long)data_map->odm_stripe_unit, +		data_map->odm_group_width, +		data_map->odm_group_depth, +		data_map->odm_mirror_cnt, +		data_map->odm_raid_algorithm); +	return p; +} + +int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, +	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) +{ +	__be32 *p; + +	memset(iter, 0, sizeof(*iter)); + +	p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); +	if (unlikely(!p)) +		return -EINVAL; + +	p = _osd_xdr_decode_data_map(p, &layout->olo_map); +	layout->olo_comps_index = be32_to_cpup(p++); +	layout->olo_num_comps = be32_to_cpup(p++); +	dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, +		layout->olo_comps_index, layout->olo_num_comps); + +	iter->total_comps = layout->olo_num_comps; +	return 0; +} + +bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, +	struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, +	int *err) +{ +	BUG_ON(iter->decoded_comps > iter->total_comps); +	if (iter->decoded_comps == iter->total_comps) +		return false; + +	*err = _osd_xdr_decode_object_cred(comp, xdr); +	if (unlikely(*err)) { +		
dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " +			"total_comps=%d\n", __func__, *err, +			iter->decoded_comps, iter->total_comps); +		return false; /* stop the loop */ +	} +	dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " +		"key_len=%u cap_len=%u\n", +		__func__, +		_DEVID_LO(&comp->oc_object_id.oid_device_id), +		_DEVID_HI(&comp->oc_object_id.oid_device_id), +		comp->oc_object_id.oid_partition_id, +		comp->oc_object_id.oid_object_id, +		comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); + +	iter->decoded_comps++; +	return true; +} + +/* + * Get Device Information Decoding + * + * Note: since Device Information is currently done synchronously, all + *       variable strings fields are left inside the rpc buffer and are only + *       pointed to by the pnfs_osd_deviceaddr members. So the read buffer + *       should not be freed while the returned information is in use. + */ +/* + *struct nfs4_string { + *	unsigned int len; + *	char *data; + *}; // size [variable] + * NOTE: Returned string points to inside the XDR buffer + */ +static __be32 * +__read_u8_opaque(__be32 *p, struct nfs4_string *str) +{ +	str->len = be32_to_cpup(p++); +	str->data = (char *)p; + +	p += XDR_QUADLEN(str->len); +	return p; +} + +/* + * struct pnfs_osd_targetid { + *	u32			oti_type; + *	struct nfs4_string	oti_scsi_device_id; + * };// size 4 + [variable] + */ +static __be32 * +__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) +{ +	u32 oti_type; + +	oti_type = be32_to_cpup(p++); +	targetid->oti_type = oti_type; + +	switch (oti_type) { +	case OBJ_TARGET_SCSI_NAME: +	case OBJ_TARGET_SCSI_DEVICE_ID: +		p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); +	} + +	return p; +} + +/* + * struct pnfs_osd_net_addr { + *	struct nfs4_string	r_netid; + *	struct nfs4_string	r_addr; + * }; + */ +static __be32 * +__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) +{ +	p = __read_u8_opaque(p, &netaddr->r_netid); +	p = __read_u8_opaque(p, &netaddr->r_addr); + +	
return p; +} + +/* + * struct pnfs_osd_targetaddr { + *	u32				ota_available; + *	struct pnfs_osd_net_addr	ota_netaddr; + * }; + */ +static __be32 * +__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) +{ +	u32 ota_available; + +	ota_available = be32_to_cpup(p++); +	targetaddr->ota_available = ota_available; + +	if (ota_available) +		p = __read_net_addr(p, &targetaddr->ota_netaddr); + + +	return p; +} + +/* + * struct pnfs_osd_deviceaddr { + *	struct pnfs_osd_targetid	oda_targetid; + *	struct pnfs_osd_targetaddr	oda_targetaddr; + *	u8				oda_lun[8]; + *	struct nfs4_string		oda_systemid; + *	struct pnfs_osd_object_cred	oda_root_obj_cred; + *	struct nfs4_string		oda_osdname; + * }; + */ + +/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does + * not have an xdr_stream + */ +static __be32 * +__read_opaque_cred(__be32 *p, +			      struct pnfs_osd_opaque_cred *opaque_cred) +{ +	opaque_cred->cred_len = be32_to_cpu(*p++); +	opaque_cred->cred = p; +	return p + XDR_QUADLEN(opaque_cred->cred_len); +} + +static __be32 * +__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) +{ +	p = _osd_xdr_decode_objid(p, &comp->oc_object_id); +	comp->oc_osd_version = be32_to_cpup(p++); +	comp->oc_cap_key_sec = be32_to_cpup(p++); + +	p = __read_opaque_cred(p, &comp->oc_cap_key); +	p = __read_opaque_cred(p, &comp->oc_cap); +	return p; +} + +void pnfs_osd_xdr_decode_deviceaddr( +	struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) +{ +	p = __read_targetid(p, &deviceaddr->oda_targetid); + +	p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); + +	p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, +				    sizeof(deviceaddr->oda_lun)); + +	p = __read_u8_opaque(p, &deviceaddr->oda_systemid); + +	p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); + +	p = __read_u8_opaque(p, &deviceaddr->oda_osdname); + +	/* libosd likes this terminated in dbg. 
It's last, so no problems */ +	deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; +} + +/* + * struct pnfs_osd_layoutupdate { + *	u32	dsu_valid; + *	s64	dsu_delta; + *	u32	olu_ioerr_flag; + * }; xdr size 4 + [8 only when dsu_valid] + 4 + * + * Returns 0 on success, -E2BIG when the stream has no room left. + */ +int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, +				 struct pnfs_osd_layoutupdate *lou) +{ +	/* +	 * dsu_delta is only encoded when dsu_valid is set, so reserve +	 * exactly what is written; reserving the 8 bytes unconditionally +	 * would leave unwritten bytes in the stream. +	 */ +	size_t len = 4 + (lou->dsu_valid ? 8 : 0) + 4; +	__be32 *p = xdr_reserve_space(xdr, len); + +	if (!p) +		return -E2BIG; + +	*p++ = cpu_to_be32(lou->dsu_valid); +	if (lou->dsu_valid) +		p = xdr_encode_hyper(p, lou->dsu_delta); +	*p++ = cpu_to_be32(lou->olu_ioerr_flag); +	return 0; +} + +/* + * struct pnfs_osd_objid { + *	struct nfs4_deviceid	oid_device_id; + *	u64			oid_partition_id; + *	u64			oid_object_id; + * }; // xdr size 32 bytes + */ +static inline __be32 * +pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) +{ +	p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, +				    sizeof(object_id->oid_device_id.data)); +	p = xdr_encode_hyper(p, object_id->oid_partition_id); +	p = xdr_encode_hyper(p, object_id->oid_object_id); + +	return p; +} + +/* + * struct pnfs_osd_ioerr { + *	struct pnfs_osd_objid	oer_component; + *	u64			oer_comp_offset; + *	u64			oer_comp_length; + *	u32			oer_iswrite; + *	u32			oer_errno; + * }; // xdr size 32 + 24 bytes + */ +void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) +{ +	p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); +	p = xdr_encode_hyper(p, ioerr->oer_comp_offset); +	p = xdr_encode_hyper(p, ioerr->oer_comp_length); +	*p++ = cpu_to_be32(ioerr->oer_iswrite); +	*p   = cpu_to_be32(ioerr->oer_errno); +} + +__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) +{ +	__be32 *p; + +	p = xdr_reserve_space(xdr, 32 + 24); +	if (unlikely(!p)) +		dprintk("%s: out of xdr space\n", __func__); + +	return p; +} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 137b549e63d..17fab89f635 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -13,25 +13,71 @@  #include 
<linux/file.h>  #include <linux/sched.h>  #include <linux/sunrpc/clnt.h> +#include <linux/nfs.h>  #include <linux/nfs3.h>  #include <linux/nfs4.h>  #include <linux/nfs_page.h>  #include <linux/nfs_fs.h>  #include <linux/nfs_mount.h> +#include <linux/export.h>  #include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY		NFSDBG_PAGECACHE  static struct kmem_cache *nfs_page_cachep; +static const struct rpc_call_ops nfs_pgio_common_ops; + +static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +{ +	p->npages = pagecount; +	if (pagecount <= ARRAY_SIZE(p->page_array)) +		p->pagevec = p->page_array; +	else { +		p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); +		if (!p->pagevec) +			p->npages = 0; +	} +	return p->pagevec != NULL; +} + +void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, +		       struct nfs_pgio_header *hdr, +		       void (*release)(struct nfs_pgio_header *hdr)) +{ +	hdr->req = nfs_list_entry(desc->pg_list.next); +	hdr->inode = desc->pg_inode; +	hdr->cred = hdr->req->wb_context->cred; +	hdr->io_start = req_offset(hdr->req); +	hdr->good_bytes = desc->pg_count; +	hdr->dreq = desc->pg_dreq; +	hdr->layout_private = desc->pg_layout_private; +	hdr->release = release; +	hdr->completion_ops = desc->pg_completion_ops; +	if (hdr->completion_ops->init_hdr) +		hdr->completion_ops->init_hdr(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgheader_init); + +void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) +{ +	spin_lock(&hdr->lock); +	if (pos < hdr->io_start + hdr->good_bytes) { +		set_bit(NFS_IOHDR_ERROR, &hdr->flags); +		clear_bit(NFS_IOHDR_EOF, &hdr->flags); +		hdr->good_bytes = pos - hdr->io_start; +		hdr->error = error; +	} +	spin_unlock(&hdr->lock); +}  static inline struct nfs_page *  nfs_page_alloc(void)  { -	struct nfs_page	*p; -	p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); -	if (p) { -		memset(p, 0, sizeof(*p)); +	struct nfs_page	*p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); +	if 
(p)  		INIT_LIST_HEAD(&p->wb_list); -	}  	return p;  } @@ -41,11 +87,217 @@ nfs_page_free(struct nfs_page *p)  	kmem_cache_free(nfs_page_cachep, p);  } +static void +nfs_iocounter_inc(struct nfs_io_counter *c) +{ +	atomic_inc(&c->io_count); +} + +static void +nfs_iocounter_dec(struct nfs_io_counter *c) +{ +	if (atomic_dec_and_test(&c->io_count)) { +		clear_bit(NFS_IO_INPROGRESS, &c->flags); +		smp_mb__after_atomic(); +		wake_up_bit(&c->flags, NFS_IO_INPROGRESS); +	} +} + +static int +__nfs_iocounter_wait(struct nfs_io_counter *c) +{ +	wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); +	DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); +	int ret = 0; + +	do { +		prepare_to_wait(wq, &q.wait, TASK_KILLABLE); +		set_bit(NFS_IO_INPROGRESS, &c->flags); +		if (atomic_read(&c->io_count) == 0) +			break; +		ret = nfs_wait_bit_killable(&c->flags); +	} while (atomic_read(&c->io_count) != 0); +	finish_wait(wq, &q.wait); +	return ret; +} + +/** + * nfs_iocounter_wait - wait for i/o to complete + * @c: nfs_io_counter to use + * + * returns -ERESTARTSYS if interrupted by a fatal signal. + * Otherwise returns 0 once the io_count hits 0. 
+ */ +int +nfs_iocounter_wait(struct nfs_io_counter *c) +{ +	if (atomic_read(&c->io_count) == 0) +		return 0; +	return __nfs_iocounter_wait(c); +} + +static int nfs_wait_bit_uninterruptible(void *word) +{ +	io_schedule(); +	return 0; +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req - request in group that is to be locked + * + * this lock must be held if modifying the page group list + */ +void +nfs_page_group_lock(struct nfs_page *req) +{ +	struct nfs_page *head = req->wb_head; + +	WARN_ON_ONCE(head != head->wb_head); + +	wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, +			nfs_wait_bit_uninterruptible, +			TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req - request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ +	struct nfs_page *head = req->wb_head; + +	WARN_ON_ONCE(head != head->wb_head); + +	smp_mb__before_atomic(); +	clear_bit(PG_HEADLOCK, &head->wb_flags); +	smp_mb__after_atomic(); +	wake_up_bit(&head->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_sync_on_bit_locked + * + * must be called with page group lock held + */ +static bool +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +{ +	struct nfs_page *head = req->wb_head; +	struct nfs_page *tmp; + +	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); +	WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); + +	tmp = req->wb_this_page; +	while (tmp != req) { +		if (!test_bit(bit, &tmp->wb_flags)) +			return false; +		tmp = tmp->wb_this_page; +	} + +	/* true! 
reset all bits */ +	tmp = req; +	do { +		clear_bit(bit, &tmp->wb_flags); +		tmp = tmp->wb_this_page; +	} while (tmp != req); + +	return true; +} + +/* + * nfs_page_group_sync_on_bit - set bit on current request, but only + *   return true if the bit is set for all requests in page group + * @req - request in page group + * @bit - PG_* bit that is used to sync page group + */ +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) +{ +	bool ret; + +	nfs_page_group_lock(req); +	ret = nfs_page_group_sync_on_bit_locked(req, bit); +	nfs_page_group_unlock(req); + +	return ret; +} + +/* + * nfs_page_group_init - Initialize the page group linkage for @req + * @req - a new nfs request + * @prev - the previous request in page group, or NULL if @req is the first + *         or only request in the group (the head). + */ +static inline void +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) +{ +	WARN_ON_ONCE(prev == req); + +	if (!prev) { +		/* a head request */ +		req->wb_head = req; +		req->wb_this_page = req; +	} else { +		/* a subrequest */ +		WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); +		WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); +		req->wb_head = prev->wb_head; +		req->wb_this_page = prev->wb_this_page; +		prev->wb_this_page = req; + +		/* All subrequests take a ref on the head request until +		 * nfs_page_group_destroy is called */ +		kref_get(&req->wb_head->wb_kref); + +		/* grab extra ref if head request has extra ref from +		 * the write/commit path to handle handoff between write +		 * and commit lists */ +		if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { +			set_bit(PG_INODE_REF, &req->wb_flags); +			kref_get(&req->wb_kref); +		} +	} +} + +/* + * nfs_page_group_destroy - sync the destruction of page groups + * @req - request that no longer needs the page group + * + * releases the page group reference from each member once all + * members have called this function. 
+ */ +static void +nfs_page_group_destroy(struct kref *kref) +{ +	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); +	struct nfs_page *tmp, *next; + +	/* subrequests must release the ref on the head request */ +	if (req->wb_head != req) +		nfs_release_request(req->wb_head); + +	if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) +		return; + +	tmp = req; +	do { +		next = tmp->wb_this_page; +		/* unlink and free */ +		tmp->wb_this_page = tmp; +		tmp->wb_head = tmp; +		nfs_free_request(tmp); +		tmp = next; +	} while (tmp != req); +} +  /**   * nfs_create_request - Create an NFS read/write request. - * @file: file descriptor to use - * @inode: inode to which the request is attached + * @ctx: open context to use   * @page: page to write + * @last: last nfs request created for this page group or NULL if head   * @offset: starting offset within the page for the write   * @count: number of bytes to read/write   * @@ -54,39 +306,41 @@ nfs_page_free(struct nfs_page *p)   * User should ensure it is safe to sleep in this function.   
*/  struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, -		   struct page *page, -		   unsigned int offset, unsigned int count) +nfs_create_request(struct nfs_open_context *ctx, struct page *page, +		   struct nfs_page *last, unsigned int offset, +		   unsigned int count)  {  	struct nfs_page		*req; +	struct nfs_lock_context *l_ctx; +	if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) +		return ERR_PTR(-EBADF);  	/* try to allocate the request struct */  	req = nfs_page_alloc();  	if (req == NULL)  		return ERR_PTR(-ENOMEM);  	/* get lock context early so we can deal with alloc failures */ -	req->wb_lock_context = nfs_get_lock_context(ctx); -	if (req->wb_lock_context == NULL) { +	l_ctx = nfs_get_lock_context(ctx); +	if (IS_ERR(l_ctx)) {  		nfs_page_free(req); -		return ERR_PTR(-ENOMEM); +		return ERR_CAST(l_ctx);  	} +	req->wb_lock_context = l_ctx; +	nfs_iocounter_inc(&l_ctx->io_count);  	/* Initialize the request struct. Initially, we assume a  	 * long write-back delay. This will be adjusted in  	 * update_nfs_request below if the region is not locked. 
*/  	req->wb_page    = page; -	atomic_set(&req->wb_complete, 0); -	req->wb_index	= page->index; +	req->wb_index	= page_file_index(page);  	page_cache_get(page); -	BUG_ON(PagePrivate(page)); -	BUG_ON(!PageLocked(page)); -	BUG_ON(page->mapping->host != inode);  	req->wb_offset  = offset;  	req->wb_pgbase	= offset;  	req->wb_bytes   = count;  	req->wb_context = get_nfs_open_context(ctx);  	kref_init(&req->wb_kref); +	nfs_page_group_init(req, last);  	return req;  } @@ -100,51 +354,30 @@ void nfs_unlock_request(struct nfs_page *req)  		printk(KERN_ERR "NFS: Invalid unlock attempted\n");  		BUG();  	} -	smp_mb__before_clear_bit(); +	smp_mb__before_atomic();  	clear_bit(PG_BUSY, &req->wb_flags); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(&req->wb_flags, PG_BUSY); -	nfs_release_request(req);  }  /** - * nfs_set_page_tag_locked - Tag a request as locked + * nfs_unlock_and_release_request - Unlock request and release the nfs_page   * @req:   */ -int nfs_set_page_tag_locked(struct nfs_page *req) -{ -	if (!nfs_lock_request_dontget(req)) -		return 0; -	if (req->wb_page != NULL) -		radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); -	return 1; -} - -/** - * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers - */ -void nfs_clear_page_tag_locked(struct nfs_page *req) +void nfs_unlock_and_release_request(struct nfs_page *req)  { -	if (req->wb_page != NULL) { -		struct inode *inode = req->wb_context->path.dentry->d_inode; -		struct nfs_inode *nfsi = NFS_I(inode); - -		spin_lock(&inode->i_lock); -		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); -		nfs_unlock_request(req); -		spin_unlock(&inode->i_lock); -	} else -		nfs_unlock_request(req); +	nfs_unlock_request(req); +	nfs_release_request(req);  } -/** +/*   * nfs_clear_request - Free up all resources allocated to the request   * @req:   *   * Release page and open context resources associated 
with a read/write   * request after it has completed.   */ -void nfs_clear_request(struct nfs_page *req) +static void nfs_clear_request(struct nfs_page *req)  {  	struct page *page = req->wb_page;  	struct nfs_open_context *ctx = req->wb_context; @@ -155,6 +388,7 @@ void nfs_clear_request(struct nfs_page *req)  		req->wb_page = NULL;  	}  	if (l_ctx != NULL) { +		nfs_iocounter_dec(&l_ctx->io_count);  		nfs_put_lock_context(l_ctx);  		req->wb_lock_context = NULL;  	} @@ -164,16 +398,22 @@ void nfs_clear_request(struct nfs_page *req)  	}  } -  /**   * nfs_release_request - Release the count on an NFS read/write request   * @req: request to release   *   * Note: Should never be called with the spinlock held!   */ -static void nfs_free_request(struct kref *kref) +void nfs_free_request(struct nfs_page *req)  { -	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); +	WARN_ON_ONCE(req->wb_this_page != req); + +	/* extra debug: make sure no sync bits are still set */ +	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); +	WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); +	WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); +	WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags)); +	WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));  	/* Release struct file and open context */  	nfs_clear_request(req); @@ -182,13 +422,7 @@ static void nfs_free_request(struct kref *kref)  void nfs_release_request(struct nfs_page *req)  { -	kref_put(&req->wb_kref, nfs_free_request); -} - -static int nfs_wait_bit_uninterruptible(void *word) -{ -	io_schedule(); -	return 0; +	kref_put(&req->wb_kref, nfs_page_group_destroy);  }  /** @@ -206,6 +440,249 @@ nfs_wait_on_request(struct nfs_page *req)  			TASK_UNINTERRUPTIBLE);  } +/* + * nfs_generic_pg_test - determine if requests can be coalesced + * @desc: pointer to descriptor + * @prev: previous request in desc, or NULL + * @req: this request + * + * Returns zero if @req can be coalesced into @desc, otherwise it returns + * the 
size of the request. + */ +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, +			   struct nfs_page *prev, struct nfs_page *req) +{ +	if (desc->pg_count > desc->pg_bsize) { +		/* should never happen */ +		WARN_ON_ONCE(1); +		return 0; +	} + +	return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); +} +EXPORT_SYMBOL_GPL(nfs_generic_pg_test); + +static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) +{ +	return container_of(hdr, struct nfs_rw_header, header); +} + +/** + * nfs_rw_header_alloc - Allocate a header for a read or write + * @ops: Read or write function vector + */ +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) +{ +	struct nfs_rw_header *header = ops->rw_alloc_header(); + +	if (header) { +		struct nfs_pgio_header *hdr = &header->header; + +		INIT_LIST_HEAD(&hdr->pages); +		spin_lock_init(&hdr->lock); +		atomic_set(&hdr->refcnt, 0); +		hdr->rw_ops = ops; +	} +	return header; +} +EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); + +/* + * nfs_rw_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_rw_header_free(struct nfs_pgio_header *hdr) +{ +	hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); +} +EXPORT_SYMBOL_GPL(nfs_rw_header_free); + +/** + * nfs_pgio_data_alloc - Allocate pageio data + * @hdr: The header making a request + * @pagecount: Number of pages to create + */ +static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, +						 unsigned int pagecount) +{ +	struct nfs_pgio_data *data, *prealloc; + +	prealloc = &NFS_RW_HEADER(hdr)->rpc_data; +	if (prealloc->header == NULL) +		data = prealloc; +	else +		data = kzalloc(sizeof(*data), GFP_KERNEL); +	if (!data) +		goto out; + +	if (nfs_pgarray_set(&data->pages, pagecount)) { +		data->header = hdr; +		atomic_inc(&hdr->refcnt); +	} else { +		if (data != prealloc) +			kfree(data); +		data = NULL; +	} +out: +	return data; +} + +/** + * nfs_pgio_data_release - Properly free pageio data + * 
@data: The data to release + */ +void nfs_pgio_data_release(struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; +	struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); + +	put_nfs_open_context(data->args.context); +	if (data->pages.pagevec != data->pages.page_array) +		kfree(data->pages.pagevec); +	if (data == &pageio_header->rpc_data) { +		data->header = NULL; +		data = NULL; +	} +	if (atomic_dec_and_test(&hdr->refcnt)) +		hdr->completion_ops->completion(hdr); +	/* Note: we only free the rpc_task after callbacks are done. +	 * See the comment in rpc_free_task() for why +	 */ +	kfree(data); +} +EXPORT_SYMBOL_GPL(nfs_pgio_data_release); + +/** + * nfs_pgio_rpcsetup - Set up arguments for a pageio call + * @data: The pageio data + * @count: Number of bytes to read + * @offset: Initial offset + * @how: How to commit data (writes only) + * @cinfo: Commit information for the call (writes only) + */ +static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, +			      unsigned int count, unsigned int offset, +			      int how, struct nfs_commit_info *cinfo) +{ +	struct nfs_page *req = data->header->req; + +	/* Set up the RPC argument and reply structs +	 * NB: take care not to mess about with data->commit et al. 
*/ + +	data->args.fh     = NFS_FH(data->header->inode); +	data->args.offset = req_offset(req) + offset; +	/* pnfs_set_layoutcommit needs this */ +	data->mds_offset = data->args.offset; +	data->args.pgbase = req->wb_pgbase + offset; +	data->args.pages  = data->pages.pagevec; +	data->args.count  = count; +	data->args.context = get_nfs_open_context(req->wb_context); +	data->args.lock_context = req->wb_lock_context; +	data->args.stable  = NFS_UNSTABLE; +	switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { +	case 0: +		break; +	case FLUSH_COND_STABLE: +		if (nfs_reqs_to_commit(cinfo)) +			break; +	default: +		data->args.stable = NFS_FILE_SYNC; +	} + +	data->res.fattr   = &data->fattr; +	data->res.count   = count; +	data->res.eof     = 0; +	data->res.verf    = &data->verf; +	nfs_fattr_init(&data->fattr); +} + +/** + * nfs_pgio_prepare - Prepare pageio data to go over the wire + * @task: The current task + * @calldata: pageio data to prepare + */ +static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) +{ +	struct nfs_pgio_data *data = calldata; +	int err; +	err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); +	if (err) +		rpc_exit(task, err); +} + +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, +		      const struct rpc_call_ops *call_ops, int how, int flags) +{ +	struct rpc_task *task; +	struct rpc_message msg = { +		.rpc_argp = &data->args, +		.rpc_resp = &data->res, +		.rpc_cred = data->header->cred, +	}; +	struct rpc_task_setup task_setup_data = { +		.rpc_client = clnt, +		.task = &data->task, +		.rpc_message = &msg, +		.callback_ops = call_ops, +		.callback_data = data, +		.workqueue = nfsiod_workqueue, +		.flags = RPC_TASK_ASYNC | flags, +	}; +	int ret = 0; + +	data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); + +	dprintk("NFS: %5u initiated pgio call " +		"(req %s/%llu, %u bytes @ offset %llu)\n", +		data->task.tk_pid, +		data->header->inode->i_sb->s_id, +		(unsigned long 
long)NFS_FILEID(data->header->inode), +		data->args.count, +		(unsigned long long)data->args.offset); + +	task = rpc_run_task(&task_setup_data); +	if (IS_ERR(task)) { +		ret = PTR_ERR(task); +		goto out; +	} +	if (how & FLUSH_SYNC) { +		ret = rpc_wait_for_completion_task(task); +		if (ret == 0) +			ret = task->tk_status; +	} +	rpc_put_task(task); +out: +	return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_pgio); + +/** + * nfs_pgio_error - Clean up from a pageio error + * @desc: IO descriptor + * @hdr: pageio header + */ +static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, +			  struct nfs_pgio_header *hdr) +{ +	set_bit(NFS_IOHDR_REDO, &hdr->flags); +	nfs_pgio_data_release(hdr->data); +	hdr->data = NULL; +	desc->pg_completion_ops->error_cleanup(&desc->pg_list); +	return -ENOMEM; +} + +/** + * nfs_pgio_release - Release pageio data + * @calldata: The pageio data to release + */ +static void nfs_pgio_release(void *calldata) +{ +	struct nfs_pgio_data *data = calldata; +	if (data->header->rw_ops->rw_release) +		data->header->rw_ops->rw_release(data); +	nfs_pgio_data_release(data); +} +  /**   * nfs_pageio_init - initialise a page io descriptor   * @desc: pointer to descriptor @@ -216,7 +693,9 @@ nfs_wait_on_request(struct nfs_page *req)   */  void nfs_pageio_init(struct nfs_pageio_descriptor *desc,  		     struct inode *inode, -		     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +		     const struct nfs_pageio_ops *pg_ops, +		     const struct nfs_pgio_completion_ops *compl_ops, +		     const struct nfs_rw_ops *rw_ops,  		     size_t bsize,  		     int io_flags)  { @@ -225,10 +704,119 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,  	desc->pg_count = 0;  	desc->pg_bsize = bsize;  	desc->pg_base = 0; +	desc->pg_moreio = 0; +	desc->pg_recoalesce = 0;  	desc->pg_inode = inode; -	desc->pg_doio = doio; +	desc->pg_ops = pg_ops; +	desc->pg_completion_ops = compl_ops; +	desc->pg_rw_ops = rw_ops;  	desc->pg_ioflags = io_flags;  	
desc->pg_error = 0; +	desc->pg_lseg = NULL; +	desc->pg_dreq = NULL; +	desc->pg_layout_private = NULL; +} +EXPORT_SYMBOL_GPL(nfs_pageio_init); + +/** + * nfs_pgio_result - Basic pageio error handling + * @task: The task that ran + * @calldata: Pageio data to check + */ +static void nfs_pgio_result(struct rpc_task *task, void *calldata) +{ +	struct nfs_pgio_data *data = calldata; +	struct inode *inode = data->header->inode; + +	dprintk("NFS: %s: %5u, (status %d)\n", __func__, +		task->tk_pid, task->tk_status); + +	if (data->header->rw_ops->rw_done(task, data, inode) != 0) +		return; +	if (task->tk_status < 0) +		nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); +	else +		data->header->rw_ops->rw_result(task, data); +} + +/* + * Create an RPC task for the given read or write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. 
+ */ +int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, +		     struct nfs_pgio_header *hdr) +{ +	struct nfs_page		*req; +	struct page		**pages; +	struct nfs_pgio_data	*data; +	struct list_head *head = &desc->pg_list; +	struct nfs_commit_info cinfo; + +	data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, +							   desc->pg_count)); +	if (!data) +		return nfs_pgio_error(desc, hdr); + +	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); +	pages = data->pages.pagevec; +	while (!list_empty(head)) { +		req = nfs_list_entry(head->next); +		nfs_list_remove_request(req); +		nfs_list_add_request(req, &hdr->pages); +		*pages++ = req->wb_page; +	} + +	if ((desc->pg_ioflags & FLUSH_COND_STABLE) && +	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) +		desc->pg_ioflags &= ~FLUSH_COND_STABLE; + +	/* Set up the argument struct */ +	nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); +	hdr->data = data; +	desc->pg_rpc_callops = &nfs_pgio_common_ops; +	return 0; +} +EXPORT_SYMBOL_GPL(nfs_generic_pgio); + +static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) +{ +	struct nfs_rw_header *rw_hdr; +	struct nfs_pgio_header *hdr; +	int ret; + +	rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); +	if (!rw_hdr) { +		desc->pg_completion_ops->error_cleanup(&desc->pg_list); +		return -ENOMEM; +	} +	hdr = &rw_hdr->header; +	nfs_pgheader_init(desc, hdr, nfs_rw_header_free); +	atomic_inc(&hdr->refcnt); +	ret = nfs_generic_pgio(desc, hdr); +	if (ret == 0) +		ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), +					hdr->data, desc->pg_rpc_callops, +					desc->pg_ioflags, 0); +	if (atomic_dec_and_test(&hdr->refcnt)) +		hdr->completion_ops->completion(hdr); +	return ret; +} + +static bool nfs_match_open_context(const struct nfs_open_context *ctx1, +		const struct nfs_open_context *ctx2) +{ +	return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + +static bool nfs_match_lock_context(const struct nfs_lock_context *l1, +		const struct 
nfs_lock_context *l2) +{ +	return l1->lockowner.l_owner == l2->lockowner.l_owner +		&& l1->lockowner.l_pid == l2->lockowner.l_pid;  }  /** @@ -242,22 +830,27 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,   *   * Return 'true' if this is the case, else return 'false'.   */ -static int nfs_can_coalesce_requests(struct nfs_page *prev, -				     struct nfs_page *req) +static bool nfs_can_coalesce_requests(struct nfs_page *prev, +				      struct nfs_page *req, +				      struct nfs_pageio_descriptor *pgio)  { -	if (req->wb_context->cred != prev->wb_context->cred) -		return 0; -	if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) -		return 0; -	if (req->wb_context->state != prev->wb_context->state) -		return 0; -	if (req->wb_index != (prev->wb_index + 1)) -		return 0; -	if (req->wb_pgbase != 0) -		return 0; -	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) -		return 0; -	return 1; +	size_t size; + +	if (prev) { +		if (!nfs_match_open_context(req->wb_context, prev->wb_context)) +			return false; +		if (req->wb_context->dentry->d_inode->i_flock != NULL && +		    !nfs_match_lock_context(req->wb_lock_context, +					    prev->wb_lock_context)) +			return false; +		if (req_offset(req) != req_offset(prev) + prev->wb_bytes) +			return false; +	} +	size = pgio->pg_ops->pg_test(pgio, prev, req); +	WARN_ON_ONCE(size > req->wb_bytes); +	if (size && size < req->wb_bytes) +		req->wb_bytes = size; +	return size > 0;  }  /** @@ -271,31 +864,19 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,  static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,  				     struct nfs_page *req)  { -	size_t newlen = req->wb_bytes; - +	struct nfs_page *prev = NULL;  	if (desc->pg_count != 0) { -		struct nfs_page *prev; - -		/* -		 * FIXME: ideally we should be able to coalesce all requests -		 * that are not block boundary aligned, but currently this -		 * is problematic for the case of bsize < PAGE_CACHE_SIZE, -		 * since 
nfs_flush_multi and nfs_pagein_multi assume you -		 * can have only one struct nfs_page. -		 */ -		if (desc->pg_bsize < PAGE_SIZE) -			return 0; -		newlen += desc->pg_count; -		if (newlen > desc->pg_bsize) -			return 0;  		prev = nfs_list_entry(desc->pg_list.prev); -		if (!nfs_can_coalesce_requests(prev, req)) -			return 0; -	} else +	} else { +		if (desc->pg_ops->pg_init) +			desc->pg_ops->pg_init(desc, req);  		desc->pg_base = req->wb_pgbase; +	} +	if (!nfs_can_coalesce_requests(prev, req, desc)) +		return 0;  	nfs_list_remove_request(req);  	nfs_list_add_request(req, &desc->pg_list); -	desc->pg_count = newlen; +	desc->pg_count += req->wb_bytes;  	return 1;  } @@ -305,12 +886,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,  static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)  {  	if (!list_empty(&desc->pg_list)) { -		int error = desc->pg_doio(desc->pg_inode, -					  &desc->pg_list, -					  nfs_page_array_len(desc->pg_base, -							     desc->pg_count), -					  desc->pg_count, -					  desc->pg_ioflags); +		int error = desc->pg_ops->pg_doio(desc);  		if (error < 0)  			desc->pg_error = error;  		else @@ -327,28 +903,133 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)   * @desc: destination io descriptor   * @req: request   * + * This may split a request into subrequests which are all part of the + * same page group. + *   * Returns true if the request 'req' was successfully coalesced into the   * existing list of pages 'desc'.   
*/ -int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,  			   struct nfs_page *req)  { -	while (!nfs_pageio_do_add_request(desc, req)) { -		nfs_pageio_doio(desc); -		if (desc->pg_error < 0) -			return 0; -	} +	struct nfs_page *subreq; +	unsigned int bytes_left = 0; +	unsigned int offset, pgbase; + +	nfs_page_group_lock(req); + +	subreq = req; +	bytes_left = subreq->wb_bytes; +	offset = subreq->wb_offset; +	pgbase = subreq->wb_pgbase; + +	do { +		if (!nfs_pageio_do_add_request(desc, subreq)) { +			/* make sure pg_test call(s) did nothing */ +			WARN_ON_ONCE(subreq->wb_bytes != bytes_left); +			WARN_ON_ONCE(subreq->wb_offset != offset); +			WARN_ON_ONCE(subreq->wb_pgbase != pgbase); + +			nfs_page_group_unlock(req); +			desc->pg_moreio = 1; +			nfs_pageio_doio(desc); +			if (desc->pg_error < 0) +				return 0; +			if (desc->pg_recoalesce) +				return 0; +			/* retry add_request for this subreq */ +			nfs_page_group_lock(req); +			continue; +		} + +		/* check for buggy pg_test call(s) */ +		WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); +		WARN_ON_ONCE(subreq->wb_bytes > bytes_left); +		WARN_ON_ONCE(subreq->wb_bytes == 0); + +		bytes_left -= subreq->wb_bytes; +		offset += subreq->wb_bytes; +		pgbase += subreq->wb_bytes; + +		if (bytes_left) { +			subreq = nfs_create_request(req->wb_context, +					req->wb_page, +					subreq, pgbase, bytes_left); +			if (IS_ERR(subreq)) +				goto err_ptr; +			nfs_lock_request(subreq); +			subreq->wb_offset  = offset; +			subreq->wb_index = req->wb_index; +		} +	} while (bytes_left > 0); + +	nfs_page_group_unlock(req);  	return 1; +err_ptr: +	desc->pg_error = PTR_ERR(subreq); +	nfs_page_group_unlock(req); +	return 0;  } +static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) +{ +	LIST_HEAD(head); + +	do { +		list_splice_init(&desc->pg_list, &head); +		desc->pg_bytes_written -= desc->pg_count; +		desc->pg_count = 0; +		
desc->pg_base = 0; +		desc->pg_recoalesce = 0; +		desc->pg_moreio = 0; + +		while (!list_empty(&head)) { +			struct nfs_page *req; + +			req = list_first_entry(&head, struct nfs_page, wb_list); +			nfs_list_remove_request(req); +			if (__nfs_pageio_add_request(desc, req)) +				continue; +			if (desc->pg_error < 0) +				return 0; +			break; +		} +	} while (desc->pg_recoalesce); +	return 1; +} + +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +		struct nfs_page *req) +{ +	int ret; + +	do { +		ret = __nfs_pageio_add_request(desc, req); +		if (ret) +			break; +		if (desc->pg_error < 0) +			break; +		ret = nfs_do_recoalesce(desc); +	} while (ret); +	return ret; +} +EXPORT_SYMBOL_GPL(nfs_pageio_add_request); +  /**   * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor   * @desc: pointer to io descriptor   */  void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)  { -	nfs_pageio_doio(desc); +	for (;;) { +		nfs_pageio_doio(desc); +		if (!desc->pg_recoalesce) +			break; +		if (!nfs_do_recoalesce(desc)) +			break; +	}  } +EXPORT_SYMBOL_GPL(nfs_pageio_complete);  /**   * nfs_pageio_cond_complete - Conditional I/O completion @@ -366,68 +1047,8 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)  	if (!list_empty(&desc->pg_list)) {  		struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);  		if (index != prev->wb_index + 1) -			nfs_pageio_doio(desc); -	} -} - -#define NFS_SCAN_MAXENTRIES 16 -/** - * nfs_scan_list - Scan a list for matching requests - * @nfsi: NFS inode - * @dst: Destination list - * @idx_start: lower bound of page->index to scan - * @npages: idx_start + npages sets the upper bound to scan. - * @tag: tag to scan for - * - * Moves elements from one of the inode request lists. - * If the number of requests is set to 0, the entire address_space - * starting at index idx_start, is scanned. - * The requests are *not* checked to ensure that they form a contiguous set. 
- * You must be holding the inode's i_lock when calling this function - */ -int nfs_scan_list(struct nfs_inode *nfsi, -		struct list_head *dst, pgoff_t idx_start, -		unsigned int npages, int tag) -{ -	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; -	struct nfs_page *req; -	pgoff_t idx_end; -	int found, i; -	int res; - -	res = 0; -	if (npages == 0) -		idx_end = ~0; -	else -		idx_end = idx_start + npages - 1; - -	for (;;) { -		found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, -				(void **)&pgvec[0], idx_start, -				NFS_SCAN_MAXENTRIES, tag); -		if (found <= 0) -			break; -		for (i = 0; i < found; i++) { -			req = pgvec[i]; -			if (req->wb_index > idx_end) -				goto out; -			idx_start = req->wb_index + 1; -			if (nfs_set_page_tag_locked(req)) { -				kref_get(&req->wb_kref); -				nfs_list_remove_request(req); -				radix_tree_tag_clear(&nfsi->nfs_page_tree, -						req->wb_index, tag); -				nfs_list_add_request(req, dst); -				res++; -				if (res == INT_MAX) -					goto out; -			} -		} -		/* for latency reduction */ -		cond_resched_lock(&nfsi->vfs_inode.i_lock); +			nfs_pageio_complete(desc);  	} -out: -	return res;  }  int __init nfs_init_nfspagecache(void) @@ -447,3 +1068,13 @@ void nfs_destroy_nfspagecache(void)  	kmem_cache_destroy(nfs_page_cachep);  } +static const struct rpc_call_ops nfs_pgio_common_ops = { +	.rpc_call_prepare = nfs_pgio_prepare, +	.rpc_call_done = nfs_pgio_result, +	.rpc_release = nfs_pgio_release, +}; + +const struct nfs_pageio_ops nfs_pgio_rw_ops = { +	.pg_test = nfs_generic_pg_test, +	.pg_doio = nfs_generic_pg_pgios, +}; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index db773428f95..6fdcd233d6f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -28,10 +28,15 @@   */  #include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/module.h>  #include "internal.h"  #include "pnfs.h" +#include "iostat.h" +#include "nfs4trace.h"  #define NFSDBG_FACILITY		NFSDBG_PNFS +#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)  /* Locking:   * @@ -67,6 
+72,10 @@ find_pnfs_driver(u32 id)  	spin_lock(&pnfs_spinlock);  	local = find_pnfs_driver_locked(id); +	if (local != NULL && !try_module_get(local->owner)) { +		dprintk("%s: Could not grab reference on module\n", __func__); +		local = NULL; +	}  	spin_unlock(&pnfs_spinlock);  	return local;  } @@ -75,7 +84,11 @@ void  unset_pnfs_layoutdriver(struct nfs_server *nfss)  {  	if (nfss->pnfs_curr_ld) { -		nfss->pnfs_curr_ld->clear_layoutdriver(nfss); +		if (nfss->pnfs_curr_ld->clear_layoutdriver) +			nfss->pnfs_curr_ld->clear_layoutdriver(nfss); +		/* Decrement the MDS count. Purge the deviceid cache if zero */ +		if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count)) +			nfs4_deviceid_purge_client(nfss->nfs_client);  		module_put(nfss->pnfs_curr_ld->owner);  	}  	nfss->pnfs_curr_ld = NULL; @@ -88,7 +101,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)   * @id layout type. Zero (illegal layout type) indicates pNFS not in use.   */  void -set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, +		      u32 id)  {  	struct pnfs_layoutdriver_type *ld_type = NULL; @@ -96,8 +110,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)  		goto out_no_driver;  	if (!(server->nfs_client->cl_exchange_flags &  		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { -		printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, -		       id, server->nfs_client->cl_exchange_flags); +		printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", +			__func__, id, server->nfs_client->cl_exchange_flags);  		goto out_no_driver;  	}  	ld_type = find_pnfs_driver(id); @@ -110,18 +124,17 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)  			goto out_no_driver;  		}  	} -	if (!try_module_get(ld_type->owner)) { -		dprintk("%s: Could not grab reference on module\n", __func__); -		goto out_no_driver; -	}  	server->pnfs_curr_ld = ld_type; -	if (ld_type->set_layoutdriver(server)) { -		
printk(KERN_ERR -		       "%s: Error initializing mount point for layout driver %u.\n", -		       __func__, id); +	if (ld_type->set_layoutdriver +	    && ld_type->set_layoutdriver(server, mntfh)) { +		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout " +			"driver %u.\n", __func__, id);  		module_put(ld_type->owner);  		goto out_no_driver;  	} +	/* Bump the MDS count */ +	atomic_inc(&server->nfs_client->cl_mds_count); +  	dprintk("%s: pNFS module for %u set\n", __func__, id);  	return; @@ -137,11 +150,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)  	struct pnfs_layoutdriver_type *tmp;  	if (ld_type->id == 0) { -		printk(KERN_ERR "%s id 0 is reserved\n", __func__); +		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);  		return status;  	}  	if (!ld_type->alloc_lseg || !ld_type->free_lseg) { -		printk(KERN_ERR "%s Layout driver must provide " +		printk(KERN_ERR "NFS: %s Layout driver must provide "  		       "alloc_lseg and free_lseg.\n", __func__);  		return status;  	} @@ -154,7 +167,7 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)  		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,  			ld_type->name);  	} else { -		printk(KERN_ERR "%s Module with id %d already loaded!\n", +		printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",  			__func__, ld_type->id);  	}  	spin_unlock(&pnfs_spinlock); @@ -177,105 +190,304 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);   * pNFS client layout cache   */ -static void -get_layout_hdr_locked(struct pnfs_layout_hdr *lo) +/* Need to hold i_lock if caller does not already hold reference */ +void +pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)  { -	assert_spin_locked(&lo->inode->i_lock); -	lo->refcount++; +	atomic_inc(&lo->plh_refcount); +} + +static struct pnfs_layout_hdr * +pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) +{ +	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; +	return ld->alloc_layout_hdr(ino, 
gfp_flags);  }  static void -put_layout_hdr_locked(struct pnfs_layout_hdr *lo) +pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)  { -	assert_spin_locked(&lo->inode->i_lock); -	BUG_ON(lo->refcount == 0); +	struct nfs_server *server = NFS_SERVER(lo->plh_inode); +	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; -	lo->refcount--; -	if (!lo->refcount) { -		dprintk("%s: freeing layout cache %p\n", __func__, lo); -		BUG_ON(!list_empty(&lo->layouts)); -		NFS_I(lo->inode)->layout = NULL; -		kfree(lo); +	if (!list_empty(&lo->plh_layouts)) { +		struct nfs_client *clp = server->nfs_client; + +		spin_lock(&clp->cl_lock); +		list_del_init(&lo->plh_layouts); +		spin_unlock(&clp->cl_lock);  	} +	put_rpccred(lo->plh_lc_cred); +	return ld->free_layout_hdr(lo); +} + +static void +pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) +{ +	struct nfs_inode *nfsi = NFS_I(lo->plh_inode); +	dprintk("%s: freeing layout cache %p\n", __func__, lo); +	nfsi->layout = NULL; +	/* Reset MDS Threshold I/O counters */ +	nfsi->write_io = 0; +	nfsi->read_io = 0;  }  void -put_layout_hdr(struct inode *inode) +pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) +{ +	struct inode *inode = lo->plh_inode; + +	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { +		pnfs_detach_layout_hdr(lo); +		spin_unlock(&inode->i_lock); +		pnfs_free_layout_hdr(lo); +	} +} + +static int +pnfs_iomode_to_fail_bit(u32 iomode) +{ +	return iomode == IOMODE_RW ? 
+		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +static void +pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) +{ +	lo->plh_retry_timestamp = jiffies; +	if (!test_and_set_bit(fail_bit, &lo->plh_flags)) +		atomic_inc(&lo->plh_refcount); +} + +static void +pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) +{ +	if (test_and_clear_bit(fail_bit, &lo->plh_flags)) +		atomic_dec(&lo->plh_refcount); +} + +static void +pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)  { +	struct inode *inode = lo->plh_inode; +	struct pnfs_layout_range range = { +		.iomode = iomode, +		.offset = 0, +		.length = NFS4_MAX_UINT64, +	}; +	LIST_HEAD(head); +  	spin_lock(&inode->i_lock); -	put_layout_hdr_locked(NFS_I(inode)->layout); +	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); +	pnfs_mark_matching_lsegs_invalid(lo, &head, &range);  	spin_unlock(&inode->i_lock); +	pnfs_free_lseg_list(&head); +	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, +			iomode == IOMODE_RW ?  
"RW" : "READ"); +} + +static bool +pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ +	unsigned long start, end; +	int fail_bit = pnfs_iomode_to_fail_bit(iomode); + +	if (test_bit(fail_bit, &lo->plh_flags) == 0) +		return false; +	end = jiffies; +	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; +	if (!time_in_range(lo->plh_retry_timestamp, start, end)) { +		/* It is time to retry the failed layoutgets */ +		pnfs_layout_clear_fail_bit(lo, fail_bit); +		return false; +	} +	return true;  }  static void  init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)  { -	INIT_LIST_HEAD(&lseg->fi_list); -	kref_init(&lseg->kref); -	lseg->layout = lo; +	INIT_LIST_HEAD(&lseg->pls_list); +	INIT_LIST_HEAD(&lseg->pls_lc_list); +	atomic_set(&lseg->pls_refcount, 1); +	smp_mb(); +	set_bit(NFS_LSEG_VALID, &lseg->pls_flags); +	lseg->pls_layout = lo;  } -/* Called without i_lock held, as the free_lseg call may sleep */ -static void -destroy_lseg(struct kref *kref) +static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)  { -	struct pnfs_layout_segment *lseg = -		container_of(kref, struct pnfs_layout_segment, kref); -	struct inode *ino = lseg->layout->inode; +	struct inode *ino = lseg->pls_layout->plh_inode; -	dprintk("--> %s\n", __func__);  	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); -	/* Matched by get_layout_hdr_locked in pnfs_insert_layout */ -	put_layout_hdr(ino);  }  static void -put_lseg(struct pnfs_layout_segment *lseg) +pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, +		struct pnfs_layout_segment *lseg)  { +	struct inode *inode = lo->plh_inode; + +	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); +	list_del_init(&lseg->pls_list); +	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ +	atomic_dec(&lo->plh_refcount); +	if (list_empty(&lo->plh_segs)) +		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); +	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +void +pnfs_put_lseg(struct pnfs_layout_segment *lseg) +{ +	struct 
pnfs_layout_hdr *lo; +	struct inode *inode; +  	if (!lseg)  		return; -	dprintk("%s: lseg %p ref %d\n", __func__, lseg, -		atomic_read(&lseg->kref.refcount)); -	kref_put(&lseg->kref, destroy_lseg); +	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, +		atomic_read(&lseg->pls_refcount), +		test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); +	lo = lseg->pls_layout; +	inode = lo->plh_inode; +	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { +		pnfs_get_layout_hdr(lo); +		pnfs_layout_remove_lseg(lo, lseg); +		spin_unlock(&inode->i_lock); +		pnfs_free_lseg(lseg); +		pnfs_put_layout_hdr(lo); +	}  } +EXPORT_SYMBOL_GPL(pnfs_put_lseg); -static void -pnfs_clear_lseg_list(struct pnfs_layout_hdr *lo, struct list_head *tmp_list) +static u64 +end_offset(u64 start, u64 len)  { -	struct pnfs_layout_segment *lseg, *next; -	struct nfs_client *clp; +	u64 end; -	dprintk("%s:Begin lo %p\n", __func__, lo); +	end = start + len; +	return end >= start ? end : NFS4_MAX_UINT64; +} -	assert_spin_locked(&lo->inode->i_lock); -	list_for_each_entry_safe(lseg, next, &lo->segs, fi_list) { -		dprintk("%s: freeing lseg %p\n", __func__, lseg); -		list_move(&lseg->fi_list, tmp_list); +/* + * is l2 fully contained in l1? + *   start1                             end1 + *   [----------------------------------) + *           start2           end2 + *           [----------------) + */ +static bool +pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, +		 const struct pnfs_layout_range *l2) +{ +	u64 start1 = l1->offset; +	u64 end1 = end_offset(start1, l1->length); +	u64 start2 = l2->offset; +	u64 end2 = end_offset(start2, l2->length); + +	return (start1 <= start2) && (end1 >= end2); +} + +/* + * is l1 and l2 intersecting? 
+ *   start1                             end1 + *   [----------------------------------) + *                              start2           end2 + *                              [----------------) + */ +static bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, +		    const struct pnfs_layout_range *l2) +{ +	u64 start1 = l1->offset; +	u64 end1 = end_offset(start1, l1->length); +	u64 start2 = l2->offset; +	u64 end2 = end_offset(start2, l2->length); + +	return (end1 == NFS4_MAX_UINT64 || end1 > start2) && +	       (end2 == NFS4_MAX_UINT64 || end2 > start1); +} + +static bool +should_free_lseg(const struct pnfs_layout_range *lseg_range, +		 const struct pnfs_layout_range *recall_range) +{ +	return (recall_range->iomode == IOMODE_ANY || +		lseg_range->iomode == recall_range->iomode) && +	       pnfs_lseg_range_intersecting(lseg_range, recall_range); +} + +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, +		struct list_head *tmp_list) +{ +	if (!atomic_dec_and_test(&lseg->pls_refcount)) +		return false; +	pnfs_layout_remove_lseg(lseg->pls_layout, lseg); +	list_add(&lseg->pls_list, tmp_list); +	return true; +} + +/* Returns 1 if lseg is removed from list, 0 otherwise */ +static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, +			     struct list_head *tmp_list) +{ +	int rv = 0; + +	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { +		/* Remove the reference keeping the lseg in the +		 * list.  It will now be removed when all +		 * outstanding io is finished. 
+		 */ +		dprintk("%s: lseg %p ref %d\n", __func__, lseg, +			atomic_read(&lseg->pls_refcount)); +		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) +			rv = 1;  	} -	clp = NFS_SERVER(lo->inode)->nfs_client; -	spin_lock(&clp->cl_lock); -	/* List does not take a reference, so no need for put here */ -	list_del_init(&lo->layouts); -	spin_unlock(&clp->cl_lock); -	write_seqlock(&lo->seqlock); -	clear_bit(NFS_LAYOUT_STATEID_SET, &lo->state); -	write_sequnlock(&lo->seqlock); +	return rv; +} -	dprintk("%s:Return\n", __func__); +/* Returns count of number of matching invalid lsegs remaining in list + * after call. + */ +int +pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, +			    struct list_head *tmp_list, +			    struct pnfs_layout_range *recall_range) +{ +	struct pnfs_layout_segment *lseg, *next; +	int invalid = 0, removed = 0; + +	dprintk("%s:Begin lo %p\n", __func__, lo); + +	if (list_empty(&lo->plh_segs)) +		return 0; +	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) +		if (!recall_range || +		    should_free_lseg(&lseg->pls_range, recall_range)) { +			dprintk("%s: freeing lseg %p iomode %d " +				"offset %llu length %llu\n", __func__, +				lseg, lseg->pls_range.iomode, lseg->pls_range.offset, +				lseg->pls_range.length); +			invalid++; +			removed += mark_lseg_invalid(lseg, tmp_list); +		} +	dprintk("%s:Return %i\n", __func__, invalid - removed); +	return invalid - removed;  } -static void -pnfs_free_lseg_list(struct list_head *tmp_list) +/* note free_me must contain lsegs from a single layout_hdr */ +void +pnfs_free_lseg_list(struct list_head *free_me)  { -	struct pnfs_layout_segment *lseg; +	struct pnfs_layout_segment *lseg, *tmp; -	while (!list_empty(tmp_list)) { -		lseg = list_entry(tmp_list->next, struct pnfs_layout_segment, -				fi_list); -		dprintk("%s calling put_lseg on %p\n", __func__, lseg); -		list_del(&lseg->fi_list); -		put_lseg(lseg); +	if (list_empty(free_me)) +		return; + +	list_for_each_entry_safe(lseg, tmp, free_me, 
pls_list) { +		list_del(&lseg->pls_list); +		pnfs_free_lseg(lseg);  	}  } @@ -288,12 +500,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)  	spin_lock(&nfsi->vfs_inode.i_lock);  	lo = nfsi->layout;  	if (lo) { -		pnfs_clear_lseg_list(lo, &tmp_list); -		/* Matched by refcount set to 1 in alloc_init_layout_hdr */ -		put_layout_hdr_locked(lo); +		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ +		pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); +		pnfs_get_layout_hdr(lo); +		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); +		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); +		spin_unlock(&nfsi->vfs_inode.i_lock); +		pnfs_free_lseg_list(&tmp_list); +		pnfs_put_layout_hdr(lo); +	} else +		spin_unlock(&nfsi->vfs_inode.i_lock); +} +EXPORT_SYMBOL_GPL(pnfs_destroy_layout); + +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, +		struct list_head *layout_list) +{ +	struct pnfs_layout_hdr *lo; +	bool ret = false; + +	spin_lock(&inode->i_lock); +	lo = NFS_I(inode)->layout; +	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { +		pnfs_get_layout_hdr(lo); +		list_add(&lo->plh_bulk_destroy, layout_list); +		ret = true;  	} -	spin_unlock(&nfsi->vfs_inode.i_lock); -	pnfs_free_lseg_list(&tmp_list); +	spin_unlock(&inode->i_lock); +	return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, +		struct nfs_server *server, +		struct list_head *layout_list) +{ +	struct pnfs_layout_hdr *lo, *next; +	struct inode *inode; + +	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { +		inode = igrab(lo->plh_inode); +		if (inode == NULL) +			continue; +		list_del_init(&lo->plh_layouts); +		if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) +			continue; +		rcu_read_unlock(); +		spin_unlock(&clp->cl_lock); +		iput(inode); +		spin_lock(&clp->cl_lock); +		rcu_read_lock(); +		return -EAGAIN; +	} +	return 0; +} + +static int 
+pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, +		bool is_bulk_recall) +{ +	struct pnfs_layout_hdr *lo; +	struct inode *inode; +	struct pnfs_layout_range range = { +		.iomode = IOMODE_ANY, +		.offset = 0, +		.length = NFS4_MAX_UINT64, +	}; +	LIST_HEAD(lseg_list); +	int ret = 0; + +	while (!list_empty(layout_list)) { +		lo = list_entry(layout_list->next, struct pnfs_layout_hdr, +				plh_bulk_destroy); +		dprintk("%s freeing layout for inode %lu\n", __func__, +			lo->plh_inode->i_ino); +		inode = lo->plh_inode; +		spin_lock(&inode->i_lock); +		list_del_init(&lo->plh_bulk_destroy); +		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ +		if (is_bulk_recall) +			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); +		if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) +			ret = -EAGAIN; +		spin_unlock(&inode->i_lock); +		pnfs_free_lseg_list(&lseg_list); +		pnfs_put_layout_hdr(lo); +		iput(inode); +	} +	return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, +		struct nfs_fsid *fsid, +		bool is_recall) +{ +	struct nfs_server *server; +	LIST_HEAD(layout_list); + +	spin_lock(&clp->cl_lock); +	rcu_read_lock(); +restart: +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) +			continue; +		if (pnfs_layout_bulk_destroy_byserver_locked(clp, +				server, +				&layout_list) != 0) +			goto restart; +	} +	rcu_read_unlock(); +	spin_unlock(&clp->cl_lock); + +	if (list_empty(&layout_list)) +		return 0; +	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, +		bool is_recall) +{ +	struct nfs_server *server; +	LIST_HEAD(layout_list); + +	spin_lock(&clp->cl_lock); +	rcu_read_lock(); +restart: +	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +		if (pnfs_layout_bulk_destroy_byserver_locked(clp, +					server, +					&layout_list) != 0) +			goto restart; +	} +	
rcu_read_unlock(); +	spin_unlock(&clp->cl_lock); + +	if (list_empty(&layout_list)) +		return 0; +	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);  }  /* @@ -303,85 +650,100 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)  void  pnfs_destroy_all_layouts(struct nfs_client *clp)  { -	struct pnfs_layout_hdr *lo; -	LIST_HEAD(tmp_list); - -	spin_lock(&clp->cl_lock); -	list_splice_init(&clp->cl_layouts, &tmp_list); -	spin_unlock(&clp->cl_lock); +	nfs4_deviceid_mark_client_invalid(clp); +	nfs4_deviceid_purge_client(clp); -	while (!list_empty(&tmp_list)) { -		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, -				layouts); -		dprintk("%s freeing layout for inode %lu\n", __func__, -			lo->inode->i_ino); -		pnfs_destroy_layout(NFS_I(lo->inode)); -	} +	pnfs_destroy_layouts_byclid(clp, false);  } -/* update lo->stateid with new if is more recent - * - * lo->stateid could be the open stateid, in which case we just use what given. +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues.   */ -static void -pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, -			const nfs4_stateid *new) +static bool pnfs_seqid_is_newer(u32 s1, u32 s2)  { -	nfs4_stateid *old = &lo->stateid; -	bool overwrite = false; +	return (s32)(s1 - s2) > 0; +} -	write_seqlock(&lo->seqlock); -	if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state) || -	    memcmp(old->stateid.other, new->stateid.other, sizeof(new->stateid.other))) -		overwrite = true; -	else { -		u32 oldseq, newseq; +static void +pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, +		const nfs4_stateid *new, +		struct list_head *free_me_list) +{ +	if (nfs4_stateid_match_other(&lo->plh_stateid, new)) +		return; +	/* Layout is new! 
Kill existing layout segments */ +	pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL); +} -		oldseq = be32_to_cpu(old->stateid.seqid); -		newseq = be32_to_cpu(new->stateid.seqid); -		if ((int)(newseq - oldseq) > 0) -			overwrite = true; +/* update lo->plh_stateid with new if is more recent */ +void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, +			bool update_barrier) +{ +	u32 oldseq, newseq, new_barrier; +	int empty = list_empty(&lo->plh_segs); + +	oldseq = be32_to_cpu(lo->plh_stateid.seqid); +	newseq = be32_to_cpu(new->seqid); +	if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { +		nfs4_stateid_copy(&lo->plh_stateid, new); +		if (update_barrier) { +			new_barrier = be32_to_cpu(new->seqid); +		} else { +			/* Because of wraparound, we want to keep the barrier +			 * "close" to the current seqids. +			 */ +			new_barrier = newseq - atomic_read(&lo->plh_outstanding); +		} +		if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) +			lo->plh_barrier = new_barrier;  	} -	if (overwrite) -		memcpy(&old->stateid, &new->stateid, sizeof(new->stateid)); -	write_sequnlock(&lo->seqlock);  } -static void -pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, -			      struct nfs4_state *state) +static bool +pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, +		const nfs4_stateid *stateid)  { -	int seq; +	u32 seqid = be32_to_cpu(stateid->seqid); -	dprintk("--> %s\n", __func__); -	write_seqlock(&lo->seqlock); -	do { -		seq = read_seqbegin(&state->seqlock); -		memcpy(lo->stateid.data, state->stateid.data, -		       sizeof(state->stateid.data)); -	} while (read_seqretry(&state->seqlock, seq)); -	set_bit(NFS_LAYOUT_STATEID_SET, &lo->state); -	write_sequnlock(&lo->seqlock); -	dprintk("<-- %s\n", __func__); +	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);  } -void -pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, -			struct nfs4_state *open_state) +/* lget is set to 1 if called from inside 
send_layoutget call chain */ +static bool +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) +{ +	return lo->plh_block_lgets || +		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || +		(list_empty(&lo->plh_segs) && +		 (atomic_read(&lo->plh_outstanding) > lget)); +} + +int +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, +			      struct nfs4_state *open_state)  { -	int seq; +	int status = 0;  	dprintk("--> %s\n", __func__); -	do { -		seq = read_seqbegin(&lo->seqlock); -		if (!test_bit(NFS_LAYOUT_STATEID_SET, &lo->state)) { -			/* This will trigger retry of the read */ -			pnfs_layout_from_open_stateid(lo, open_state); -		} else -			memcpy(dst->data, lo->stateid.data, -			       sizeof(lo->stateid.data)); -	} while (read_seqretry(&lo->seqlock, seq)); +	spin_lock(&lo->plh_inode->i_lock); +	if (pnfs_layoutgets_blocked(lo, 1)) { +		status = -EAGAIN; +	} else if (!nfs4_valid_open_stateid(open_state)) { +		status = -EBADF; +	} else if (list_empty(&lo->plh_segs)) { +		int seq; + +		do { +			seq = read_seqbegin(&open_state->seqlock); +			nfs4_stateid_copy(dst, &open_state->stateid); +		} while (read_seqretry(&open_state->seqlock, seq)); +	} else +		nfs4_stateid_copy(dst, &lo->plh_stateid); +	spin_unlock(&lo->plh_inode->i_lock);  	dprintk("<-- %s\n", __func__); +	return status;  }  /* @@ -393,134 +755,342 @@ pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,  static struct pnfs_layout_segment *  send_layoutget(struct pnfs_layout_hdr *lo,  	   struct nfs_open_context *ctx, -	   u32 iomode) +	   struct pnfs_layout_range *range, +	   gfp_t gfp_flags)  { -	struct inode *ino = lo->inode; +	struct inode *ino = lo->plh_inode;  	struct nfs_server *server = NFS_SERVER(ino);  	struct nfs4_layoutget *lgp; -	struct pnfs_layout_segment *lseg = NULL; +	struct pnfs_layout_segment *lseg;  	dprintk("--> %s\n", __func__); -	BUG_ON(ctx == NULL); -	lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); -	if (lgp == NULL) { -		
put_layout_hdr(lo->inode); +	lgp = kzalloc(sizeof(*lgp), gfp_flags); +	if (lgp == NULL)  		return NULL; -	} -	lgp->args.minlength = NFS4_MAX_UINT64; + +	lgp->args.minlength = PAGE_CACHE_SIZE; +	if (lgp->args.minlength > range->length) +		lgp->args.minlength = range->length;  	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; -	lgp->args.range.iomode = iomode; -	lgp->args.range.offset = 0; -	lgp->args.range.length = NFS4_MAX_UINT64; +	lgp->args.range = *range;  	lgp->args.type = server->pnfs_curr_ld->id;  	lgp->args.inode = ino;  	lgp->args.ctx = get_nfs_open_context(ctx); -	lgp->lsegpp = &lseg; +	lgp->gfp_flags = gfp_flags; +	lgp->cred = lo->plh_lc_cred;  	/* Synchronously retrieve layout information from server and  	 * store in lseg.  	 */ -	nfs4_proc_layoutget(lgp); -	if (!lseg) { -		/* remember that LAYOUTGET failed and suspend trying */ -		set_bit(lo_fail_bit(iomode), &lo->state); +	lseg = nfs4_proc_layoutget(lgp, gfp_flags); +	if (IS_ERR(lseg)) { +		switch (PTR_ERR(lseg)) { +		case -ENOMEM: +		case -ERESTARTSYS: +			break; +		default: +			/* remember that LAYOUTGET failed and suspend trying */ +			pnfs_layout_io_set_failed(lo, range->iomode); +		} +		return NULL;  	} +  	return lseg;  } +static void pnfs_clear_layoutcommit(struct inode *inode, +		struct list_head *head) +{ +	struct nfs_inode *nfsi = NFS_I(inode); +	struct pnfs_layout_segment *lseg, *tmp; + +	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) +		return; +	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { +		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) +			continue; +		pnfs_lseg_dec_and_remove_zero(lseg, head); +	} +} + +/* + * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr + * when the layout segment list is empty. + * + * Note that a pnfs_layout_hdr can exist with an empty layout segment + * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the + * deviceid is marked invalid. 
+ */ +int +_pnfs_return_layout(struct inode *ino) +{ +	struct pnfs_layout_hdr *lo = NULL; +	struct nfs_inode *nfsi = NFS_I(ino); +	LIST_HEAD(tmp_list); +	struct nfs4_layoutreturn *lrp; +	nfs4_stateid stateid; +	int status = 0, empty; + +	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); + +	spin_lock(&ino->i_lock); +	lo = nfsi->layout; +	if (!lo) { +		spin_unlock(&ino->i_lock); +		dprintk("NFS: %s no layout to return\n", __func__); +		goto out; +	} +	stateid = nfsi->layout->plh_stateid; +	/* Reference matched in nfs4_layoutreturn_release */ +	pnfs_get_layout_hdr(lo); +	empty = list_empty(&lo->plh_segs); +	pnfs_clear_layoutcommit(ino, &tmp_list); +	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); +	/* Don't send a LAYOUTRETURN if list was initially empty */ +	if (empty) { +		spin_unlock(&ino->i_lock); +		pnfs_put_layout_hdr(lo); +		dprintk("NFS: %s no layout segments to return\n", __func__); +		goto out; +	} +	lo->plh_block_lgets++; +	spin_unlock(&ino->i_lock); +	pnfs_free_lseg_list(&tmp_list); + +	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); +	if (unlikely(lrp == NULL)) { +		status = -ENOMEM; +		spin_lock(&ino->i_lock); +		lo->plh_block_lgets--; +		spin_unlock(&ino->i_lock); +		pnfs_put_layout_hdr(lo); +		goto out; +	} + +	lrp->args.stateid = stateid; +	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; +	lrp->args.inode = ino; +	lrp->args.layout = lo; +	lrp->clp = NFS_SERVER(ino)->nfs_client; +	lrp->cred = lo->plh_lc_cred; + +	status = nfs4_proc_layoutreturn(lrp); +out: +	dprintk("<-- %s status: %d\n", __func__, status); +	return status; +} +EXPORT_SYMBOL_GPL(_pnfs_return_layout); + +int +pnfs_commit_and_return_layout(struct inode *inode) +{ +	struct pnfs_layout_hdr *lo; +	int ret; + +	spin_lock(&inode->i_lock); +	lo = NFS_I(inode)->layout; +	if (lo == NULL) { +		spin_unlock(&inode->i_lock); +		return 0; +	} +	pnfs_get_layout_hdr(lo); +	/* Block new layoutgets and read/write to ds */ +	lo->plh_block_lgets++; +	spin_unlock(&inode->i_lock); +	
filemap_fdatawait(inode->i_mapping); +	ret = pnfs_layoutcommit_inode(inode, true); +	if (ret == 0) +		ret = _pnfs_return_layout(inode); +	spin_lock(&inode->i_lock); +	lo->plh_block_lgets--; +	spin_unlock(&inode->i_lock); +	pnfs_put_layout_hdr(lo); +	return ret; +} + +bool pnfs_roc(struct inode *ino) +{ +	struct pnfs_layout_hdr *lo; +	struct pnfs_layout_segment *lseg, *tmp; +	LIST_HEAD(tmp_list); +	bool found = false; + +	spin_lock(&ino->i_lock); +	lo = NFS_I(ino)->layout; +	if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || +	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) +		goto out_nolayout; +	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) +		if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { +			mark_lseg_invalid(lseg, &tmp_list); +			found = true; +		} +	if (!found) +		goto out_nolayout; +	lo->plh_block_lgets++; +	pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ +	spin_unlock(&ino->i_lock); +	pnfs_free_lseg_list(&tmp_list); +	return true; + +out_nolayout: +	spin_unlock(&ino->i_lock); +	return false; +} + +void pnfs_roc_release(struct inode *ino) +{ +	struct pnfs_layout_hdr *lo; + +	spin_lock(&ino->i_lock); +	lo = NFS_I(ino)->layout; +	lo->plh_block_lgets--; +	if (atomic_dec_and_test(&lo->plh_refcount)) { +		pnfs_detach_layout_hdr(lo); +		spin_unlock(&ino->i_lock); +		pnfs_free_layout_hdr(lo); +	} else +		spin_unlock(&ino->i_lock); +} + +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +	struct pnfs_layout_hdr *lo; + +	spin_lock(&ino->i_lock); +	lo = NFS_I(ino)->layout; +	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) +		lo->plh_barrier = barrier; +	spin_unlock(&ino->i_lock); +} + +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +{ +	struct nfs_inode *nfsi = NFS_I(ino); +	struct pnfs_layout_hdr *lo; +	struct pnfs_layout_segment *lseg; +	u32 current_seqid; +	bool found = false; + +	spin_lock(&ino->i_lock); +	list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) +		if 
(test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { +			rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); +			found = true; +			goto out; +		} +	lo = nfsi->layout; +	current_seqid = be32_to_cpu(lo->plh_stateid.seqid); + +	/* Since close does not return a layout stateid for use as +	 * a barrier, we choose the worst-case barrier. +	 */ +	*barrier = current_seqid + atomic_read(&lo->plh_outstanding); +out: +	spin_unlock(&ino->i_lock); +	return found; +} +  /*   * Compare two layout segments for sorting into layout cache.   * We want to preferentially return RW over RO layouts, so ensure those   * are seen first.   */  static s64 -cmp_layout(u32 iomode1, u32 iomode2) +pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, +	   const struct pnfs_layout_range *l2)  { +	s64 d; + +	/* high offset > low offset */ +	d = l1->offset - l2->offset; +	if (d) +		return d; + +	/* short length > long length */ +	d = l2->length - l1->length; +	if (d) +		return d; +  	/* read > read/write */ -	return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); +	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);  }  static void -pnfs_insert_layout(struct pnfs_layout_hdr *lo, +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,  		   struct pnfs_layout_segment *lseg)  {  	struct pnfs_layout_segment *lp; -	int found = 0;  	dprintk("%s:Begin\n", __func__); -	assert_spin_locked(&lo->inode->i_lock); -	if (list_empty(&lo->segs)) { -		struct nfs_client *clp = NFS_SERVER(lo->inode)->nfs_client; - -		spin_lock(&clp->cl_lock); -		BUG_ON(!list_empty(&lo->layouts)); -		list_add_tail(&lo->layouts, &clp->cl_layouts); -		spin_unlock(&clp->cl_lock); -	} -	list_for_each_entry(lp, &lo->segs, fi_list) { -		if (cmp_layout(lp->range.iomode, lseg->range.iomode) > 0) +	list_for_each_entry(lp, &lo->plh_segs, pls_list) { +		if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)  			continue; -		list_add_tail(&lseg->fi_list, &lp->fi_list); +		list_add_tail(&lseg->pls_list, 
&lp->pls_list);  		dprintk("%s: inserted lseg %p "  			"iomode %d offset %llu length %llu before "  			"lp %p iomode %d offset %llu length %llu\n", -			__func__, lseg, lseg->range.iomode, -			lseg->range.offset, lseg->range.length, -			lp, lp->range.iomode, lp->range.offset, -			lp->range.length); -		found = 1; -		break; -	} -	if (!found) { -		list_add_tail(&lseg->fi_list, &lo->segs); -		dprintk("%s: inserted lseg %p " -			"iomode %d offset %llu length %llu at tail\n", -			__func__, lseg, lseg->range.iomode, -			lseg->range.offset, lseg->range.length); +			__func__, lseg, lseg->pls_range.iomode, +			lseg->pls_range.offset, lseg->pls_range.length, +			lp, lp->pls_range.iomode, lp->pls_range.offset, +			lp->pls_range.length); +		goto out;  	} -	get_layout_hdr_locked(lo); +	list_add_tail(&lseg->pls_list, &lo->plh_segs); +	dprintk("%s: inserted lseg %p " +		"iomode %d offset %llu length %llu at tail\n", +		__func__, lseg, lseg->pls_range.iomode, +		lseg->pls_range.offset, lseg->pls_range.length); +out: +	pnfs_get_layout_hdr(lo);  	dprintk("%s:Return\n", __func__);  }  static struct pnfs_layout_hdr * -alloc_init_layout_hdr(struct inode *ino) +alloc_init_layout_hdr(struct inode *ino, +		      struct nfs_open_context *ctx, +		      gfp_t gfp_flags)  {  	struct pnfs_layout_hdr *lo; -	lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL); +	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);  	if (!lo)  		return NULL; -	lo->refcount = 1; -	INIT_LIST_HEAD(&lo->layouts); -	INIT_LIST_HEAD(&lo->segs); -	seqlock_init(&lo->seqlock); -	lo->inode = ino; +	atomic_set(&lo->plh_refcount, 1); +	INIT_LIST_HEAD(&lo->plh_layouts); +	INIT_LIST_HEAD(&lo->plh_segs); +	INIT_LIST_HEAD(&lo->plh_bulk_destroy); +	lo->plh_inode = ino; +	lo->plh_lc_cred = get_rpccred(ctx->cred);  	return lo;  }  static struct pnfs_layout_hdr * -pnfs_find_alloc_layout(struct inode *ino) +pnfs_find_alloc_layout(struct inode *ino, +		       struct nfs_open_context *ctx, +		       gfp_t gfp_flags)  {  	struct nfs_inode 
*nfsi = NFS_I(ino);  	struct pnfs_layout_hdr *new = NULL;  	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); -	assert_spin_locked(&ino->i_lock); -	if (nfsi->layout) -		return nfsi->layout; - +	if (nfsi->layout != NULL) +		goto out_existing;  	spin_unlock(&ino->i_lock); -	new = alloc_init_layout_hdr(ino); +	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);  	spin_lock(&ino->i_lock); -	if (likely(nfsi->layout == NULL))	/* Won the race? */ +	if (likely(nfsi->layout == NULL)) {	/* Won the race? */  		nfsi->layout = new; -	else -		kfree(new); +		return new; +	} else if (new != NULL) +		pnfs_free_layout_hdr(new); +out_existing: +	pnfs_get_layout_hdr(nfsi->layout);  	return nfsi->layout;  } @@ -535,34 +1105,121 @@ pnfs_find_alloc_layout(struct inode *ino)   * READ		READ	true   * READ		RW	true   */ -static int -is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) +static bool +pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, +		 const struct pnfs_layout_range *range)  { -	return (iomode != IOMODE_RW || lseg->range.iomode == IOMODE_RW); +	struct pnfs_layout_range range1; + +	if ((range->iomode == IOMODE_RW && +	     ls_range->iomode != IOMODE_RW) || +	    !pnfs_lseg_range_intersecting(ls_range, range)) +		return 0; + +	/* range1 covers only the first byte in the range */ +	range1 = *range; +	range1.length = 1; +	return pnfs_lseg_range_contained(ls_range, &range1);  }  /*   * lookup range in layout   */  static struct pnfs_layout_segment * -pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode) +pnfs_find_lseg(struct pnfs_layout_hdr *lo, +		struct pnfs_layout_range *range)  {  	struct pnfs_layout_segment *lseg, *ret = NULL;  	dprintk("%s:Begin\n", __func__); -	assert_spin_locked(&lo->inode->i_lock); -	list_for_each_entry(lseg, &lo->segs, fi_list) { -		if (is_matching_lseg(lseg, iomode)) { -			ret = lseg; +	list_for_each_entry(lseg, &lo->plh_segs, pls_list) { +		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && +		    
pnfs_lseg_range_match(&lseg->pls_range, range)) { +			ret = pnfs_get_lseg(lseg);  			break;  		} -		if (cmp_layout(iomode, lseg->range.iomode) > 0) +		if (lseg->pls_range.offset > range->offset)  			break;  	}  	dprintk("%s:Return lseg %p ref %d\n", -		__func__, ret, ret ? atomic_read(&ret->kref.refcount) : 0); +		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); +	return ret; +} + +/* + * Use mdsthreshold hints set at each OPEN to determine if I/O should go + * to the MDS or over pNFS + * + * The nfs_inode read_io and write_io fields are cumulative counters reset + * when there are no layout segments. Note that in pnfs_update_layout iomode + * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a + * WRITE request. + * + * A return of true means use MDS I/O. + * + * From rfc 5661: + * If a file's size is smaller than the file size threshold, data accesses + * SHOULD be sent to the metadata server.  If an I/O request has a length that + * is below the I/O size threshold, the I/O SHOULD be sent to the metadata + * server.  If both file size and I/O size are provided, the client SHOULD + * reach or exceed  both thresholds before sending its read or write + * requests to the data server. 
+ */ +static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, +				     struct inode *ino, int iomode) +{ +	struct nfs4_threshold *t = ctx->mdsthreshold; +	struct nfs_inode *nfsi = NFS_I(ino); +	loff_t fsize = i_size_read(ino); +	bool size = false, size_set = false, io = false, io_set = false, ret = false; + +	if (t == NULL) +		return ret; + +	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", +		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz); + +	switch (iomode) { +	case IOMODE_READ: +		if (t->bm & THRESHOLD_RD) { +			dprintk("%s fsize %llu\n", __func__, fsize); +			size_set = true; +			if (fsize < t->rd_sz) +				size = true; +		} +		if (t->bm & THRESHOLD_RD_IO) { +			dprintk("%s nfsi->read_io %llu\n", __func__, +				nfsi->read_io); +			io_set = true; +			if (nfsi->read_io < t->rd_io_sz) +				io = true; +		} +		break; +	case IOMODE_RW: +		if (t->bm & THRESHOLD_WR) { +			dprintk("%s fsize %llu\n", __func__, fsize); +			size_set = true; +			if (fsize < t->wr_sz) +				size = true; +		} +		if (t->bm & THRESHOLD_WR_IO) { +			dprintk("%s nfsi->write_io %llu\n", __func__, +				nfsi->write_io); +			io_set = true; +			if (nfsi->write_io < t->wr_io_sz) +				io = true; +		} +		break; +	} +	if (size_set && io_set) { +		if (size && io) +			ret = true; +	} else if (size || io) +		ret = true; + +	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);  	return ret;  } @@ -573,57 +1230,107 @@ pnfs_has_layout(struct pnfs_layout_hdr *lo, u32 iomode)  struct pnfs_layout_segment *  pnfs_update_layout(struct inode *ino,  		   struct nfs_open_context *ctx, -		   enum pnfs_iomode iomode) +		   loff_t pos, +		   u64 count, +		   enum pnfs_iomode iomode, +		   gfp_t gfp_flags)  { -	struct nfs_inode *nfsi = NFS_I(ino); +	struct pnfs_layout_range arg = { +		.iomode = iomode, +		.offset = pos, +		.length = count, +	}; +	unsigned pg_offset; +	struct nfs_server *server = NFS_SERVER(ino); +	struct nfs_client *clp = server->nfs_client;  	
struct pnfs_layout_hdr *lo;  	struct pnfs_layout_segment *lseg = NULL; +	bool first;  	if (!pnfs_enabled_sb(NFS_SERVER(ino))) -		return NULL; +		goto out; + +	if (pnfs_within_mdsthreshold(ctx, ino, iomode)) +		goto out; +  	spin_lock(&ino->i_lock); -	lo = pnfs_find_alloc_layout(ino); +	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);  	if (lo == NULL) { -		dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); -		goto out_unlock; +		spin_unlock(&ino->i_lock); +		goto out;  	} -	/* Check to see if the layout for the given range already exists */ -	lseg = pnfs_has_layout(lo, iomode); -	if (lseg) { -		dprintk("%s: Using cached lseg %p for iomode %d)\n", -			__func__, lseg, iomode); +	/* Do we even need to bother with this? */ +	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { +		dprintk("%s matches recall, use MDS\n", __func__);  		goto out_unlock;  	}  	/* if LAYOUTGET already failed once we don't try again */ -	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) +	if (pnfs_layout_io_test_failed(lo, iomode)) +		goto out_unlock; + +	/* Check to see if the layout for the given range already exists */ +	lseg = pnfs_find_lseg(lo, &arg); +	if (lseg) +		goto out_unlock; + +	if (pnfs_layoutgets_blocked(lo, 0))  		goto out_unlock; +	atomic_inc(&lo->plh_outstanding); -	get_layout_hdr_locked(lo); /* Matched in nfs4_layoutget_release */ +	first = list_empty(&lo->plh_layouts) ? true : false;  	spin_unlock(&ino->i_lock); -	lseg = send_layoutget(lo, ctx, iomode); +	if (first) { +		/* The lo must be on the clp list if there is any +		 * chance of a CB_LAYOUTRECALL(FILE) coming in. 
+		 */ +		spin_lock(&clp->cl_lock); +		list_add_tail(&lo->plh_layouts, &server->layouts); +		spin_unlock(&clp->cl_lock); +	} + +	pg_offset = arg.offset & ~PAGE_CACHE_MASK; +	if (pg_offset) { +		arg.offset -= pg_offset; +		arg.length += pg_offset; +	} +	if (arg.length != NFS4_MAX_UINT64) +		arg.length = PAGE_CACHE_ALIGN(arg.length); + +	lseg = send_layoutget(lo, ctx, &arg, gfp_flags); +	atomic_dec(&lo->plh_outstanding); +out_put_layout_hdr: +	pnfs_put_layout_hdr(lo);  out: -	dprintk("%s end, state 0x%lx lseg %p\n", __func__, -		nfsi->layout->state, lseg); +	dprintk("%s: inode %s/%llu pNFS layout segment %s for " +			"(%s, offset: %llu, length: %llu)\n", +			__func__, ino->i_sb->s_id, +			(unsigned long long)NFS_FILEID(ino), +			lseg == NULL ? "not found" : "found", +			iomode==IOMODE_RW ?  "read/write" : "read-only", +			(unsigned long long)pos, +			(unsigned long long)count);  	return lseg;  out_unlock:  	spin_unlock(&ino->i_lock); -	goto out; +	goto out_put_layout_hdr;  } +EXPORT_SYMBOL_GPL(pnfs_update_layout); -int +struct pnfs_layout_segment *  pnfs_layout_process(struct nfs4_layoutget *lgp)  {  	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;  	struct nfs4_layoutget_res *res = &lgp->res;  	struct pnfs_layout_segment *lseg; -	struct inode *ino = lo->inode; +	struct inode *ino = lo->plh_inode; +	LIST_HEAD(free_me);  	int status = 0;  	/* Inject layout blob into I/O device driver */ -	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res); +	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);  	if (!lseg || IS_ERR(lseg)) {  		if (!lseg)  			status = -ENOMEM; @@ -635,149 +1342,607 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)  	}  	spin_lock(&ino->i_lock); -	init_lseg(lo, lseg); -	lseg->range = res->range; -	*lgp->lsegpp = lseg; -	pnfs_insert_layout(lo, lseg); +	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { +		dprintk("%s forget reply due to recall\n", __func__); +		goto out_forget_reply; +	} + +	if 
(pnfs_layoutgets_blocked(lo, 1) || +	    pnfs_layout_stateid_blocked(lo, &res->stateid)) { +		dprintk("%s forget reply due to state\n", __func__); +		goto out_forget_reply; +	} +	/* Check that the new stateid matches the old stateid */ +	pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);  	/* Done processing layoutget. Set the layout stateid */ -	pnfs_set_layout_stateid(lo, &res->stateid); +	pnfs_set_layout_stateid(lo, &res->stateid, false); + +	init_lseg(lo, lseg); +	lseg->pls_range = res->range; +	pnfs_get_lseg(lseg); +	pnfs_layout_insert_lseg(lo, lseg); + +	if (res->return_on_close) { +		set_bit(NFS_LSEG_ROC, &lseg->pls_flags); +		set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); +	} +  	spin_unlock(&ino->i_lock); +	pnfs_free_lseg_list(&free_me); +	return lseg;  out: -	return status; +	return ERR_PTR(status); + +out_forget_reply: +	spin_unlock(&ino->i_lock); +	lseg->pls_layout = lo; +	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); +	goto out;  } +void +pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ +	u64 rd_size = req->wb_bytes; + +	WARN_ON_ONCE(pgio->pg_lseg != NULL); + +	if (pgio->pg_dreq == NULL) +		rd_size = i_size_read(pgio->pg_inode) - req_offset(req); +	else +		rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + +	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +					   req->wb_context, +					   req_offset(req), +					   rd_size, +					   IOMODE_READ, +					   GFP_KERNEL); +	/* If no lseg, fall back to read through mds */ +	if (pgio->pg_lseg == NULL) +		nfs_pageio_reset_read_mds(pgio); + +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); + +void +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, +			   struct nfs_page *req, u64 wb_size) +{ +	WARN_ON_ONCE(pgio->pg_lseg != NULL); + +	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, +					   req->wb_context, +					   req_offset(req), +					   wb_size, +					   IOMODE_RW, +					   GFP_NOFS); +	/* If no lseg, fall back to write through mds */ +	if 
(pgio->pg_lseg == NULL) +		nfs_pageio_reset_write_mds(pgio); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); +  /* - * Device ID cache. Currently supports one layout type per struct nfs_client. - * Add layout type to the lookup key to expand to support multiple types. + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced.   */ -int -pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, -			 void (*free_callback)(struct pnfs_deviceid_node *)) +size_t +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, +		     struct nfs_page *req) +{ +	unsigned int size; +	u64 seg_end, req_start, seg_left; + +	size = nfs_generic_pg_test(pgio, prev, req); +	if (!size) +		return 0; + +	/* +	 * 'size' contains the number of bytes left in the current page (up +	 * to the original size asked for in @req->wb_bytes). +	 * +	 * Calculate how many bytes are left in the layout segment +	 * and if there are less bytes than 'size', return that instead. +	 * +	 * Please also note that 'end_offset' is actually the offset of the +	 * first byte that lies outside the pnfs_layout_range. FIXME? 
+	 * +	 */ +	if (pgio->pg_lseg) { +		seg_end = end_offset(pgio->pg_lseg->pls_range.offset, +				     pgio->pg_lseg->pls_range.length); +		req_start = req_offset(req); +		WARN_ON_ONCE(req_start > seg_end); +		/* start of request is past the last byte of this segment */ +		if (req_start >= seg_end) +			return 0; + +		/* adjust 'size' iff there are fewer bytes left in the +		 * segment than what nfs_generic_pg_test returned */ +		seg_left = seg_end - req_start; +		if (seg_left < size) +			size = (unsigned int)seg_left; +	} + +	return size; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); + +int pnfs_write_done_resend_to_mds(struct inode *inode, +				struct list_head *head, +				const struct nfs_pgio_completion_ops *compl_ops, +				struct nfs_direct_req *dreq)  { -	struct pnfs_deviceid_cache *c; +	struct nfs_pageio_descriptor pgio; +	LIST_HEAD(failed); + +	/* Resend all requests through the MDS */ +	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); +	pgio.pg_dreq = dreq; +	while (!list_empty(head)) { +		struct nfs_page *req = nfs_list_entry(head->next); + +		nfs_list_remove_request(req); +		if (!nfs_pageio_add_request(&pgio, req)) +			nfs_list_add_request(req, &failed); +	} +	nfs_pageio_complete(&pgio); + +	if (!list_empty(&failed)) { +		/* For some reason our attempt to resend pages. Mark the +		 * overall send request as having failed, and let +		 * nfs_writeback_release_full deal with the error. 
+		 */ +		list_move(&failed, head); +		return -EIO; +	} +	return 0; +} +EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); + +static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; -	c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); -	if (!c) +	dprintk("pnfs write error = %d\n", hdr->pnfs_error); +	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & +	    PNFS_LAYOUTRET_ON_ERROR) { +		pnfs_return_layout(hdr->inode); +	} +	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) +		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, +							&hdr->pages, +							hdr->completion_ops, +							hdr->dreq); +} + +/* + * Called by non rpc-based layout drivers + */ +void pnfs_ld_write_done(struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; + +	trace_nfs4_pnfs_write(data, hdr->pnfs_error); +	if (!hdr->pnfs_error) { +		pnfs_set_layoutcommit(data); +		hdr->mds_ops->rpc_call_done(&data->task, data); +	} else +		pnfs_ld_handle_write_error(data); +	hdr->mds_ops->rpc_release(data); +} +EXPORT_SYMBOL_GPL(pnfs_ld_write_done); + +static void +pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, +		struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; + +	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { +		list_splice_tail_init(&hdr->pages, &desc->pg_list); +		nfs_pageio_reset_write_mds(desc); +		desc->pg_recoalesce = 1; +	} +	nfs_pgio_data_release(data); +} + +static enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_pgio_data *wdata, +			const struct rpc_call_ops *call_ops, +			struct pnfs_layout_segment *lseg, +			int how) +{ +	struct nfs_pgio_header *hdr = wdata->header; +	struct inode *inode = hdr->inode; +	enum pnfs_try_status trypnfs; +	struct nfs_server *nfss = NFS_SERVER(inode); + +	hdr->mds_ops = call_ops; + +	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, +		inode->i_ino, wdata->args.count, wdata->args.offset, how); +	
trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); +	if (trypnfs != PNFS_NOT_ATTEMPTED) +		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); +	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); +	return trypnfs; +} + +static void +pnfs_do_write(struct nfs_pageio_descriptor *desc, +	      struct nfs_pgio_header *hdr, int how) +{ +	struct nfs_pgio_data *data = hdr->data; +	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; +	struct pnfs_layout_segment *lseg = desc->pg_lseg; +	enum pnfs_try_status trypnfs; + +	desc->pg_lseg = NULL; +	trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); +	if (trypnfs == PNFS_NOT_ATTEMPTED) +		pnfs_write_through_mds(desc, data); +	pnfs_put_lseg(lseg); +} + +static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) +{ +	pnfs_put_lseg(hdr->lseg); +	nfs_rw_header_free(hdr); +} +EXPORT_SYMBOL_GPL(pnfs_writehdr_free); + +int +pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ +	struct nfs_rw_header *whdr; +	struct nfs_pgio_header *hdr; +	int ret; + +	whdr = nfs_rw_header_alloc(desc->pg_rw_ops); +	if (!whdr) { +		desc->pg_completion_ops->error_cleanup(&desc->pg_list); +		pnfs_put_lseg(desc->pg_lseg); +		desc->pg_lseg = NULL;  		return -ENOMEM; -	spin_lock(&clp->cl_lock); -	if (clp->cl_devid_cache != NULL) { -		atomic_inc(&clp->cl_devid_cache->dc_ref); -		dprintk("%s [kref [%d]]\n", __func__, -			atomic_read(&clp->cl_devid_cache->dc_ref)); -		kfree(c); -	} else { -		/* kzalloc initializes hlists */ -		spin_lock_init(&c->dc_lock); -		atomic_set(&c->dc_ref, 1); -		c->dc_free_callback = free_callback; -		clp->cl_devid_cache = c; -		dprintk("%s [new]\n", __func__);  	} -	spin_unlock(&clp->cl_lock); +	hdr = &whdr->header; +	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); +	hdr->lseg = pnfs_get_lseg(desc->pg_lseg); +	atomic_inc(&hdr->refcnt); +	ret = nfs_generic_pgio(desc, hdr); +	if (ret != 0) { +		pnfs_put_lseg(desc->pg_lseg); +		desc->pg_lseg = NULL; +	} else +		pnfs_do_write(desc, hdr, desc->pg_ioflags); +	if 
(atomic_dec_and_test(&hdr->refcnt)) +		hdr->completion_ops->completion(hdr); +	return ret; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); + +int pnfs_read_done_resend_to_mds(struct inode *inode, +				struct list_head *head, +				const struct nfs_pgio_completion_ops *compl_ops, +				struct nfs_direct_req *dreq) +{ +	struct nfs_pageio_descriptor pgio; +	LIST_HEAD(failed); + +	/* Resend all requests through the MDS */ +	nfs_pageio_init_read(&pgio, inode, true, compl_ops); +	pgio.pg_dreq = dreq; +	while (!list_empty(head)) { +		struct nfs_page *req = nfs_list_entry(head->next); + +		nfs_list_remove_request(req); +		if (!nfs_pageio_add_request(&pgio, req)) +			nfs_list_add_request(req, &failed); +	} +	nfs_pageio_complete(&pgio); + +	if (!list_empty(&failed)) { +		list_move(&failed, head); +		return -EIO; +	}  	return 0;  } -EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); +EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); + +static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; + +	dprintk("pnfs read error = %d\n", hdr->pnfs_error); +	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & +	    PNFS_LAYOUTRET_ON_ERROR) { +		pnfs_return_layout(hdr->inode); +	} +	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) +		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, +							&hdr->pages, +							hdr->completion_ops, +							hdr->dreq); +}  /* - * Called from pnfs_layoutdriver_type->free_lseg - * last layout segment reference frees deviceid + * Called by non rpc-based layout drivers   */ -void -pnfs_put_deviceid(struct pnfs_deviceid_cache *c, -		  struct pnfs_deviceid_node *devid) +void pnfs_ld_read_done(struct nfs_pgio_data *data)  { -	struct nfs4_deviceid *id = &devid->de_id; -	struct pnfs_deviceid_node *d; -	struct hlist_node *n; -	long h = nfs4_deviceid_hash(id); +	struct nfs_pgio_header *hdr = data->header; + +	trace_nfs4_pnfs_read(data, hdr->pnfs_error); +	if (likely(!hdr->pnfs_error)) { +		
__nfs4_read_done_cb(data); +		hdr->mds_ops->rpc_call_done(&data->task, data); +	} else +		pnfs_ld_handle_read_error(data); +	hdr->mds_ops->rpc_release(data); +} +EXPORT_SYMBOL_GPL(pnfs_ld_read_done); -	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); -	if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) -		return; +static void +pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, +		struct nfs_pgio_data *data) +{ +	struct nfs_pgio_header *hdr = data->header; -	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) -		if (!memcmp(&d->de_id, id, sizeof(*id))) { -			hlist_del_rcu(&d->de_node); -			spin_unlock(&c->dc_lock); -			synchronize_rcu(); -			c->dc_free_callback(devid); -			return; -		} -	spin_unlock(&c->dc_lock); -	/* Why wasn't it found in  the list? */ -	BUG(); +	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { +		list_splice_tail_init(&hdr->pages, &desc->pg_list); +		nfs_pageio_reset_read_mds(desc); +		desc->pg_recoalesce = 1; +	} +	nfs_pgio_data_release(data); +} + +/* + * Call the appropriate parallel I/O subsystem read function. 
+ */ +static enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_pgio_data *rdata, +		       const struct rpc_call_ops *call_ops, +		       struct pnfs_layout_segment *lseg) +{ +	struct nfs_pgio_header *hdr = rdata->header; +	struct inode *inode = hdr->inode; +	struct nfs_server *nfss = NFS_SERVER(inode); +	enum pnfs_try_status trypnfs; + +	hdr->mds_ops = call_ops; + +	dprintk("%s: Reading ino:%lu %u@%llu\n", +		__func__, inode->i_ino, rdata->args.count, rdata->args.offset); + +	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); +	if (trypnfs != PNFS_NOT_ATTEMPTED) +		nfs_inc_stats(inode, NFSIOS_PNFS_READ); +	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); +	return trypnfs;  } -EXPORT_SYMBOL_GPL(pnfs_put_deviceid); -/* Find and reference a deviceid */ -struct pnfs_deviceid_node * -pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) +static void +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)  { -	struct pnfs_deviceid_node *d; -	struct hlist_node *n; -	long hash = nfs4_deviceid_hash(id); +	struct nfs_pgio_data *data = hdr->data; +	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; +	struct pnfs_layout_segment *lseg = desc->pg_lseg; +	enum pnfs_try_status trypnfs; + +	desc->pg_lseg = NULL; +	trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); +	if (trypnfs == PNFS_NOT_ATTEMPTED) +		pnfs_read_through_mds(desc, data); +	pnfs_put_lseg(lseg); +} -	dprintk("--> %s hash %ld\n", __func__, hash); -	rcu_read_lock(); -	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { -		if (!memcmp(&d->de_id, id, sizeof(*id))) { -			if (!atomic_inc_not_zero(&d->de_ref)) { -				goto fail; -			} else { -				rcu_read_unlock(); -				return d; -			} -		} +static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) +{ +	pnfs_put_lseg(hdr->lseg); +	nfs_rw_header_free(hdr); +} +EXPORT_SYMBOL_GPL(pnfs_readhdr_free); + +int +pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ +	struct 
nfs_rw_header *rhdr; +	struct nfs_pgio_header *hdr; +	int ret; + +	rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); +	if (!rhdr) { +		desc->pg_completion_ops->error_cleanup(&desc->pg_list); +		ret = -ENOMEM; +		pnfs_put_lseg(desc->pg_lseg); +		desc->pg_lseg = NULL; +		return ret;  	} -fail: -	rcu_read_unlock(); -	return NULL; +	hdr = &rhdr->header; +	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); +	hdr->lseg = pnfs_get_lseg(desc->pg_lseg); +	atomic_inc(&hdr->refcnt); +	ret = nfs_generic_pgio(desc, hdr); +	if (ret != 0) { +		pnfs_put_lseg(desc->pg_lseg); +		desc->pg_lseg = NULL; +	} else +		pnfs_do_read(desc, hdr); +	if (atomic_dec_and_test(&hdr->refcnt)) +		hdr->completion_ops->completion(hdr); +	return ret; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); + +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ +	unsigned long *bitlock = &NFS_I(inode)->flags; + +	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); +	smp_mb__after_atomic(); +	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);  } -EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);  /* - * Add a deviceid to the cache. - * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new + * There can be multiple RW segments.   
*/ -struct pnfs_deviceid_node * -pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) -{ -	struct pnfs_deviceid_node *d; -	long hash = nfs4_deviceid_hash(&new->de_id); - -	dprintk("--> %s hash %ld\n", __func__, hash); -	spin_lock(&c->dc_lock); -	d = pnfs_find_get_deviceid(c, &new->de_id); -	if (d) { -		spin_unlock(&c->dc_lock); -		dprintk("%s [discard]\n", __func__); -		c->dc_free_callback(new); -		return d; +static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) +{ +	struct pnfs_layout_segment *lseg; + +	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { +		if (lseg->pls_range.iomode == IOMODE_RW && +		    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) +			list_add(&lseg->pls_lc_list, listp); +	} +} + +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ +	struct pnfs_layout_segment *lseg, *tmp; + +	/* Matched by references in pnfs_set_layoutcommit */ +	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { +		list_del_init(&lseg->pls_lc_list); +		pnfs_put_lseg(lseg);  	} -	INIT_HLIST_NODE(&new->de_node); -	atomic_set(&new->de_ref, 1); -	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); -	spin_unlock(&c->dc_lock); -	dprintk("%s [new]\n", __func__); -	return new; + +	pnfs_clear_layoutcommitting(inode);  } -EXPORT_SYMBOL_GPL(pnfs_add_deviceid); + +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) +{ +	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); +} +EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);  void -pnfs_put_deviceid_cache(struct nfs_client *clp) +pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)  { -	struct pnfs_deviceid_cache *local = clp->cl_devid_cache; +	struct nfs_pgio_header *hdr = wdata->header; +	struct inode *inode = hdr->inode; +	struct nfs_inode *nfsi = NFS_I(inode); +	loff_t end_pos = wdata->mds_offset + wdata->res.count; +	bool mark_as_dirty = false; -	dprintk("--> %s cl_devid_cache %p\n", __func__, 
clp->cl_devid_cache); -	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { -		int i; -		/* Verify cache is empty */ -		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) -			BUG_ON(!hlist_empty(&local->dc_deviceids[i])); -		clp->cl_devid_cache = NULL; -		spin_unlock(&clp->cl_lock); -		kfree(local); +	spin_lock(&inode->i_lock); +	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { +		mark_as_dirty = true; +		dprintk("%s: Set layoutcommit for inode %lu ", +			__func__, inode->i_ino); +	} +	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { +		/* references matched in nfs4_layoutcommit_release */ +		pnfs_get_lseg(hdr->lseg); +	} +	if (end_pos > nfsi->layout->plh_lwb) +		nfsi->layout->plh_lwb = end_pos; +	spin_unlock(&inode->i_lock); +	dprintk("%s: lseg %p end_pos %llu\n", +		__func__, hdr->lseg, nfsi->layout->plh_lwb); + +	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one +	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ +	if (mark_as_dirty) +		mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); + +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) +{ +	struct nfs_server *nfss = NFS_SERVER(data->args.inode); + +	if (nfss->pnfs_curr_ld->cleanup_layoutcommit) +		nfss->pnfs_curr_ld->cleanup_layoutcommit(data); +	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); +} + +/* + * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and + * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough + * data to disk to allow the server to recover the data if it crashes. + * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag + * is off, and a COMMIT is sent to a data server, or + * if WRITEs to a data server return NFS_DATA_SYNC. 
+ */ +int +pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ +	struct nfs4_layoutcommit_data *data; +	struct nfs_inode *nfsi = NFS_I(inode); +	loff_t end_pos; +	int status; + +	if (!pnfs_layoutcommit_outstanding(inode)) +		return 0; + +	dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + +	status = -EAGAIN; +	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { +		if (!sync) +			goto out; +		status = wait_on_bit_lock(&nfsi->flags, +				NFS_INO_LAYOUTCOMMITTING, +				nfs_wait_bit_killable, +				TASK_KILLABLE); +		if (status) +			goto out; +	} + +	status = -ENOMEM; +	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ +	data = kzalloc(sizeof(*data), GFP_NOFS); +	if (!data) +		goto clear_layoutcommitting; + +	status = 0; +	spin_lock(&inode->i_lock); +	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) +		goto out_unlock; + +	INIT_LIST_HEAD(&data->lseg_list); +	pnfs_list_write_lseg(inode, &data->lseg_list); + +	end_pos = nfsi->layout->plh_lwb; +	nfsi->layout->plh_lwb = 0; + +	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); +	spin_unlock(&inode->i_lock); + +	data->args.inode = inode; +	data->cred = get_rpccred(nfsi->layout->plh_lc_cred); +	nfs_fattr_init(&data->fattr); +	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; +	data->res.fattr = &data->fattr; +	data->args.lastbytewritten = end_pos - 1; +	data->res.server = NFS_SERVER(inode); + +	status = nfs4_proc_layoutcommit(data, sync); +out: +	if (status) +		mark_inode_dirty_sync(inode); +	dprintk("<-- %s status %d\n", __func__, status); +	return status; +out_unlock: +	spin_unlock(&inode->i_lock); +	kfree(data); +clear_layoutcommitting: +	pnfs_clear_layoutcommitting(inode); +	goto out; +} + +struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) +{ +	struct nfs4_threshold *thp; + +	thp = kzalloc(sizeof(*thp), GFP_NOFS); +	if (!thp) { +		dprintk("%s mdsthreshold allocation failed\n", __func__); +		return NULL;  	} +	return thp;  } 
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e12367d5048..4fb309a2b4c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,11 +30,27 @@  #ifndef FS_NFS_PNFS_H  #define FS_NFS_PNFS_H +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> + +enum { +	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */ +	NFS_LSEG_ROC,		/* roc bit received from server */ +	NFS_LSEG_LAYOUTCOMMIT,	/* layoutcommit bit set for layoutcommit */ +}; +  struct pnfs_layout_segment { -	struct list_head fi_list; -	struct pnfs_layout_range range; -	struct kref kref; -	struct pnfs_layout_hdr *layout; +	struct list_head pls_list; +	struct list_head pls_lc_list; +	struct pnfs_layout_range pls_range; +	atomic_t pls_refcount; +	unsigned long pls_flags; +	struct pnfs_layout_hdr *pls_layout; +}; + +enum pnfs_try_status { +	PNFS_ATTEMPTED     = 0, +	PNFS_NOT_ATTEMPTED = 1,  };  #ifdef CONFIG_NFS_V4_1 @@ -44,113 +60,220 @@ struct pnfs_layout_segment {  enum {  	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */  	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */ -	NFS_LAYOUT_STATEID_SET,		/* have a valid layout stateid */ +	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */ +	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */ +	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */ +}; + +enum layoutdriver_policy_flags { +	/* Should the pNFS client commit and return the layout upon a setattr */ +	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0, +	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,  }; +struct nfs4_deviceid_node; +  /* Per-layout driver specific registration structure */  struct pnfs_layoutdriver_type {  	struct list_head pnfs_tblid;  	const u32 id;  	const char *name;  	struct module *owner; -	int (*set_layoutdriver) (struct nfs_server *); +	unsigned flags; + +	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);  	int (*clear_layoutdriver) (struct nfs_server *); -	struct pnfs_layout_segment * (*alloc_lseg) 
(struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); + +	struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); +	void (*free_layout_hdr) (struct pnfs_layout_hdr *); + +	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);  	void (*free_lseg) (struct pnfs_layout_segment *lseg); + +	/* test for nfs page cache coalescing */ +	const struct nfs_pageio_ops *pg_read_ops; +	const struct nfs_pageio_ops *pg_write_ops; + +	struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); +	void (*mark_request_commit) (struct nfs_page *req, +				     struct pnfs_layout_segment *lseg, +				     struct nfs_commit_info *cinfo); +	void (*clear_request_commit) (struct nfs_page *req, +				      struct nfs_commit_info *cinfo); +	int (*scan_commit_lists) (struct nfs_commit_info *cinfo, +				  int max); +	void (*recover_commit_reqs) (struct list_head *list, +				     struct nfs_commit_info *cinfo); +	int (*commit_pagelist)(struct inode *inode, +			       struct list_head *mds_pages, +			       int how, +			       struct nfs_commit_info *cinfo); + +	/* +	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted +	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS +	 */ +	enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); +	enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); + +	void (*free_deviceid_node) (struct nfs4_deviceid_node *); + +	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, +				     struct xdr_stream *xdr, +				     const struct nfs4_layoutreturn_args *args); + +	void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); + +	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, +				     struct xdr_stream *xdr, +				     const struct nfs4_layoutcommit_args *args);  };  struct pnfs_layout_hdr { -	unsigned long		refcount; -	struct list_head	layouts;   
/* other client layouts */ -	struct list_head	segs;      /* layout segments list */ -	seqlock_t		seqlock;   /* Protects the stateid */ -	nfs4_stateid		stateid; -	unsigned long		state; -	struct inode		*inode; +	atomic_t		plh_refcount; +	struct list_head	plh_layouts;   /* other client layouts */ +	struct list_head	plh_bulk_destroy; +	struct list_head	plh_segs;      /* layout segments list */ +	nfs4_stateid		plh_stateid; +	atomic_t		plh_outstanding; /* number of RPCs out */ +	unsigned long		plh_block_lgets; /* block LAYOUTGET if >0 */ +	u32			plh_barrier; /* ignore lower seqids */ +	unsigned long		plh_retry_timestamp; +	unsigned long		plh_flags; +	loff_t			plh_lwb; /* last write byte for layoutcommit */ +	struct rpc_cred		*plh_lc_cred; /* layoutcommit cred */ +	struct inode		*plh_inode;  };  struct pnfs_device {  	struct nfs4_deviceid dev_id;  	unsigned int  layout_type;  	unsigned int  mincount; +	unsigned int  maxcount;	/* gdia_maxcount */  	struct page **pages; -	void          *area;  	unsigned int  pgbase; -	unsigned int  pglen; +	unsigned int  pglen;	/* reply buffer length */  }; -/* - * Device ID RCU cache. A device ID is unique per client ID and layout type. 
- */ -#define NFS4_DEVICE_ID_HASH_BITS	5 -#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS) -#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1) - -static inline u32 -nfs4_deviceid_hash(struct nfs4_deviceid *id) -{ -	unsigned char *cptr = (unsigned char *)id->data; -	unsigned int nbytes = NFS4_DEVICEID4_SIZE; -	u32 x = 0; - -	while (nbytes--) { -		x *= 37; -		x += *cptr++; -	} -	return x & NFS4_DEVICE_ID_HASH_MASK; -} +#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 -struct pnfs_deviceid_node { -	struct hlist_node	de_node; -	struct nfs4_deviceid	de_id; -	atomic_t		de_ref; +struct pnfs_devicelist { +	unsigned int		eof; +	unsigned int		num_devs; +	struct nfs4_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];  }; -struct pnfs_deviceid_cache { -	spinlock_t		dc_lock; -	atomic_t		dc_ref; -	void			(*dc_free_callback)(struct pnfs_deviceid_node *); -	struct hlist_head	dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; -}; - -extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *, -			void (*free_callback)(struct pnfs_deviceid_node *)); -extern void pnfs_put_deviceid_cache(struct nfs_client *); -extern struct pnfs_deviceid_node *pnfs_find_get_deviceid( -				struct pnfs_deviceid_cache *, -				struct nfs4_deviceid *); -extern struct pnfs_deviceid_node *pnfs_add_deviceid( -				struct pnfs_deviceid_cache *, -				struct pnfs_deviceid_node *); -extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, -			      struct pnfs_deviceid_node *devid); -  extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);  extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);  /* nfs4proc.c */ +extern int nfs4_proc_getdevicelist(struct nfs_server *server, +				   const struct nfs_fh *fh, +				   struct pnfs_devicelist *devlist);  extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, -				   struct pnfs_device *dev); -extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); +				   struct pnfs_device *dev, +				   struct rpc_cred *cred); 
+extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);  /* pnfs.c */ -struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, -		   enum pnfs_iomode access_type); -void set_pnfs_layoutdriver(struct nfs_server *, u32 id); +void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_put_lseg(struct pnfs_layout_segment *lseg); + +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);  void unset_pnfs_layoutdriver(struct nfs_server *); -int pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, +			        struct nfs_page *req, u64 wb_size); +int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, +			    struct nfs_page *prev, struct nfs_page *req); +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); +struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_free_lseg_list(struct list_head *tmp_list);  void pnfs_destroy_layout(struct nfs_inode *);  void pnfs_destroy_all_layouts(struct nfs_client *); -void put_layout_hdr(struct inode *inode); -void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, -			     struct nfs4_state *open_state); +int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, +		struct nfs_fsid *fsid, +		bool is_recall); +int pnfs_destroy_layouts_byclid(struct nfs_client *clp, +		bool is_recall); +void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, +			     const nfs4_stateid *new, +			     bool update_barrier); +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, +				 
 struct pnfs_layout_hdr *lo, +				  struct nfs4_state *open_state); +int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, +				struct list_head *tmp_list, +				struct pnfs_layout_range *recall_range); +bool pnfs_roc(struct inode *ino); +void pnfs_roc_release(struct inode *ino); +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); +void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); +int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); +void pnfs_ld_write_done(struct nfs_pgio_data *); +void pnfs_ld_read_done(struct nfs_pgio_data *); +struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, +					       struct nfs_open_context *ctx, +					       loff_t pos, +					       u64 count, +					       enum pnfs_iomode iomode, +					       gfp_t gfp_flags); +void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); +int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, +			const struct nfs_pgio_completion_ops *compl_ops, +			struct nfs_direct_req *dreq); +int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, +			const struct nfs_pgio_completion_ops *compl_ops, +			struct nfs_direct_req *dreq); +struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); -static inline int lo_fail_bit(u32 iomode) +/* nfs4_deviceid_flags */ +enum { +	NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */ +	NFS_DEVICEID_UNAVAILABLE,	/* device temporarily unavailable */ +}; + +/* pnfs_dev.c */ +struct nfs4_deviceid_node { +	struct hlist_node		node; +	struct hlist_node		tmpnode; +	const struct pnfs_layoutdriver_type *ld; +	const struct nfs_client		*nfs_client; +	unsigned long 			flags; +	unsigned long			timestamp_unavailable; +	struct nfs4_deviceid		
deviceid; +	atomic_t			ref; +}; + +struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, +			     const struct pnfs_layoutdriver_type *, +			     const struct nfs_client *, +			     const struct nfs4_deviceid *); +struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); +bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); +bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); +void nfs4_deviceid_purge_client(const struct nfs_client *); + +static inline struct pnfs_layout_segment * +pnfs_get_lseg(struct pnfs_layout_segment *lseg)  { -	return iomode == IOMODE_RW ? -			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +	if (lseg) { +		atomic_inc(&lseg->pls_refcount); +		smp_mb__after_atomic(); +	} +	return lseg;  }  /* Return true if a layout driver is being used for this mountpoint */ @@ -159,6 +282,114 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)  	return nfss->pnfs_curr_ld != NULL;  } +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, +		 struct nfs_commit_info *cinfo) +{ +	if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0) +		return PNFS_NOT_ATTEMPTED; +	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo); +} + +static inline struct pnfs_ds_commit_info * +pnfs_get_ds_info(struct inode *inode) +{ +	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + +	if (ld == NULL || ld->get_ds_info == NULL) +		return NULL; +	return ld->get_ds_info(inode); +} + +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, +			 struct 
nfs_commit_info *cinfo) +{ +	struct inode *inode = req->wb_context->dentry->d_inode; +	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + +	if (lseg == NULL || ld->mark_request_commit == NULL) +		return false; +	ld->mark_request_commit(req, lseg, cinfo); +	return true; +} + +static inline bool +pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) +{ +	struct inode *inode = req->wb_context->dentry->d_inode; +	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + +	if (ld == NULL || ld->clear_request_commit == NULL) +		return false; +	ld->clear_request_commit(req, cinfo); +	return true; +} + +static inline int +pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, +		       int max) +{ +	if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) +		return 0; +	else +		return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); +} + +static inline void +pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, +			 struct nfs_commit_info *cinfo) +{ +	if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) +		return; +	NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); +} + +/* Should the pNFS client commit and return the layout upon a setattr */ +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ +	if (!pnfs_enabled_sb(NFS_SERVER(inode))) +		return false; +	return NFS_SERVER(inode)->pnfs_curr_ld->flags & +		PNFS_LAYOUTRET_ON_SETATTR; +} + +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ +	struct nfs_inode *nfsi = NFS_I(inode); + +	return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 || +		test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0; +} + +static inline int pnfs_return_layout(struct inode *ino) +{ +	struct nfs_inode *nfsi = NFS_I(ino); +	struct nfs_server *nfss = NFS_SERVER(ino); + +	if (pnfs_enabled_sb(nfss) && nfsi->layout) +		return _pnfs_return_layout(ino); + +	return 0; +} + +static inline bool 
+pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, +		   struct nfs_server *nfss) +{ +	return (dst && src && src->bm != 0 && nfss->pnfs_curr_ld && +					nfss->pnfs_curr_ld->id == src->l_type); +} + +#ifdef NFS_DEBUG +void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); +#else +static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) +{ +} +#endif /* NFS_DEBUG */  #else  /* CONFIG_NFS_V4_1 */  static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -170,13 +401,55 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)  }  static inline struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, -		   enum pnfs_iomode access_type) +pnfs_get_lseg(struct pnfs_layout_segment *lseg)  {  	return NULL;  } -static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) +static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline int pnfs_return_layout(struct inode *ino) +{ +	return 0; +} + +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ +	return 0; +} + +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ +	return false; +} + +static inline bool +pnfs_roc(struct inode *ino) +{ +	return false; +} + +static inline void +pnfs_roc_release(struct inode *ino) +{ +} + +static inline void +pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +} + +static inline bool +pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +{ +	return false; +} + +static inline void set_pnfs_layoutdriver(struct nfs_server *s, +					 const struct nfs_fh *mntfh, u32 id)  {  } @@ -184,6 +457,69 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)  {  } +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, +		 struct nfs_commit_info *cinfo) +{ +	return PNFS_NOT_ATTEMPTED; +} + +static inline struct pnfs_ds_commit_info * 
+pnfs_get_ds_info(struct inode *inode) +{ +	return NULL; +} + +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, +			 struct nfs_commit_info *cinfo) +{ +	return false; +} + +static inline bool +pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) +{ +	return false; +} + +static inline int +pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, +		       int max) +{ +	return 0; +} + +static inline void +pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, +			 struct nfs_commit_info *cinfo) +{ +} + +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ +	return 0; +} + +static inline bool +pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, +		   struct nfs_server *nfss) +{ +	return false; +} + +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ +	return false; +} + + +static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) +{ +	return NULL; +} +  #endif /* CONFIG_NFS_V4_1 */  #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c new file mode 100644 index 00000000000..6da209bd940 --- /dev/null +++ b/fs/nfs/pnfs_dev.c @@ -0,0 +1,302 @@ +/* + *  Device operations for the pnfs client. + * + *  Copyright (c) 2002 + *  The Regents of the University of Michigan + *  All Rights Reserved + * + *  Dean Hildebrand <dhildebz@umich.edu> + *  Garth Goodson   <Garth.Goodson@netapp.com> + * + *  Permission is granted to use, copy, create derivative works, and + *  redistribute this software and such derivative works for any purpose, + *  so long as the name of the University of Michigan is not used in + *  any advertising or publicity pertaining to the use or distribution + *  of this software without specific, written prior authorization. 
If + *  the above copyright notice or any other identification of the + *  University of Michigan is included in any copy of any portion of + *  this software, then the disclaimer below must also be included. + * + *  This software is provided as is, without representation or warranty + *  of any kind either express or implied, including without limitation + *  the implied warranties of merchantability, fitness for a particular + *  purpose, or noninfringement.  The Regents of the University of + *  Michigan shall not be liable for any damages, including special, + *  indirect, incidental, or consequential damages, with respect to any + *  claim arising out of or in connection with the use of the software, + *  even if it has been or is hereafter advised of the possibility of + *  such damages. + */ + +#include <linux/export.h> +#include "pnfs.h" + +#define NFSDBG_FACILITY		NFSDBG_PNFS + +/* + * Device ID RCU cache. A device ID is unique per server and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS	5 +#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1) + +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) + +static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfs4_deviceid_lock); + +#ifdef NFS_DEBUG +void +nfs4_print_deviceid(const struct nfs4_deviceid *id) +{ +	u32 *p = (u32 *)id; + +	dprintk("%s: device id= [%x%x%x%x]\n", __func__, +		p[0], p[1], p[2], p[3]); +} +EXPORT_SYMBOL_GPL(nfs4_print_deviceid); +#endif + +static inline u32 +nfs4_deviceid_hash(const struct nfs4_deviceid *id) +{ +	unsigned char *cptr = (unsigned char *)id->data; +	unsigned int nbytes = NFS4_DEVICEID4_SIZE; +	u32 x = 0; + +	while (nbytes--) { +		x *= 37; +		x += *cptr++; +	} +	return x & NFS4_DEVICE_ID_HASH_MASK; +} + +static struct nfs4_deviceid_node * +_lookup_deviceid(const struct pnfs_layoutdriver_type *ld, +		 const struct nfs_client *clp, const struct nfs4_deviceid 
*id, +		 long hash) +{ +	struct nfs4_deviceid_node *d; + +	hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) +		if (d->ld == ld && d->nfs_client == clp && +		    !memcmp(&d->deviceid, id, sizeof(*id))) { +			if (atomic_read(&d->ref)) +				return d; +			else +				continue; +		} +	return NULL; +} + +/* + * Lookup a deviceid in cache and get a reference count on it if found + * + * @clp nfs_client associated with deviceid + * @id deviceid to look up + */ +static struct nfs4_deviceid_node * +_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, +		   const struct nfs_client *clp, const struct nfs4_deviceid *id, +		   long hash) +{ +	struct nfs4_deviceid_node *d; + +	rcu_read_lock(); +	d = _lookup_deviceid(ld, clp, id, hash); +	if (d != NULL) +		atomic_inc(&d->ref); +	rcu_read_unlock(); +	return d; +} + +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, +		       const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ +	return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); +} +EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); + +/* + * Remove a deviceid from cache + * + * @clp nfs_client associated with deviceid + * @id the deviceid to unhash + * + * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. 
+ */ +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, +			 const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ +	struct nfs4_deviceid_node *d; + +	spin_lock(&nfs4_deviceid_lock); +	rcu_read_lock(); +	d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); +	rcu_read_unlock(); +	if (!d) { +		spin_unlock(&nfs4_deviceid_lock); +		return; +	} +	hlist_del_init_rcu(&d->node); +	spin_unlock(&nfs4_deviceid_lock); +	synchronize_rcu(); + +	/* balance the initial ref set in pnfs_insert_deviceid */ +	if (atomic_dec_and_test(&d->ref)) +		d->ld->free_deviceid_node(d); +} +EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); + +void +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, +			const struct pnfs_layoutdriver_type *ld, +			const struct nfs_client *nfs_client, +			const struct nfs4_deviceid *id) +{ +	INIT_HLIST_NODE(&d->node); +	INIT_HLIST_NODE(&d->tmpnode); +	d->ld = ld; +	d->nfs_client = nfs_client; +	d->flags = 0; +	d->deviceid = *id; +	atomic_set(&d->ref, 1); +} +EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); + +/* + * Uniquely initialize and insert a deviceid node into cache + * + * @new new deviceid node + *      Note that the caller must set up the following members: + *        new->ld + *        new->nfs_client + *        new->deviceid + * + * @ret the inserted node, if none found, otherwise, the found entry. 
+ */ +struct nfs4_deviceid_node * +nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) +{ +	struct nfs4_deviceid_node *d; +	long hash; + +	spin_lock(&nfs4_deviceid_lock); +	hash = nfs4_deviceid_hash(&new->deviceid); +	d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash); +	if (d) { +		spin_unlock(&nfs4_deviceid_lock); +		return d; +	} + +	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); +	spin_unlock(&nfs4_deviceid_lock); +	atomic_inc(&new->ref); + +	return new; +} +EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); + +/* + * Dereference a deviceid node and delete it when its reference count drops + * to zero. + * + * @d deviceid node to put + * + * return true iff the node was deleted + * Note that since the test for d->ref == 0 is sufficient to establish + * that the node is no longer hashed in the global device id cache. + */ +bool +nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) +{ +	if (!atomic_dec_and_test(&d->ref)) +		return false; +	d->ld->free_deviceid_node(d); +	return true; +} +EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); + +void +nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ +	node->timestamp_unavailable = jiffies; +	set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); + +bool +nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { +		unsigned long start, end; + +		end = jiffies; +		start = end - PNFS_DEVICE_RETRY_TIMEOUT; +		if (time_in_range(node->timestamp_unavailable, start, end)) +			return true; +		clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); +	} +	return false; +} +EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable); + +static void +_deviceid_purge_client(const struct nfs_client *clp, long hash) +{ +	struct nfs4_deviceid_node *d; +	HLIST_HEAD(tmp); + +	spin_lock(&nfs4_deviceid_lock); +	rcu_read_lock(); +	hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) +		if 
(d->nfs_client == clp && atomic_read(&d->ref)) { +			hlist_del_init_rcu(&d->node); +			hlist_add_head(&d->tmpnode, &tmp); +		} +	rcu_read_unlock(); +	spin_unlock(&nfs4_deviceid_lock); + +	if (hlist_empty(&tmp)) +		return; + +	synchronize_rcu(); +	while (!hlist_empty(&tmp)) { +		d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); +		hlist_del(&d->tmpnode); +		if (atomic_dec_and_test(&d->ref)) +			d->ld->free_deviceid_node(d); +	} +} + +void +nfs4_deviceid_purge_client(const struct nfs_client *clp) +{ +	long h; + +	if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) +		return; +	for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) +		_deviceid_purge_client(clp, h); +} + +/* + * Stop use of all deviceids associated with an nfs_client + */ +void +nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) +{ +	struct nfs4_deviceid_node *d; +	int i; + +	rcu_read_lock(); +	for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ +		hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node) +			if (d->nfs_client == clp) +				set_bit(NFS_DEVICEID_INVALID, &d->flags); +	} +	rcu_read_unlock(); +} + diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 58e7f84fc1f..c171ce1a8a3 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -41,44 +41,12 @@  #include <linux/nfs_fs.h>  #include <linux/nfs_page.h>  #include <linux/lockd/bind.h> +#include <linux/freezer.h>  #include "internal.h"  #define NFSDBG_FACILITY		NFSDBG_PROC  /* - * wrapper to handle the -EKEYEXPIRED error message. This should generally - * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't - * support the NFSERR_JUKEBOX error code, but we handle this situation in the - * same way that we handle that error with NFSv3. 
- */ -static int -nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) -{ -	int res; -	do { -		res = rpc_call_sync(clnt, msg, flags); -		if (res != -EKEYEXPIRED) -			break; -		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); -		res = -ERESTARTSYS; -	} while (!fatal_signal_pending(current)); -	return res; -} - -#define rpc_call_sync(clnt, msg, flags)	nfs_rpc_wrapper(clnt, msg, flags) - -static int -nfs_async_handle_expired_key(struct rpc_task *task) -{ -	if (task->tk_status != -EKEYEXPIRED) -		return 0; -	task->tk_status = 0; -	rpc_restart_call(task); -	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); -	return 1; -} - -/*   * Bare-bones access to getattr: this is for nfs_read_super.   */  static int @@ -130,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,   */  static int  nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, -		struct nfs_fattr *fattr) +		struct nfs_fattr *fattr, struct nfs4_label *label)  {  	struct rpc_message msg = {  		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR], @@ -178,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,  static int  nfs_proc_lookup(struct inode *dir, struct qstr *name, -		struct nfs_fh *fhandle, struct nfs_fattr *fattr) +		struct nfs_fh *fhandle, struct nfs_fattr *fattr, +		struct nfs4_label *label)  {  	struct nfs_diropargs	arg = {  		.fh		= NFS_FH(dir), @@ -258,7 +227,7 @@ static void nfs_free_createdata(const struct nfs_createdata *data)  static int  nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, -		int flags, struct nfs_open_context *ctx) +		int flags)  {  	struct nfs_createdata *data;  	struct rpc_message msg = { @@ -266,7 +235,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	};  	int status = -ENOMEM; -	dprintk("NFS call  create %s\n", dentry->d_name.name); +	dprintk("NFS call  create %pd\n", dentry);  	data = nfs_alloc_createdata(dir, dentry, sattr);  	if (data == 
NULL)  		goto out; @@ -275,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);  	nfs_mark_for_revalidate(dir);  	if (status == 0) -		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); +		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);  	nfs_free_createdata(data);  out:  	dprintk("NFS reply create: %d\n", status); @@ -296,7 +265,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  	umode_t mode;  	int status = -ENOMEM; -	dprintk("NFS call  mknod %s\n", dentry->d_name.name); +	dprintk("NFS call  mknod %pd\n", dentry);  	mode = sattr->ia_mode;  	if (S_ISFIFO(mode)) { @@ -322,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,  		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);  	}  	if (status == 0) -		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); +		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);  	nfs_free_createdata(data);  out:  	dprintk("NFS reply mknod: %d\n", status); @@ -334,8 +303,7 @@ nfs_proc_remove(struct inode *dir, struct qstr *name)  {  	struct nfs_removeargs arg = {  		.fh = NFS_FH(dir), -		.name.len = name->len, -		.name.name = name->name, +		.name = *name,  	};  	struct rpc_message msg = {   		.rpc_proc = &nfs_procedures[NFSPROC_REMOVE], @@ -357,10 +325,13 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];  } +static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ +	rpc_call_start(task); +} +  static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)  { -	if (nfs_async_handle_expired_key(task)) -		return 0;  	nfs_mark_for_revalidate(dir);  	return 1;  } @@ -371,42 +342,21 @@ nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];  } 
+static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ +	rpc_call_start(task); +} +  static int  nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  		     struct inode *new_dir)  { -	if (nfs_async_handle_expired_key(task)) -		return 0;  	nfs_mark_for_revalidate(old_dir);  	nfs_mark_for_revalidate(new_dir);  	return 1;  }  static int -nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, -		struct inode *new_dir, struct qstr *new_name) -{ -	struct nfs_renameargs	arg = { -		.old_dir	= NFS_FH(old_dir), -		.old_name	= old_name, -		.new_dir	= NFS_FH(new_dir), -		.new_name	= new_name, -	}; -	struct rpc_message msg = { -		.rpc_proc	= &nfs_procedures[NFSPROC_RENAME], -		.rpc_argp	= &arg, -	}; -	int			status; - -	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name); -	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); -	nfs_mark_for_revalidate(old_dir); -	nfs_mark_for_revalidate(new_dir); -	dprintk("NFS reply rename: %d\n", status); -	return status; -} - -static int  nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)  {  	struct nfs_linkargs	arg = { @@ -449,7 +399,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,  	};  	int status = -ENAMETOOLONG; -	dprintk("NFS call  symlink %s\n", dentry->d_name.name); +	dprintk("NFS call  symlink %pd\n", dentry);  	if (len > NFS2_MAXPATHLEN)  		goto out; @@ -458,7 +408,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,  	fattr = nfs_alloc_fattr();  	status = -ENOMEM;  	if (fh == NULL || fattr == NULL) -		goto out; +		goto out_free;  	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);  	nfs_mark_for_revalidate(dir); @@ -469,8 +419,9 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,  	 * should fill in the data with a LOOKUP call on the wire.  	 
*/  	if (status == 0) -		status = nfs_instantiate(dentry, fh, fattr); +		status = nfs_instantiate(dentry, fh, fattr, NULL); +out_free:  	nfs_free_fattr(fattr);  	nfs_free_fhandle(fh);  out: @@ -487,7 +438,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)  	};  	int status = -ENOMEM; -	dprintk("NFS call  mkdir %s\n", dentry->d_name.name); +	dprintk("NFS call  mkdir %pd\n", dentry);  	data = nfs_alloc_createdata(dir, dentry, sattr);  	if (data == NULL)  		goto out; @@ -497,7 +448,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)  	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);  	nfs_mark_for_revalidate(dir);  	if (status == 0) -		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); +		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);  	nfs_free_createdata(data);  out:  	dprintk("NFS reply mkdir: %d\n", status); @@ -627,47 +578,56 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,  	return 0;  } -static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)  { -	if (nfs_async_handle_expired_key(task)) -		return -EAGAIN; +	struct inode *inode = data->header->inode; -	nfs_invalidate_atime(data->inode); +	nfs_invalidate_atime(inode);  	if (task->tk_status >= 0) { -		nfs_refresh_inode(data->inode, data->res.fattr); +		nfs_refresh_inode(inode, data->res.fattr);  		/* Emulate the eof flag, which isn't normally needed in NFSv2  		 * as it is guaranteed to always return the file attributes  		 */ -		if (data->args.offset + data->args.count >= data->res.fattr->size) +		if (data->args.offset + data->res.count >= data->res.fattr->size)  			data->res.eof = 1;  	}  	return 0;  } -static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	msg->rpc_proc = 
&nfs_procedures[NFSPROC_READ];  } -static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)  { -	if (nfs_async_handle_expired_key(task)) -		return -EAGAIN; +	rpc_call_start(task); +	return 0; +} + +static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data) +{ +	struct inode *inode = data->header->inode;  	if (task->tk_status >= 0) -		nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); +		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);  	return 0;  } -static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)  {  	/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */  	data->args.stable = NFS_FILE_SYNC;  	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];  } +static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ +	BUG(); +} +  static void -nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)  {  	BUG();  } @@ -675,7 +635,7 @@ nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)  static int  nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)  { -	struct inode *inode = filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(filp);  	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);  } @@ -704,12 +664,47 @@ out_einval:  	return -EINVAL;  } +static int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ +	return 0; +} + +static int nfs_return_delegation(struct inode *inode) +{ +	nfs_wb_all(inode); +	return 0; +} + +static const struct inode_operations nfs_dir_inode_operations = { +	.create		= nfs_create, +	.lookup		= nfs_lookup, +	.link		= nfs_link, +	.unlink		= nfs_unlink, +	.symlink	= nfs_symlink, 
+	.mkdir		= nfs_mkdir, +	.rmdir		= nfs_rmdir, +	.mknod		= nfs_mknod, +	.rename		= nfs_rename, +	.permission	= nfs_permission, +	.getattr	= nfs_getattr, +	.setattr	= nfs_setattr, +}; + +static const struct inode_operations nfs_file_inode_operations = { +	.permission	= nfs_permission, +	.getattr	= nfs_getattr, +	.setattr	= nfs_setattr, +}; +  const struct nfs_rpc_ops nfs_v2_clientops = {  	.version	= 2,		       /* protocol version */  	.dentry_ops	= &nfs_dentry_operations,  	.dir_inode_ops	= &nfs_dir_inode_operations,  	.file_inode_ops	= &nfs_file_inode_operations, +	.file_ops	= &nfs_file_operations,  	.getroot	= nfs_proc_get_root, +	.submount	= nfs_submount, +	.try_mount	= nfs_try_mount,  	.getattr	= nfs_proc_getattr,  	.setattr	= nfs_proc_setattr,  	.lookup		= nfs_proc_lookup, @@ -718,9 +713,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = {  	.create		= nfs_proc_create,  	.remove		= nfs_proc_remove,  	.unlink_setup	= nfs_proc_unlink_setup, +	.unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,  	.unlink_done	= nfs_proc_unlink_done, -	.rename		= nfs_proc_rename,  	.rename_setup	= nfs_proc_rename_setup, +	.rename_rpc_prepare = nfs_proc_rename_rpc_prepare,  	.rename_done	= nfs_proc_rename_done,  	.link		= nfs_proc_link,  	.symlink	= nfs_proc_symlink, @@ -731,13 +727,22 @@ const struct nfs_rpc_ops nfs_v2_clientops = {  	.statfs		= nfs_proc_statfs,  	.fsinfo		= nfs_proc_fsinfo,  	.pathconf	= nfs_proc_pathconf, -	.decode_dirent	= nfs_decode_dirent, +	.decode_dirent	= nfs2_decode_dirent, +	.pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare,  	.read_setup	= nfs_proc_read_setup,  	.read_done	= nfs_read_done,  	.write_setup	= nfs_proc_write_setup,  	.write_done	= nfs_write_done,  	.commit_setup	= nfs_proc_commit_setup, +	.commit_rpc_prepare = nfs_proc_commit_rpc_prepare,  	.lock		= nfs_proc_lock,  	.lock_check_bounds = nfs_lock_check_bounds,  	.close_context	= nfs_close_context, +	.have_delegation = nfs_have_delegation, +	.return_delegation = nfs_return_delegation, +	
.alloc_client	= nfs_alloc_client, +	.init_client	= nfs_init_client, +	.free_client	= nfs_free_client, +	.create_server	= nfs_create_server, +	.clone_server	= nfs_clone_server,  }; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e4b62c6f5a6..e818a475ca6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,8 +18,7 @@  #include <linux/sunrpc/clnt.h>  #include <linux/nfs_fs.h>  #include <linux/nfs_page.h> - -#include <asm/system.h> +#include <linux/module.h>  #include "nfs4_fs.h"  #include "internal.h" @@ -29,48 +28,19 @@  #define NFSDBG_FACILITY		NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); -static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); -static const struct rpc_call_ops nfs_read_partial_ops; -static const struct rpc_call_ops nfs_read_full_ops; +static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +static const struct nfs_rw_ops nfs_rw_read_ops;  static struct kmem_cache *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; - -#define MIN_POOL_READ	(32) - -struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) -{ -	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); - -	if (p) { -		memset(p, 0, sizeof(*p)); -		INIT_LIST_HEAD(&p->pages); -		p->npages = pagecount; -		if (pagecount <= ARRAY_SIZE(p->page_array)) -			p->pagevec = p->page_array; -		else { -			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); -			if (!p->pagevec) { -				mempool_free(p, nfs_rdata_mempool); -				p = NULL; -			} -		} -	} -	return p; -} -void nfs_readdata_free(struct nfs_read_data *p) +static struct nfs_rw_header *nfs_readhdr_alloc(void)  { -	if (p && (p->pagevec != &p->page_array[0])) -		kfree(p->pagevec); -	mempool_free(p, nfs_rdata_mempool); +	return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);  } -static void nfs_readdata_release(struct nfs_read_data *rdata) +static void nfs_readhdr_free(struct nfs_rw_header 
*rhdr)  { -	put_nfs_open_context(rdata->args.context); -	nfs_readdata_free(rdata); +	kmem_cache_free(nfs_rdata_cachep, rhdr);  }  static @@ -82,47 +52,40 @@ int nfs_return_empty_page(struct page *page)  	return 0;  } -static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) +void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, +			      struct inode *inode, bool force_mds, +			      const struct nfs_pgio_completion_ops *compl_ops)  { -	unsigned int remainder = data->args.count - data->res.count; -	unsigned int base = data->args.pgbase + data->res.count; -	unsigned int pglen; -	struct page **pages; +	struct nfs_server *server = NFS_SERVER(inode); +	const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 +	if (server->pnfs_curr_ld && !force_mds) +		pg_ops = server->pnfs_curr_ld->pg_read_ops; +#endif +	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, +			server->rsize, 0); +} +EXPORT_SYMBOL_GPL(nfs_pageio_init_read); -	if (data->res.eof == 0 || remainder == 0) -		return; -	/* -	 * Note: "remainder" can never be negative, since we check for -	 * 	this in the XDR code. 
-	 */ -	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; -	base &= ~PAGE_CACHE_MASK; -	pglen = PAGE_CACHE_SIZE - base; -	for (;;) { -		if (remainder <= pglen) { -			zero_user(*pages, base, remainder); -			break; -		} -		zero_user(*pages, base, pglen); -		pages++; -		remainder -= pglen; -		pglen = PAGE_CACHE_SIZE; -		base = 0; -	} +void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) +{ +	pgio->pg_ops = &nfs_pgio_rw_ops; +	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;  } +EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);  int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,  		       struct page *page)  { -	LIST_HEAD(one_request);  	struct nfs_page	*new;  	unsigned int len; +	struct nfs_pageio_descriptor pgio;  	len = nfs_page_length(page);  	if (len == 0)  		return nfs_return_empty_page(page); -	pnfs_update_layout(inode, ctx, IOMODE_READ); -	new = nfs_create_request(ctx, inode, page, 0, len); +	new = nfs_create_request(ctx, page, NULL, 0, len);  	if (IS_ERR(new)) {  		unlock_page(page);  		return PTR_ERR(new); @@ -130,370 +93,171 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,  	if (len < PAGE_CACHE_SIZE)  		zero_user_segment(page, len, PAGE_CACHE_SIZE); -	nfs_list_add_request(new, &one_request); -	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) -		nfs_pagein_multi(inode, &one_request, 1, len, 0); -	else -		nfs_pagein_one(inode, &one_request, 1, len, 0); +	nfs_pageio_init_read(&pgio, inode, false, +			     &nfs_async_read_completion_ops); +	nfs_pageio_add_request(&pgio, new); +	nfs_pageio_complete(&pgio); +	NFS_I(inode)->read_io += pgio.pg_bytes_written;  	return 0;  }  static void nfs_readpage_release(struct nfs_page *req)  { -	struct inode *d_inode = req->wb_context->path.dentry->d_inode; +	struct inode *d_inode = req->wb_context->dentry->d_inode; -	if (PageUptodate(req->wb_page)) -		nfs_readpage_to_fscache(d_inode, req->wb_page, 0); +	dprintk("NFS: read done (%s/%llu %d@%lld)\n", 
d_inode->i_sb->s_id, +		(unsigned long long)NFS_FILEID(d_inode), req->wb_bytes, +		(long long)req_offset(req)); -	unlock_page(req->wb_page); +	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { +		if (PageUptodate(req->wb_page)) +			nfs_readpage_to_fscache(d_inode, req->wb_page, 0); -	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", -			req->wb_context->path.dentry->d_inode->i_sb->s_id, -			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), +		unlock_page(req->wb_page); +	} + +	dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", +			req->wb_context->dentry->d_inode->i_sb->s_id, +			(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),  			req->wb_bytes,  			(long long)req_offset(req)); -	nfs_clear_request(req);  	nfs_release_request(req);  } -/* - * Set up the NFS read request struct - */ -static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, -		const struct rpc_call_ops *call_ops, -		unsigned int count, unsigned int offset) +static void nfs_page_group_set_uptodate(struct nfs_page *req)  { -	struct inode *inode = req->wb_context->path.dentry->d_inode; -	int swap_flags = IS_SWAPFILE(inode) ? 
NFS_RPC_SWAPFLAGS : 0; -	struct rpc_task *task; -	struct rpc_message msg = { -		.rpc_argp = &data->args, -		.rpc_resp = &data->res, -		.rpc_cred = req->wb_context->cred, -	}; -	struct rpc_task_setup task_setup_data = { -		.task = &data->task, -		.rpc_client = NFS_CLIENT(inode), -		.rpc_message = &msg, -		.callback_ops = call_ops, -		.callback_data = data, -		.workqueue = nfsiod_workqueue, -		.flags = RPC_TASK_ASYNC | swap_flags, -	}; - -	data->req	  = req; -	data->inode	  = inode; -	data->cred	  = msg.rpc_cred; - -	data->args.fh     = NFS_FH(inode); -	data->args.offset = req_offset(req) + offset; -	data->args.pgbase = req->wb_pgbase + offset; -	data->args.pages  = data->pagevec; -	data->args.count  = count; -	data->args.context = get_nfs_open_context(req->wb_context); -	data->args.lock_context = req->wb_lock_context; - -	data->res.fattr   = &data->fattr; -	data->res.count   = count; -	data->res.eof     = 0; -	nfs_fattr_init(&data->fattr); - -	/* Set up the initial task struct. */ -	NFS_PROTO(inode)->read_setup(data, &msg); - -	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", -			data->task.tk_pid, -			inode->i_sb->s_id, -			(long long)NFS_FILEID(inode), -			count, -			(unsigned long long)data->args.offset); - -	task = rpc_run_task(&task_setup_data); -	if (IS_ERR(task)) -		return PTR_ERR(task); -	rpc_put_task(task); -	return 0; +	if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) +		SetPageUptodate(req->wb_page);  } -static void -nfs_async_read_error(struct list_head *head) +static void nfs_read_completion(struct nfs_pgio_header *hdr)  { -	struct nfs_page	*req; +	unsigned long bytes = 0; -	while (!list_empty(head)) { -		req = nfs_list_entry(head->next); +	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) +		goto out; +	while (!list_empty(&hdr->pages)) { +		struct nfs_page *req = nfs_list_entry(hdr->pages.next); +		struct page *page = req->wb_page; +		unsigned long start = req->wb_pgbase; +		unsigned long end = req->wb_pgbase + req->wb_bytes; + +	
	if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { +			/* note: regions of the page not covered by a +			 * request are zeroed in nfs_readpage_async / +			 * readpage_async_filler */ +			if (bytes > hdr->good_bytes) { +				/* nothing in this request was good, so zero +				 * the full extent of the request */ +				zero_user_segment(page, start, end); + +			} else if (hdr->good_bytes - bytes < req->wb_bytes) { +				/* part of this request has good bytes, but +				 * not all. zero the bad bytes */ +				start += hdr->good_bytes - bytes; +				WARN_ON(start < req->wb_pgbase); +				zero_user_segment(page, start, end); +			} +		} +		bytes += req->wb_bytes; +		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { +			if (bytes <= hdr->good_bytes) +				nfs_page_group_set_uptodate(req); +		} else +			nfs_page_group_set_uptodate(req);  		nfs_list_remove_request(req); -		SetPageError(req->wb_page);  		nfs_readpage_release(req);  	} +out: +	hdr->release(hdr);  } -/* - * Generate multiple requests to fill a single page. - * - * We optimize to reduce the number of read operations on the wire.  If we - * detect that we're reading a page, or an area of a page, that is past the - * end of file, we do not generate NFS read operations but just clear the - * parts of the page that would have come back zero from the server anyway. - * - * We rely on the cached value of i_size to make this determination; another - * client can fill pages on the server past our cached end-of-file, but we - * won't see the new data until our attribute cache is updated.  This is more - * or less conventional NFS client behavior. 
- */ -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, +			      struct rpc_task_setup *task_setup_data, int how)  { -	struct nfs_page *req = nfs_list_entry(head->next); -	struct page *page = req->wb_page; -	struct nfs_read_data *data; -	size_t rsize = NFS_SERVER(inode)->rsize, nbytes; -	unsigned int offset; -	int requests = 0; -	int ret = 0; -	LIST_HEAD(list); - -	nfs_list_remove_request(req); - -	nbytes = count; -	do { -		size_t len = min(nbytes,rsize); - -		data = nfs_readdata_alloc(1); -		if (!data) -			goto out_bad; -		list_add(&data->pages, &list); -		requests++; -		nbytes -= len; -	} while(nbytes != 0); -	atomic_set(&req->wb_complete, requests); - -	ClearPageError(page); -	offset = 0; -	nbytes = count; -	do { -		int ret2; - -		data = list_entry(list.next, struct nfs_read_data, pages); -		list_del_init(&data->pages); - -		data->pagevec[0] = page; - -		if (nbytes < rsize) -			rsize = nbytes; -		ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, -				  rsize, offset); -		if (ret == 0) -			ret = ret2; -		offset += rsize; -		nbytes -= rsize; -	} while (nbytes != 0); - -	return ret; +	struct inode *inode = data->header->inode; +	int swap_flags = IS_SWAPFILE(inode) ? 
NFS_RPC_SWAPFLAGS : 0; -out_bad: -	while (!list_empty(&list)) { -		data = list_entry(list.next, struct nfs_read_data, pages); -		list_del(&data->pages); -		nfs_readdata_free(data); -	} -	SetPageError(page); -	nfs_readpage_release(req); -	return -ENOMEM; +	task_setup_data->flags |= swap_flags; +	NFS_PROTO(inode)->read_setup(data, msg);  } -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static void +nfs_async_read_error(struct list_head *head)  { -	struct nfs_page		*req; -	struct page		**pages; -	struct nfs_read_data	*data; -	int ret = -ENOMEM; - -	data = nfs_readdata_alloc(npages); -	if (!data) -		goto out_bad; +	struct nfs_page	*req; -	pages = data->pagevec;  	while (!list_empty(head)) {  		req = nfs_list_entry(head->next);  		nfs_list_remove_request(req); -		nfs_list_add_request(req, &data->pages); -		ClearPageError(req->wb_page); -		*pages++ = req->wb_page; +		nfs_readpage_release(req);  	} -	req = nfs_list_entry(data->pages.next); - -	return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); -out_bad: -	nfs_async_read_error(head); -	return ret;  } +static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { +	.error_cleanup = nfs_async_read_error, +	.completion = nfs_read_completion, +}; +  /*   * This is the callback from RPC telling us whether a reply was   * received or some error occurred (timeout or socket shutdown).   
*/ -int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, +			     struct inode *inode)  { -	int status; - -	dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, -			task->tk_status); - -	status = NFS_PROTO(data->inode)->read_done(task, data); +	int status = NFS_PROTO(inode)->read_done(task, data);  	if (status != 0)  		return status; -	nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count); +	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);  	if (task->tk_status == -ESTALE) { -		set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags); -		nfs_mark_for_revalidate(data->inode); +		set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); +		nfs_mark_for_revalidate(inode);  	}  	return 0;  } -static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)  { -	struct nfs_readargs *argp = &data->args; -	struct nfs_readres *resp = &data->res; - -	if (resp->eof || resp->count == argp->count) -		return; +	struct nfs_pgio_args *argp = &data->args; +	struct nfs_pgio_res  *resp = &data->res;  	/* This is a short read! */ -	nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); +	nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);  	/* Has the server at least made some progress? */ -	if (resp->count == 0) +	if (resp->count == 0) { +		nfs_set_pgio_error(data->header, -EIO, argp->offset);  		return; - +	}  	/* Yes, so retry the read at the end of the data */ +	data->mds_offset += resp->count;  	argp->offset += resp->count;  	argp->pgbase += resp->count;  	argp->count -= resp->count; -	nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); -} - -/* - * Handle a read reply that fills part of a page. 
- */ -static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) -{ -	struct nfs_read_data *data = calldata; -  -	if (nfs_readpage_result(task, data) != 0) -		return; -	if (task->tk_status < 0) -		return; - -	nfs_readpage_truncate_uninitialised_page(data); -	nfs_readpage_retry(task, data); +	rpc_restart_call_prepare(task);  } -static void nfs_readpage_release_partial(void *calldata) +static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)  { -	struct nfs_read_data *data = calldata; -	struct nfs_page *req = data->req; -	struct page *page = req->wb_page; -	int status = data->task.tk_status; +	struct nfs_pgio_header *hdr = data->header; -	if (status < 0) -		SetPageError(page); +	if (data->res.eof) { +		loff_t bound; -	if (atomic_dec_and_test(&req->wb_complete)) { -		if (!PageError(page)) -			SetPageUptodate(page); -		nfs_readpage_release(req); -	} -	nfs_readdata_release(calldata); -} - -#if defined(CONFIG_NFS_V4_1) -void nfs_read_prepare(struct rpc_task *task, void *calldata) -{ -	struct nfs_read_data *data = calldata; - -	if (nfs4_setup_sequence(NFS_SERVER(data->inode), -				&data->args.seq_args, &data->res.seq_res, -				0, task)) -		return; -	rpc_call_start(task); -} -#endif /* CONFIG_NFS_V4_1 */ - -static const struct rpc_call_ops nfs_read_partial_ops = { -#if defined(CONFIG_NFS_V4_1) -	.rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */ -	.rpc_call_done = nfs_readpage_result_partial, -	.rpc_release = nfs_readpage_release_partial, -}; - -static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) -{ -	unsigned int count = data->res.count; -	unsigned int base = data->args.pgbase; -	struct page **pages; - -	if (data->res.eof) -		count = data->args.count; -	if (unlikely(count == 0)) -		return; -	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; -	base &= ~PAGE_CACHE_MASK; -	count += base; -	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) -		SetPageUptodate(*pages); -	if 
(count == 0) -		return; -	/* Was this a short read? */ -	if (data->res.eof || data->res.count == data->args.count) -		SetPageUptodate(*pages); -} - -/* - * This is the callback from RPC telling us whether a reply was - * received or some error occurred (timeout or socket shutdown). - */ -static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) -{ -	struct nfs_read_data *data = calldata; - -	if (nfs_readpage_result(task, data) != 0) -		return; -	if (task->tk_status < 0) -		return; -	/* -	 * Note: nfs_readpage_retry may change the values of -	 * data->args. In the multi-page case, we therefore need -	 * to ensure that we call nfs_readpage_set_pages_uptodate() -	 * first. -	 */ -	nfs_readpage_truncate_uninitialised_page(data); -	nfs_readpage_set_pages_uptodate(data); -	nfs_readpage_retry(task, data); -} - -static void nfs_readpage_release_full(void *calldata) -{ -	struct nfs_read_data *data = calldata; - -	while (!list_empty(&data->pages)) { -		struct nfs_page *req = nfs_list_entry(data->pages.next); - -		nfs_list_remove_request(req); -		nfs_readpage_release(req); -	} -	nfs_readdata_release(calldata); +		bound = data->args.offset + data->res.count; +		spin_lock(&hdr->lock); +		if (bound < hdr->io_start + hdr->good_bytes) { +			set_bit(NFS_IOHDR_EOF, &hdr->flags); +			clear_bit(NFS_IOHDR_ERROR, &hdr->flags); +			hdr->good_bytes = bound - hdr->io_start; +		} +		spin_unlock(&hdr->lock); +	} else if (data->res.count != data->args.count) +		nfs_readpage_retry(task, data);  } -static const struct rpc_call_ops nfs_read_full_ops = { -#if defined(CONFIG_NFS_V4_1) -	.rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */ -	.rpc_call_done = nfs_readpage_result_full, -	.rpc_release = nfs_readpage_release_full, -}; -  /*   * Read a page over NFS.   
* We read the page synchronously in the following case: @@ -503,11 +267,11 @@ static const struct rpc_call_ops nfs_read_full_ops = {  int nfs_readpage(struct file *file, struct page *page)  {  	struct nfs_open_context *ctx; -	struct inode *inode = page->mapping->host; +	struct inode *inode = page_file_mapping(page)->host;  	int		error;  	dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", -		page, PAGE_CACHE_SIZE, page->index); +		page, PAGE_CACHE_SIZE, page_file_index(page));  	nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);  	nfs_add_stats(inode, NFSIOS_READPAGES, 1); @@ -561,7 +325,6 @@ static int  readpage_async_filler(void *data, struct page *page)  {  	struct nfs_readdesc *desc = (struct nfs_readdesc *)data; -	struct inode *inode = page->mapping->host;  	struct nfs_page *new;  	unsigned int len;  	int error; @@ -570,7 +333,7 @@ readpage_async_filler(void *data, struct page *page)  	if (len == 0)  		return nfs_return_empty_page(page); -	new = nfs_create_request(desc->ctx, inode, page, 0, len); +	new = nfs_create_request(desc->ctx, page, NULL, 0, len);  	if (IS_ERR(new))  		goto out_error; @@ -583,7 +346,6 @@ readpage_async_filler(void *data, struct page *page)  	return 0;  out_error:  	error = PTR_ERR(new); -	SetPageError(page);  out_unlock:  	unlock_page(page);  	return error; @@ -597,14 +359,12 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,  		.pgio = &pgio,  	};  	struct inode *inode = mapping->host; -	struct nfs_server *server = NFS_SERVER(inode); -	size_t rsize = server->rsize;  	unsigned long npages;  	int ret = -ESTALE; -	dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", +	dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",  			inode->i_sb->s_id, -			(long long)NFS_FILEID(inode), +			(unsigned long long)NFS_FILEID(inode),  			nr_pages);  	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); @@ -626,15 +386,13 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,  	if (ret == 0)  		goto read_complete; /* all pages were read */ -	
pnfs_update_layout(inode, desc.ctx, IOMODE_READ); -	if (rsize < PAGE_CACHE_SIZE) -		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); -	else -		nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0); +	nfs_pageio_init_read(&pgio, inode, false, +			     &nfs_async_read_completion_ops);  	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);  	nfs_pageio_complete(&pgio); +	NFS_I(inode)->read_io += pgio.pg_bytes_written;  	npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;  	nfs_add_stats(inode, NFSIOS_READPAGES, npages);  read_complete: @@ -646,22 +404,25 @@ out:  int __init nfs_init_readpagecache(void)  {  	nfs_rdata_cachep = kmem_cache_create("nfs_read_data", -					     sizeof(struct nfs_read_data), +					     sizeof(struct nfs_rw_header),  					     0, SLAB_HWCACHE_ALIGN,  					     NULL);  	if (nfs_rdata_cachep == NULL)  		return -ENOMEM; -	nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, -						     nfs_rdata_cachep); -	if (nfs_rdata_mempool == NULL) -		return -ENOMEM; -  	return 0;  }  void nfs_destroy_readpagecache(void)  { -	mempool_destroy(nfs_rdata_mempool);  	kmem_cache_destroy(nfs_rdata_cachep);  } + +static const struct nfs_rw_ops nfs_rw_read_ops = { +	.rw_mode		= FMODE_READ, +	.rw_alloc_header	= nfs_readhdr_alloc, +	.rw_free_header		= nfs_readhdr_free, +	.rw_done		= nfs_readpage_done, +	.rw_result		= nfs_readpage_result, +	.rw_initiate		= nfs_initiate_read, +}; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0a42e8f4adc..084af1060d7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -31,6 +31,7 @@  #include <linux/errno.h>  #include <linux/unistd.h>  #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h>  #include <linux/sunrpc/stats.h>  #include <linux/sunrpc/metrics.h>  #include <linux/sunrpc/xprtsock.h> @@ -39,10 +40,8 @@  #include <linux/nfs_mount.h>  #include <linux/nfs4_mount.h>  #include <linux/lockd/bind.h> -#include <linux/smp_lock.h>  #include <linux/seq_file.h>  
#include <linux/mount.h> -#include <linux/mnt_namespace.h>  #include <linux/namei.h>  #include <linux/nfs_idmap.h>  #include <linux/vfs.h> @@ -54,8 +53,9 @@  #include <linux/nfs_xdr.h>  #include <linux/magic.h>  #include <linux/parser.h> +#include <linux/nsproxy.h> +#include <linux/rcupdate.h> -#include <asm/system.h>  #include <asm/uaccess.h>  #include "nfs4_fs.h" @@ -64,8 +64,18 @@  #include "iostat.h"  #include "internal.h"  #include "fscache.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "nfs.h"  #define NFSDBG_FACILITY		NFSDBG_VFS +#define NFS_TEXT_DATA		1 + +#if IS_ENABLED(CONFIG_NFS_V3) +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif  enum {  	/* Mount options that take no arguments */ @@ -74,13 +84,13 @@ enum {  	Opt_cto, Opt_nocto,  	Opt_ac, Opt_noac,  	Opt_lock, Opt_nolock, -	Opt_v2, Opt_v3, Opt_v4,  	Opt_udp, Opt_tcp, Opt_rdma,  	Opt_acl, Opt_noacl,  	Opt_rdirplus, Opt_nordirplus,  	Opt_sharecache, Opt_nosharecache,  	Opt_resvport, Opt_noresvport,  	Opt_fscache, Opt_nofscache, +	Opt_migration, Opt_nomigration,  	/* Mount options that take integer arguments */  	Opt_port, @@ -92,10 +102,10 @@ enum {  	Opt_namelen,  	Opt_mountport,  	Opt_mountvers, -	Opt_nfsvers,  	Opt_minorversion,  	/* Mount options that take string arguments */ +	Opt_nfsvers,  	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,  	Opt_addr, Opt_mountaddr, Opt_clientaddr,  	Opt_lookupcache, @@ -127,9 +137,6 @@ static const match_table_t nfs_mount_option_tokens = {  	{ Opt_noac, "noac" },  	{ Opt_lock, "lock" },  	{ Opt_nolock, "nolock" }, -	{ Opt_v2, "v2" }, -	{ Opt_v3, "v3" }, -	{ Opt_v4, "v4" },  	{ Opt_udp, "udp" },  	{ Opt_tcp, "tcp" },  	{ Opt_rdma, "rdma" }, @@ -143,6 +150,8 @@ static const match_table_t nfs_mount_option_tokens = {  	{ Opt_noresvport, "noresvport" },  	{ Opt_fscache, "fsc" },  	{ Opt_nofscache, "nofsc" }, +	{ Opt_migration, "migration" }, +	{ Opt_nomigration, "nomigration" },  	{ Opt_port, "port=%s" },  	{ Opt_rsize, 
"rsize=%s" }, @@ -158,9 +167,10 @@ static const match_table_t nfs_mount_option_tokens = {  	{ Opt_namelen, "namlen=%s" },  	{ Opt_mountport, "mountport=%s" },  	{ Opt_mountvers, "mountvers=%s" }, +	{ Opt_minorversion, "minorversion=%s" }, +  	{ Opt_nfsvers, "nfsvers=%s" },  	{ Opt_nfsvers, "vers=%s" }, -	{ Opt_minorversion, "minorversion=%s" },  	{ Opt_sec, "sec=%s" },  	{ Opt_proto, "proto=%s" }, @@ -174,6 +184,9 @@ static const match_table_t nfs_mount_option_tokens = {  	{ Opt_fscache_uniq, "fsc=%s" },  	{ Opt_local_lock, "local_lock=%s" }, +	/* The following needs to be listed after all other options */ +	{ Opt_nfsvers, "v%s" }, +  	{ Opt_err, NULL }  }; @@ -254,120 +267,101 @@ static match_table_t nfs_local_lock_tokens = {  	{ Opt_local_lock_err, NULL }  }; +enum { +	Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, +	Opt_vers_4_1, Opt_vers_4_2, + +	Opt_vers_err +}; + +static match_table_t nfs_vers_tokens = { +	{ Opt_vers_2, "2" }, +	{ Opt_vers_3, "3" }, +	{ Opt_vers_4, "4" }, +	{ Opt_vers_4_0, "4.0" }, +	{ Opt_vers_4_1, "4.1" }, +	{ Opt_vers_4_2, "4.2" }, + +	{ Opt_vers_err, NULL } +}; -static void nfs_umount_begin(struct super_block *); -static int  nfs_statfs(struct dentry *, struct kstatfs *); -static int  nfs_show_options(struct seq_file *, struct vfsmount *); -static int  nfs_show_stats(struct seq_file *, struct vfsmount *); -static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);  static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,  		int flags, const char *dev_name, void *raw_data); -static void nfs_put_super(struct super_block *); -static void nfs_kill_super(struct super_block *); -static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); -static struct file_system_type nfs_fs_type = { +struct file_system_type nfs_fs_type = {  	.owner		= THIS_MODULE,  	.name		= "nfs", -	.get_sb		= nfs_get_sb, +	.mount		= nfs_fs_mount,  	.kill_sb	= nfs_kill_super, -	.fs_flags	= 
FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  }; +MODULE_ALIAS_FS("nfs"); +EXPORT_SYMBOL_GPL(nfs_fs_type);  struct file_system_type nfs_xdev_fs_type = {  	.owner		= THIS_MODULE,  	.name		= "nfs",  	.mount		= nfs_xdev_mount,  	.kill_sb	= nfs_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  }; -static const struct super_operations nfs_sops = { +const struct super_operations nfs_sops = {  	.alloc_inode	= nfs_alloc_inode,  	.destroy_inode	= nfs_destroy_inode,  	.write_inode	= nfs_write_inode, +	.drop_inode	= nfs_drop_inode,  	.put_super	= nfs_put_super,  	.statfs		= nfs_statfs,  	.evict_inode	= nfs_evict_inode,  	.umount_begin	= nfs_umount_begin,  	.show_options	= nfs_show_options, +	.show_devname	= nfs_show_devname, +	.show_path	= nfs_show_path,  	.show_stats	= nfs_show_stats,  	.remount_fs	= nfs_remount,  }; +EXPORT_SYMBOL_GPL(nfs_sops); -#ifdef CONFIG_NFS_V4 -static int nfs4_validate_text_mount_data(void *options, +#if IS_ENABLED(CONFIG_NFS_V4) +static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *); +static int nfs4_validate_mount_data(void *options,  	struct nfs_parsed_mount_data *args, const char *dev_name); -static int nfs4_try_mount(int flags, const char *dev_name, -	struct nfs_parsed_mount_data *data, struct vfsmount *mnt); -static int nfs4_get_sb(struct file_system_type *fs_type, -	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, -	int flags, const char *dev_name, void *raw_data); -static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, -	int flags, const char *dev_name, void *raw_data); -static int nfs4_referral_get_sb(struct file_system_type *fs_type, -	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static struct dentry 
*nfs4_remote_referral_mount(struct file_system_type *fs_type, -	int flags, const char *dev_name, void *raw_data); -static void nfs4_kill_super(struct super_block *sb); - -static struct file_system_type nfs4_fs_type = { -	.owner		= THIS_MODULE, -	.name		= "nfs4", -	.get_sb		= nfs4_get_sb, -	.kill_sb	= nfs4_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; - -static struct file_system_type nfs4_remote_fs_type = { -	.owner		= THIS_MODULE, -	.name		= "nfs4", -	.mount		= nfs4_remote_mount, -	.kill_sb	= nfs4_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; -struct file_system_type nfs4_xdev_fs_type = { +struct file_system_type nfs4_fs_type = {  	.owner		= THIS_MODULE,  	.name		= "nfs4", -	.mount		= nfs4_xdev_mount, -	.kill_sb	= nfs4_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +	.mount		= nfs_fs_mount, +	.kill_sb	= nfs_kill_super, +	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,  }; +MODULE_ALIAS_FS("nfs4"); +MODULE_ALIAS("nfs4"); +EXPORT_SYMBOL_GPL(nfs4_fs_type); -static struct file_system_type nfs4_remote_referral_fs_type = { -	.owner		= THIS_MODULE, -	.name		= "nfs4", -	.mount		= nfs4_remote_referral_mount, -	.kill_sb	= nfs4_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; +static int __init register_nfs4_fs(void) +{ +	return register_filesystem(&nfs4_fs_type); +} -struct file_system_type nfs4_referral_fs_type = { -	.owner		= THIS_MODULE, -	.name		= "nfs4", -	.get_sb		= nfs4_referral_get_sb, -	.kill_sb	= nfs4_kill_super, -	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; +static void unregister_nfs4_fs(void) +{ +	unregister_filesystem(&nfs4_fs_type); +} +#else +static int __init register_nfs4_fs(void) +{ +	return 0; +} -static const struct super_operations nfs4_sops = { -	.alloc_inode	= nfs_alloc_inode, -	.destroy_inode	= nfs_destroy_inode, -	.write_inode	= nfs_write_inode, -	.put_super	= 
nfs_put_super, -	.statfs		= nfs_statfs, -	.evict_inode	= nfs4_evict_inode, -	.umount_begin	= nfs_umount_begin, -	.show_options	= nfs_show_options, -	.show_stats	= nfs_show_stats, -	.remount_fs	= nfs_remount, -}; +static void unregister_nfs4_fs(void) +{ +}  #endif  static struct shrinker acl_shrinker = { -	.shrink		= nfs_access_cache_shrinker, +	.count_objects	= nfs_access_cache_count, +	.scan_objects	= nfs_access_cache_scan,  	.seeks		= DEFAULT_SEEKS,  }; @@ -382,21 +376,18 @@ int __init register_nfs_fs(void)  	if (ret < 0)  		goto error_0; -	ret = nfs_register_sysctl(); +	ret = register_nfs4_fs();  	if (ret < 0)  		goto error_1; -#ifdef CONFIG_NFS_V4 -	ret = register_filesystem(&nfs4_fs_type); + +	ret = nfs_register_sysctl();  	if (ret < 0)  		goto error_2; -#endif  	register_shrinker(&acl_shrinker);  	return 0; -#ifdef CONFIG_NFS_V4  error_2: -	nfs_unregister_sysctl(); -#endif +	unregister_nfs4_fs();  error_1:  	unregister_filesystem(&nfs_fs_type);  error_0: @@ -409,10 +400,8 @@ error_0:  void __exit unregister_nfs_fs(void)  {  	unregister_shrinker(&acl_shrinker); -#ifdef CONFIG_NFS_V4 -	unregister_filesystem(&nfs4_fs_type); -#endif  	nfs_unregister_sysctl(); +	unregister_nfs4_fs();  	unregister_filesystem(&nfs_fs_type);  } @@ -423,6 +412,7 @@ void nfs_sb_active(struct super_block *sb)  	if (atomic_inc_return(&server->active) == 1)  		atomic_inc(&sb->s_active);  } +EXPORT_SYMBOL_GPL(nfs_sb_active);  void nfs_sb_deactive(struct super_block *sb)  { @@ -431,11 +421,12 @@ void nfs_sb_deactive(struct super_block *sb)  	if (atomic_dec_and_test(&server->active))  		deactivate_super(sb);  } +EXPORT_SYMBOL_GPL(nfs_sb_deactive);  /*   * Deliver file system statistics to userspace   */ -static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) +int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)  {  	struct nfs_server *server = NFS_SB(dentry->d_sb);  	unsigned char blockbits; @@ -496,6 +487,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs 
*buf)  	dprintk("%s: statfs error = %d\n", __func__, -error);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_statfs);  /*   * Map the security flavour number to a name @@ -505,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)  	static const struct {  		rpc_authflavor_t flavour;  		const char *str; -	} sec_flavours[] = { +	} sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = { +		/* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */  		{ RPC_AUTH_NULL, "null" },  		{ RPC_AUTH_UNIX, "sys" },  		{ RPC_AUTH_GSS_KRB5, "krb5" }, @@ -593,20 +586,21 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,  	if (nfss->mountd_version || showdefaults)  		seq_printf(m, ",mountvers=%u", nfss->mountd_version); -	if (nfss->mountd_port || showdefaults) +	if ((nfss->mountd_port && +		nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) || +		showdefaults)  		seq_printf(m, ",mountport=%u", nfss->mountd_port);  	nfs_show_mountd_netid(m, nfss, showdefaults);  } -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4)  static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,  				    int showdefaults)  {  	struct nfs_client *clp = nfss->nfs_client;  	seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); -	seq_printf(m, ",minorversion=%u", clp->cl_minorversion);  }  #else  static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, @@ -615,6 +609,15 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,  }  #endif +static void nfs_show_nfs_version(struct seq_file *m, +		unsigned int version, +		unsigned int minorversion) +{ +	seq_printf(m, ",vers=%u", version); +	if (version == 4) +		seq_printf(m, ".%u", minorversion); +} +  /*   * Describe the mount options in force on this server representation   */ @@ -642,7 +645,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,  	u32 version = clp->rpc_ops->version;  	int local_flock, local_fcntl; -	
seq_printf(m, ",vers=%u", version); +	nfs_show_nfs_version(m, version, clp->cl_minorversion);  	seq_printf(m, ",rsize=%u", nfss->rsize);  	seq_printf(m, ",wsize=%u", nfss->wsize);  	if (nfss->bsize != 0) @@ -662,8 +665,10 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,  		else  			seq_puts(m, nfs_infop->nostr);  	} +	rcu_read_lock();  	seq_printf(m, ",proto=%s",  		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); +	rcu_read_unlock();  	if (version == 4) {  		if (nfss->port != NFS_PORT)  			seq_printf(m, ",port=%u", nfss->port); @@ -683,6 +688,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,  	if (nfss->options & NFS_OPTION_FSCACHE)  		seq_printf(m, ",fsc"); +	if (nfss->options & NFS_OPTION_MIGRATION) +		seq_printf(m, ",migration"); +  	if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {  		if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)  			seq_printf(m, ",lookupcache=none"); @@ -706,26 +714,96 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,  /*   * Describe the mount options on this VFS mountpoint   */ -static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +int nfs_show_options(struct seq_file *m, struct dentry *root)  { -	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); +	struct nfs_server *nfss = NFS_SB(root->d_sb);  	nfs_show_mount_options(m, nfss, 0); +	rcu_read_lock();  	seq_printf(m, ",addr=%s",  			rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,  							RPC_DISPLAY_ADDR)); +	rcu_read_unlock();  	return 0;  } +EXPORT_SYMBOL_GPL(nfs_show_options); + +#if IS_ENABLED(CONFIG_NFS_V4) +#ifdef CONFIG_NFS_V4_1 +static void show_sessions(struct seq_file *m, struct nfs_server *server) +{ +	if (nfs4_has_session(server->nfs_client)) +		seq_printf(m, ",sessions"); +} +#else +static void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif +#endif + +#ifdef CONFIG_NFS_V4_1 +static void show_pnfs(struct seq_file *m, struct 
nfs_server *server) +{ +	seq_printf(m, ",pnfs="); +	if (server->pnfs_curr_ld) +		seq_printf(m, "%s", server->pnfs_curr_ld->name); +	else +		seq_printf(m, "not configured"); +} + +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ +	if (nfss->nfs_client && nfss->nfs_client->cl_implid) { +		struct nfs41_impl_id *impl_id = nfss->nfs_client->cl_implid; +		seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s'," +			   "date='%llu,%u'", +			   impl_id->name, impl_id->domain, +			   impl_id->date.seconds, impl_id->date.nseconds); +	} +} +#else +#if IS_ENABLED(CONFIG_NFS_V4) +static void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ +} +#endif +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ +} +#endif + +int nfs_show_devname(struct seq_file *m, struct dentry *root) +{ +	char *page = (char *) __get_free_page(GFP_KERNEL); +	char *devname, *dummy; +	int err = 0; +	if (!page) +		return -ENOMEM; +	devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0); +	if (IS_ERR(devname)) +		err = PTR_ERR(devname); +	else +		seq_escape(m, devname, " \t\n\\"); +	free_page((unsigned long)page); +	return err; +} +EXPORT_SYMBOL_GPL(nfs_show_devname); + +int nfs_show_path(struct seq_file *m, struct dentry *dentry) +{ +	seq_puts(m, "/"); +	return 0; +} +EXPORT_SYMBOL_GPL(nfs_show_path);  /*   * Present statistical information for this VFS mountpoint   */ -static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +int nfs_show_stats(struct seq_file *m, struct dentry *root)  {  	int i, cpu; -	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); +	struct nfs_server *nfss = NFS_SB(root->d_sb);  	struct rpc_auth *auth = nfss->client->cl_auth;  	struct nfs_iostats totals = { }; @@ -735,14 +813,16 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)  	 * Display all mount option settings  	 */  	seq_printf(m, "\n\topts:\t"); -	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? 
"ro" : "rw"); -	seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); -	seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); -	seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); +	seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw"); +	seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); +	seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); +	seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");  	nfs_show_mount_options(m, nfss, 1);  	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); +	show_implementation_id(m, nfss); +  	seq_printf(m, "\n\tcaps:\t");  	seq_printf(m, "caps=0x%x", nfss->caps);  	seq_printf(m, ",wtmult=%u", nfss->wtmult); @@ -750,12 +830,15 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)  	seq_printf(m, ",bsize=%u", nfss->bsize);  	seq_printf(m, ",namlen=%u", nfss->namelen); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4)  	if (nfss->nfs_client->rpc_ops->version == 4) {  		seq_printf(m, "\n\tnfsv4:\t");  		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);  		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); +		seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);  		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); +		show_sessions(m, nfss); +		show_pnfs(m, nfss);  	}  #endif @@ -806,12 +889,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)  	return 0;  } +EXPORT_SYMBOL_GPL(nfs_show_stats);  /*   * Begin unmount by attempting to remove all automounted mountpoints we added   * in response to xdev traversals and referrals   */ -static void nfs_umount_begin(struct super_block *sb) +void nfs_umount_begin(struct super_block *sb)  {  	struct nfs_server *server;  	struct rpc_clnt *rpc; @@ -825,8 +909,9 @@ static void nfs_umount_begin(struct super_block *sb)  	if (!IS_ERR(rpc))  		rpc_killall_tasks(rpc);  } +EXPORT_SYMBOL_GPL(nfs_umount_begin); -static struct nfs_parsed_mount_data 
*nfs_alloc_parsed_mount_data(unsigned int version) +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)  {  	struct nfs_parsed_mount_data *data; @@ -839,14 +924,28 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve  		data->mount_server.port	= NFS_UNSPEC_PORT;  		data->nfs_server.port	= NFS_UNSPEC_PORT;  		data->nfs_server.protocol = XPRT_TRANSPORT_TCP; -		data->auth_flavors[0]	= RPC_AUTH_UNIX; -		data->auth_flavor_len	= 1; -		data->version		= version; +		data->selected_flavor	= RPC_AUTH_MAXFLAVOR;  		data->minorversion	= 0; +		data->need_mount	= true; +		data->net		= current->nsproxy->net_ns; +		security_init_mnt_opts(&data->lsm_opts);  	}  	return data;  } +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ +	if (data) { +		kfree(data->client_address); +		kfree(data->mount_server.hostname); +		kfree(data->nfs_server.export_path); +		kfree(data->nfs_server.hostname); +		kfree(data->fscache_uniq); +		security_free_mnt_opts(&data->lsm_opts); +		kfree(data); +	} +} +  /*   * Sanity-check a server address provided by the mount command.   * @@ -921,57 +1020,170 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)  }  /* + * Add 'flavor' to 'auth_info' if not already present. 
+ * Returns true if 'flavor' ends up in the list, false otherwise + */ +static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, +			      rpc_authflavor_t flavor) +{ +	unsigned int i; +	unsigned int max_flavor_len = (sizeof(auth_info->flavors) / +				       sizeof(auth_info->flavors[0])); + +	/* make sure this flavor isn't already in the list */ +	for (i = 0; i < auth_info->flavor_len; i++) { +		if (flavor == auth_info->flavors[i]) +			return true; +	} + +	if (auth_info->flavor_len + 1 >= max_flavor_len) { +		dfprintk(MOUNT, "NFS: too many sec= flavors\n"); +		return false; +	} + +	auth_info->flavors[auth_info->flavor_len++] = flavor; +	return true; +} + +/* + * Return true if 'match' is in auth_info or auth_info is empty. + * Return false otherwise. + */ +bool nfs_auth_info_match(const struct nfs_auth_info *auth_info, +			 rpc_authflavor_t match) +{ +	int i; + +	if (!auth_info->flavor_len) +		return true; + +	for (i = 0; i < auth_info->flavor_len; i++) { +		if (auth_info->flavors[i] == match) +			return true; +	} +	return false; +} +EXPORT_SYMBOL_GPL(nfs_auth_info_match); + +/*   * Parse the value of the 'sec=' option.   
*/  static int nfs_parse_security_flavors(char *value,  				      struct nfs_parsed_mount_data *mnt)  {  	substring_t args[MAX_OPT_ARGS]; +	rpc_authflavor_t pseudoflavor; +	char *p;  	dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); -	switch (match_token(value, nfs_secflavor_tokens, args)) { -	case Opt_sec_none: -		mnt->auth_flavors[0] = RPC_AUTH_NULL; -		break; -	case Opt_sec_sys: -		mnt->auth_flavors[0] = RPC_AUTH_UNIX; -		break; -	case Opt_sec_krb5: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; -		break; -	case Opt_sec_krb5i: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; -		break; -	case Opt_sec_krb5p: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; -		break; -	case Opt_sec_lkey: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; +	while ((p = strsep(&value, ":")) != NULL) { +		switch (match_token(p, nfs_secflavor_tokens, args)) { +		case Opt_sec_none: +			pseudoflavor = RPC_AUTH_NULL; +			break; +		case Opt_sec_sys: +			pseudoflavor = RPC_AUTH_UNIX; +			break; +		case Opt_sec_krb5: +			pseudoflavor = RPC_AUTH_GSS_KRB5; +			break; +		case Opt_sec_krb5i: +			pseudoflavor = RPC_AUTH_GSS_KRB5I; +			break; +		case Opt_sec_krb5p: +			pseudoflavor = RPC_AUTH_GSS_KRB5P; +			break; +		case Opt_sec_lkey: +			pseudoflavor = RPC_AUTH_GSS_LKEY; +			break; +		case Opt_sec_lkeyi: +			pseudoflavor = RPC_AUTH_GSS_LKEYI; +			break; +		case Opt_sec_lkeyp: +			pseudoflavor = RPC_AUTH_GSS_LKEYP; +			break; +		case Opt_sec_spkm: +			pseudoflavor = RPC_AUTH_GSS_SPKM; +			break; +		case Opt_sec_spkmi: +			pseudoflavor = RPC_AUTH_GSS_SPKMI; +			break; +		case Opt_sec_spkmp: +			pseudoflavor = RPC_AUTH_GSS_SPKMP; +			break; +		default: +			dfprintk(MOUNT, +				 "NFS: sec= option '%s' not recognized\n", p); +			return 0; +		} + +		if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor)) +			return 0; +	} + +	return 1; +} + +static int nfs_parse_version_string(char *string, +		struct nfs_parsed_mount_data *mnt, +		substring_t *args) +{ +	mnt->flags &= ~NFS_MOUNT_VER3; +	switch 
(match_token(string, nfs_vers_tokens, args)) { +	case Opt_vers_2: +		mnt->version = 2;  		break; -	case Opt_sec_lkeyi: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; +	case Opt_vers_3: +		mnt->flags |= NFS_MOUNT_VER3; +		mnt->version = 3;  		break; -	case Opt_sec_lkeyp: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; +	case Opt_vers_4: +		/* Backward compatibility option. In future, +		 * the mount program should always supply +		 * a NFSv4 minor version number. +		 */ +		mnt->version = 4;  		break; -	case Opt_sec_spkm: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; +	case Opt_vers_4_0: +		mnt->version = 4; +		mnt->minorversion = 0;  		break; -	case Opt_sec_spkmi: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; +	case Opt_vers_4_1: +		mnt->version = 4; +		mnt->minorversion = 1;  		break; -	case Opt_sec_spkmp: -		mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; +	case Opt_vers_4_2: +		mnt->version = 4; +		mnt->minorversion = 2;  		break;  	default:  		return 0;  	} - -	mnt->auth_flavor_len = 1;  	return 1;  } +static int nfs_get_option_str(substring_t args[], char **option) +{ +	kfree(*option); +	*option = match_strdup(args); +	return !*option; +} + +static int nfs_get_option_ul(substring_t args[], unsigned long *option) +{ +	int rc; +	char *string; + +	string = match_strdup(args); +	if (string == NULL) +		return -ENOMEM; +	rc = kstrtoul(string, 10, option); +	kfree(string); + +	return rc; +} +  /*   * Error-check and convert a string of mount options from user space into   * a data structure.  
The whole mount string is processed; bad options are @@ -1056,20 +1268,6 @@ static int nfs_parse_mount_options(char *raw,  			mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |  				       NFS_MOUNT_LOCAL_FCNTL);  			break; -		case Opt_v2: -			mnt->flags &= ~NFS_MOUNT_VER3; -			mnt->version = 2; -			break; -		case Opt_v3: -			mnt->flags |= NFS_MOUNT_VER3; -			mnt->version = 3; -			break; -#ifdef CONFIG_NFS_V4 -		case Opt_v4: -			mnt->flags &= ~NFS_MOUNT_VER3; -			mnt->version = 4; -			break; -#endif  		case Opt_udp:  			mnt->flags &= ~NFS_MOUNT_TCP;  			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1117,187 +1315,93 @@ static int nfs_parse_mount_options(char *raw,  			kfree(mnt->fscache_uniq);  			mnt->fscache_uniq = NULL;  			break; +		case Opt_migration: +			mnt->options |= NFS_OPTION_MIGRATION; +			break; +		case Opt_nomigration: +			mnt->options &= NFS_OPTION_MIGRATION; +			break;  		/*  		 * options that take numeric values  		 */  		case Opt_port: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0 || option > USHRT_MAX) +			if (nfs_get_option_ul(args, &option) || +			    option > USHRT_MAX)  				goto out_invalid_value;  			mnt->nfs_server.port = option;  			break;  		case Opt_rsize: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->rsize = option;  			break;  		case Opt_wsize: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->wsize = option;  			break;  		case Opt_bsize: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, 
&option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->bsize = option;  			break;  		case Opt_timeo: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0 || option == 0) +			if (nfs_get_option_ul(args, &option) || option == 0)  				goto out_invalid_value;  			mnt->timeo = option;  			break;  		case Opt_retrans: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0 || option == 0) +			if (nfs_get_option_ul(args, &option) || option == 0)  				goto out_invalid_value;  			mnt->retrans = option;  			break;  		case Opt_acregmin: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->acregmin = option;  			break;  		case Opt_acregmax: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->acregmax = option;  			break;  		case Opt_acdirmin: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->acdirmin = option;  			break;  		case Opt_acdirmax: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->acdirmax = option;  			break;  		case Opt_actimeo: -			string = 
match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->acregmin = mnt->acregmax =  			mnt->acdirmin = mnt->acdirmax = option;  			break;  		case Opt_namelen: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			mnt->namlen = option;  			break;  		case Opt_mountport: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0 || option > USHRT_MAX) +			if (nfs_get_option_ul(args, &option) || +			    option > USHRT_MAX)  				goto out_invalid_value;  			mnt->mount_server.port = option;  			break;  		case Opt_mountvers: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0 || +			if (nfs_get_option_ul(args, &option) ||  			    option < NFS_MNT_VERSION ||  			    option > NFS_MNT3_VERSION)  				goto out_invalid_value;  			mnt->mount_server.version = option;  			break; -		case Opt_nfsvers: -			string = match_strdup(args); -			if (string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) -				goto out_invalid_value; -			switch (option) { -			case NFS2_VERSION: -				mnt->flags &= ~NFS_MOUNT_VER3; -				mnt->version = 2; -				break; -			case NFS3_VERSION: -				mnt->flags |= NFS_MOUNT_VER3; -				mnt->version = 3; -				break; -#ifdef CONFIG_NFS_V4 -			case NFS4_VERSION: -				mnt->flags &= ~NFS_MOUNT_VER3; -				mnt->version = 4; -				break; -#endif -			default: -				goto out_invalid_value; -			} -			break;  		case Opt_minorversion: -			string = match_strdup(args); -			if 
(string == NULL) -				goto out_nomem; -			rc = strict_strtoul(string, 10, &option); -			kfree(string); -			if (rc != 0) +			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value;  			if (option > NFS4_MAX_MINOR_VERSION)  				goto out_invalid_value; @@ -1307,6 +1411,15 @@ static int nfs_parse_mount_options(char *raw,  		/*  		 * options that take text values  		 */ +		case Opt_nfsvers: +			string = match_strdup(args); +			if (string == NULL) +				goto out_nomem; +			rc = nfs_parse_version_string(string, mnt, args); +			kfree(string); +			if (!rc) +				goto out_invalid_value; +			break;  		case Opt_sec:  			string = match_strdup(args);  			if (string == NULL) @@ -1333,21 +1446,18 @@ static int nfs_parse_mount_options(char *raw,  			case Opt_xprt_udp:  				mnt->flags &= ~NFS_MOUNT_TCP;  				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; -				kfree(string);  				break;  			case Opt_xprt_tcp6:  				protofamily = AF_INET6;  			case Opt_xprt_tcp:  				mnt->flags |= NFS_MOUNT_TCP;  				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; -				kfree(string);  				break;  			case Opt_xprt_rdma:  				/* vector side protocols to TCP */  				mnt->flags |= NFS_MOUNT_TCP;  				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;  				xprt_load_transport(string); -				kfree(string);  				break;  			default:  				dfprintk(MOUNT, "NFS:   unrecognized " @@ -1355,6 +1465,7 @@ static int nfs_parse_mount_options(char *raw,  				kfree(string);  				return 0;  			} +			kfree(string);  			break;  		case Opt_mountproto:  			string = match_strdup(args); @@ -1388,7 +1499,7 @@ static int nfs_parse_mount_options(char *raw,  			if (string == NULL)  				goto out_nomem;  			mnt->nfs_server.addrlen = -				rpc_pton(string, strlen(string), +				rpc_pton(mnt->net, string, strlen(string),  					(struct sockaddr *)  					&mnt->nfs_server.address,  					sizeof(mnt->nfs_server.address)); @@ -1397,25 +1508,20 @@ static int nfs_parse_mount_options(char *raw,  				goto out_invalid_address;  			break;  		
case Opt_clientaddr: -			string = match_strdup(args); -			if (string == NULL) +			if (nfs_get_option_str(args, &mnt->client_address))  				goto out_nomem; -			kfree(mnt->client_address); -			mnt->client_address = string;  			break;  		case Opt_mounthost: -			string = match_strdup(args); -			if (string == NULL) +			if (nfs_get_option_str(args, +					       &mnt->mount_server.hostname))  				goto out_nomem; -			kfree(mnt->mount_server.hostname); -			mnt->mount_server.hostname = string;  			break;  		case Opt_mountaddr:  			string = match_strdup(args);  			if (string == NULL)  				goto out_nomem;  			mnt->mount_server.addrlen = -				rpc_pton(string, strlen(string), +				rpc_pton(mnt->net, string, strlen(string),  					(struct sockaddr *)  					&mnt->mount_server.address,  					sizeof(mnt->mount_server.address)); @@ -1448,11 +1554,8 @@ static int nfs_parse_mount_options(char *raw,  			};  			break;  		case Opt_fscache_uniq: -			string = match_strdup(args); -			if (string == NULL) +			if (nfs_get_option_str(args, &mnt->fscache_uniq))  				goto out_nomem; -			kfree(mnt->fscache_uniq); -			mnt->fscache_uniq = string;  			mnt->options |= NFS_OPTION_FSCACHE;  			break;  		case Opt_local_lock: @@ -1507,9 +1610,16 @@ static int nfs_parse_mount_options(char *raw,  	if (!sloppy && invalid_option)  		return 0; +	if (mnt->minorversion && mnt->version != 4) +		goto out_minorversion_mismatch; + +	if (mnt->options & NFS_OPTION_MIGRATION && +	    (mnt->version != 4 || mnt->minorversion != 0)) +		goto out_migration_misuse; +  	/*  	 * verify that any proto=/mountproto= options match the address -	 * familiies in the addr=/mountaddr= options. +	 * families in the addr=/mountaddr= options.  	 
*/  	if (protofamily != AF_UNSPEC &&  	    protofamily != mnt->nfs_server.address.ss_family) @@ -1540,6 +1650,14 @@ out_invalid_address:  out_invalid_value:  	printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);  	return 0; +out_minorversion_mismatch: +	printk(KERN_INFO "NFS: mount option vers=%u does not support " +			 "minorversion=%u\n", mnt->version, mnt->minorversion); +	return 0; +out_migration_misuse: +	printk(KERN_INFO +		"NFS: 'migration' not supported for this NFS version\n"); +	return 0;  out_nomem:  	printk(KERN_INFO "NFS: not enough memory to parse option\n");  	return 0; @@ -1550,60 +1668,51 @@ out_security_failure:  }  /* - * Match the requested auth flavors with the list returned by - * the server.  Returns zero and sets the mount's authentication - * flavor on success; returns -EACCES if server does not support - * the requested flavor. + * Ensure that a specified authtype in args->auth_info is supported by + * the server. Returns 0 and sets args->selected_flavor if it's ok, and + * -EACCES if not.   */ -static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, -			     struct nfs_mount_request *request) +static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, +			rpc_authflavor_t *server_authlist, unsigned int count)  { -	unsigned int i, j, server_authlist_len = *(request->auth_flav_len); - -	/* -	 * Certain releases of Linux's mountd return an empty -	 * flavor list.  To prevent behavioral regression with -	 * these servers (ie. rejecting mounts that used to -	 * succeed), revert to pre-2.6.32 behavior (no checking) -	 * if the returned flavor list is empty. -	 */ -	if (server_authlist_len == 0) -		return 0; +	rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; +	unsigned int i;  	/* -	 * We avoid sophisticated negotiating here, as there are -	 * plenty of cases where we can get it wrong, providing -	 * either too little or too much security. 
+	 * If the sec= mount option is used, the specified flavor or AUTH_NULL +	 * must be in the list returned by the server.  	 * -	 * RFC 2623, section 2.7 suggests we SHOULD prefer the -	 * flavor listed first.  However, some servers list -	 * AUTH_NULL first.  Our caller plants AUTH_SYS, the -	 * preferred default, in args->auth_flavors[0] if user -	 * didn't specify sec= mount option. +	 * AUTH_NULL has a special meaning when it's in the server list - it +	 * means that the server will ignore the rpc creds, so any flavor +	 * can be used.  	 */ -	for (i = 0; i < args->auth_flavor_len; i++) -		for (j = 0; j < server_authlist_len; j++) -			if (args->auth_flavors[i] == request->auth_flavs[j]) { -				dfprintk(MOUNT, "NFS: using auth flavor %d\n", -					request->auth_flavs[j]); -				args->auth_flavors[0] = request->auth_flavs[j]; -				return 0; -			} +	for (i = 0; i < count; i++) { +		flavor = server_authlist[i]; + +		if (nfs_auth_info_match(&args->auth_info, flavor) || +		    flavor == RPC_AUTH_NULL) +			goto out; +	} -	dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); -	nfs_umount(request); +	dfprintk(MOUNT, +		 "NFS: specified auth flavors not supported by server\n");  	return -EACCES; + +out: +	args->selected_flavor = flavor; +	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); +	return 0;  }  /*   * Use the remote server's MOUNT service to request the NFS file handle   * corresponding to the provided path.   
*/ -static int nfs_try_mount(struct nfs_parsed_mount_data *args, -			 struct nfs_fh *root_fh) +static int nfs_request_mount(struct nfs_parsed_mount_data *args, +			     struct nfs_fh *root_fh, +			     rpc_authflavor_t *server_authlist, +			     unsigned int *server_authlist_len)  { -	rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; -	unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);  	struct nfs_mount_request request = {  		.sap		= (struct sockaddr *)  						&args->mount_server.address, @@ -1611,8 +1720,9 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,  		.protocol	= args->mount_server.protocol,  		.fh		= root_fh,  		.noresvport	= args->flags & NFS_MOUNT_NORESVPORT, -		.auth_flav_len	= &server_authlist_len, +		.auth_flav_len	= server_authlist_len,  		.auth_flavs	= server_authlist, +		.net		= args->net,  	};  	int status; @@ -1654,107 +1764,153 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,  		return status;  	} -	/* -	 * MNTv1 (NFSv2) does not support auth flavor negotiation. 
-	 */ -	if (args->mount_server.version != NFS_MNT3_VERSION) -		return 0; -	return nfs_walk_authlist(args, &request); +	return 0;  } -static int nfs_parse_simple_hostname(const char *dev_name, -				     char **hostname, size_t maxnamlen, -				     char **export_path, size_t maxpathlen) +static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info, +					struct nfs_subversion *nfs_mod)  { -	size_t len; -	char *colon, *comma; - -	colon = strchr(dev_name, ':'); -	if (colon == NULL) -		goto out_bad_devname; - -	len = colon - dev_name; -	if (len > maxnamlen) -		goto out_hostname; +	int status; +	unsigned int i; +	bool tried_auth_unix = false; +	bool auth_null_in_list = false; +	struct nfs_server *server = ERR_PTR(-EACCES); +	struct nfs_parsed_mount_data *args = mount_info->parsed; +	rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS]; +	unsigned int authlist_len = ARRAY_SIZE(authlist); + +	status = nfs_request_mount(args, mount_info->mntfh, authlist, +					&authlist_len); +	if (status) +		return ERR_PTR(status); -	/* N.B. caller will free nfs_server.hostname in all cases */ -	*hostname = kstrndup(dev_name, len, GFP_KERNEL); -	if (!*hostname) -		goto out_nomem; +	/* +	 * Was a sec= authflavor specified in the options? First, verify +	 * whether the server supports it, and then just try to use it if so. +	 */ +	if (args->auth_info.flavor_len > 0) { +		status = nfs_verify_authflavors(args, authlist, authlist_len); +		dfprintk(MOUNT, "NFS: using auth flavor %u\n", +			 args->selected_flavor); +		if (status) +			return ERR_PTR(status); +		return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); +	} -	/* kill possible hostname list: not supported */ -	comma = strchr(*hostname, ','); -	if (comma != NULL) { -		if (comma == *hostname) -			goto out_bad_devname; -		*comma = '\0'; +	/* +	 * No sec= option was provided. RFC 2623, section 2.7 suggests we +	 * SHOULD prefer the flavor listed first. However, some servers list +	 * AUTH_NULL first. 
Avoid ever choosing AUTH_NULL. +	 */ +	for (i = 0; i < authlist_len; ++i) { +		rpc_authflavor_t flavor; +		struct rpcsec_gss_info info; + +		flavor = authlist[i]; +		switch (flavor) { +		case RPC_AUTH_UNIX: +			tried_auth_unix = true; +			break; +		case RPC_AUTH_NULL: +			auth_null_in_list = true; +			continue; +		default: +			if (rpcauth_get_gssinfo(flavor, &info) != 0) +				continue; +			/* Fallthrough */ +		} +		dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); +		args->selected_flavor = flavor; +		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); +		if (!IS_ERR(server)) +			return server;  	} -	colon++; -	len = strlen(colon); -	if (len > maxpathlen) -		goto out_path; -	*export_path = kstrndup(colon, len, GFP_KERNEL); -	if (!*export_path) -		goto out_nomem; +	/* +	 * Nothing we tried so far worked. At this point, give up if we've +	 * already tried AUTH_UNIX or if the server's list doesn't contain +	 * AUTH_NULL +	 */ +	if (tried_auth_unix || !auth_null_in_list) +		return server; -	dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); -	return 0; +	/* Last chance! 
Try AUTH_UNIX */ +	dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); +	args->selected_flavor = RPC_AUTH_UNIX; +	return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); +} -out_bad_devname: -	dfprintk(MOUNT, "NFS: device name not in host:path format\n"); -	return -EINVAL; +struct dentry *nfs_try_mount(int flags, const char *dev_name, +			     struct nfs_mount_info *mount_info, +			     struct nfs_subversion *nfs_mod) +{ +	struct nfs_server *server; -out_nomem: -	dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); -	return -ENOMEM; +	if (mount_info->parsed->need_mount) +		server = nfs_try_mount_request(mount_info, nfs_mod); +	else +		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); -out_hostname: -	dfprintk(MOUNT, "NFS: server hostname too long\n"); -	return -ENAMETOOLONG; +	if (IS_ERR(server)) +		return ERR_CAST(server); -out_path: -	dfprintk(MOUNT, "NFS: export pathname too long\n"); -	return -ENAMETOOLONG; +	return nfs_fs_mount_common(server, flags, dev_name, mount_info, nfs_mod);  } +EXPORT_SYMBOL_GPL(nfs_try_mount);  /* - * Hostname has square brackets around it because it contains one or - * more colons.  We look for the first closing square bracket, and a - * colon must follow it. + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path.  If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error.   */ -static int nfs_parse_protected_hostname(const char *dev_name, -					char **hostname, size_t maxnamlen, -					char **export_path, size_t maxpathlen) +static int nfs_parse_devname(const char *dev_name, +			     char **hostname, size_t maxnamlen, +			     char **export_path, size_t maxpathlen)  {  	size_t len; -	char *start, *end; +	char *end; + +	/* Is the host name protected with square brakcets? 
*/ +	if (*dev_name == '[') { +		end = strchr(++dev_name, ']'); +		if (end == NULL || end[1] != ':') +			goto out_bad_devname; + +		len = end - dev_name; +		end++; +	} else { +		char *comma; -	start = (char *)(dev_name + 1); +		end = strchr(dev_name, ':'); +		if (end == NULL) +			goto out_bad_devname; +		len = end - dev_name; -	end = strchr(start, ']'); -	if (end == NULL) -		goto out_bad_devname; -	if (*(end + 1) != ':') -		goto out_bad_devname; +		/* kill possible hostname list: not supported */ +		comma = strchr(dev_name, ','); +		if (comma != NULL && comma < end) +			*comma = 0; +	} -	len = end - start;  	if (len > maxnamlen)  		goto out_hostname;  	/* N.B. caller will free nfs_server.hostname in all cases */ -	*hostname = kstrndup(start, len, GFP_KERNEL); +	*hostname = kstrndup(dev_name, len, GFP_KERNEL);  	if (*hostname == NULL)  		goto out_nomem; - -	end += 2; -	len = strlen(end); +	len = strlen(++end);  	if (len > maxpathlen)  		goto out_path;  	*export_path = kstrndup(end, len, GFP_KERNEL);  	if (!*export_path)  		goto out_nomem; +	dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);  	return 0;  out_bad_devname: @@ -1775,29 +1931,6 @@ out_path:  }  /* - * Split "dev_name" into "hostname:export_path". - * - * The leftmost colon demarks the split between the server's hostname - * and the export path.  If the hostname starts with a left square - * bracket, then it may contain colons. - * - * Note: caller frees hostname and export path, even on error. 
- */ -static int nfs_parse_devname(const char *dev_name, -			     char **hostname, size_t maxnamlen, -			     char **export_path, size_t maxpathlen) -{ -	if (*dev_name == '[') -		return nfs_parse_protected_hostname(dev_name, -						    hostname, maxnamlen, -						    export_path, maxpathlen); - -	return nfs_parse_simple_hostname(dev_name, -					 hostname, maxnamlen, -					 export_path, maxpathlen); -} - -/*   * Validate the NFS2/NFS3 mount data   * - fills in the mount root filehandle   * @@ -1813,17 +1946,19 @@ static int nfs_parse_devname(const char *dev_name,   * + breaking back: trying proto=udp after proto=tcp, v2 after v3,   *   mountproto=tcp after mountproto=udp, and so on   */ -static int nfs_validate_mount_data(void *options, -				   struct nfs_parsed_mount_data *args, -				   struct nfs_fh *mntfh, -				   const char *dev_name) +static int nfs23_validate_mount_data(void *options, +				     struct nfs_parsed_mount_data *args, +				     struct nfs_fh *mntfh, +				     const char *dev_name)  {  	struct nfs_mount_data *data = (struct nfs_mount_data *)options;  	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; +	int extra_flags = NFS_MOUNT_LEGACY_INTERFACE;  	if (data == NULL)  		goto out_no_data; +	args->version = NFS_DEFAULT_VERSION;  	switch (data->version) {  	case 1:  		data->namlen = 0; @@ -1834,6 +1969,8 @@ static int nfs_validate_mount_data(void *options,  			goto out_no_v3;  		data->root.size = NFS2_FHSIZE;  		memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); +		/* Turn off security negotiation */ +		extra_flags |= NFS_MOUNT_SECFLAVOUR;  	case 4:  		if (data->flags & NFS_MOUNT_SECFLAVOUR)  			goto out_no_sec; @@ -1861,7 +1998,7 @@ static int nfs_validate_mount_data(void *options,  		 * can deal with.  		 
*/  		args->flags		= data->flags & NFS_MOUNT_FLAGMASK; -		args->flags		|= NFS_MOUNT_LEGACY_INTERFACE; +		args->flags		|= extra_flags;  		args->rsize		= data->rsize;  		args->wsize		= data->wsize;  		args->timeo		= data->timeo; @@ -1870,9 +2007,11 @@ static int nfs_validate_mount_data(void *options,  		args->acregmax		= data->acregmax;  		args->acdirmin		= data->acdirmin;  		args->acdirmax		= data->acdirmax; +		args->need_mount	= false;  		memcpy(sap, &data->addr, sizeof(data->addr));  		args->nfs_server.addrlen = sizeof(data->addr); +		args->nfs_server.port = ntohs(data->addr.sin_port);  		if (!nfs_verify_server_address(sap))  			goto out_no_address; @@ -1884,7 +2023,9 @@ static int nfs_validate_mount_data(void *options,  		args->bsize		= data->bsize;  		if (data->flags & NFS_MOUNT_SECFLAVOUR) -			args->auth_flavors[0] = data->pseudoflavor; +			args->selected_flavor = data->pseudoflavor; +		else +			args->selected_flavor = RPC_AUTH_UNIX;  		if (!args->nfs_server.hostname)  			goto out_nomem; @@ -1921,46 +2062,11 @@ static int nfs_validate_mount_data(void *options,  		}  		break; -	default: { -		int status; - -		if (nfs_parse_mount_options((char *)options, args) == 0) -			return -EINVAL; - -		if (!nfs_verify_server_address(sap)) -			goto out_no_address; - -		if (args->version == 4) -#ifdef CONFIG_NFS_V4 -			return nfs4_validate_text_mount_data(options, -							     args, dev_name); -#else -			goto out_v4_not_compiled; -#endif - -		nfs_set_port(sap, &args->nfs_server.port, 0); - -		nfs_set_mount_transport_protocol(args); - -		status = nfs_parse_devname(dev_name, -					   &args->nfs_server.hostname, -					   PAGE_SIZE, -					   &args->nfs_server.export_path, -					   NFS_MAXPATHLEN); -		if (!status) -			status = nfs_try_mount(args, mntfh); - -		kfree(args->nfs_server.export_path); -		args->nfs_server.export_path = NULL; - -		if (status) -			return status; - -		break; -		} +	default: +		return NFS_TEXT_DATA;  	} -#ifndef CONFIG_NFS_V3 +#if !IS_ENABLED(CONFIG_NFS_V3)  	
if (args->version == 3)  		goto out_v3_not_compiled;  #endif /* !CONFIG_NFS_V3 */ @@ -1980,18 +2086,12 @@ out_no_sec:  	dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");  	return -EINVAL; -#ifndef CONFIG_NFS_V3 +#if !IS_ENABLED(CONFIG_NFS_V3)  out_v3_not_compiled:  	dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");  	return -EPROTONOSUPPORT;  #endif /* !CONFIG_NFS_V3 */ -#ifndef CONFIG_NFS_V4 -out_v4_not_compiled: -	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); -	return -EPROTONOSUPPORT; -#endif /* !CONFIG_NFS_V4 */ -  out_nomem:  	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");  	return -ENOMEM; @@ -2005,15 +2105,104 @@ out_invalid_fh:  	return -EINVAL;  } +#if IS_ENABLED(CONFIG_NFS_V4) +static int nfs_validate_mount_data(struct file_system_type *fs_type, +				   void *options, +				   struct nfs_parsed_mount_data *args, +				   struct nfs_fh *mntfh, +				   const char *dev_name) +{ +	if (fs_type == &nfs_fs_type) +		return nfs23_validate_mount_data(options, args, mntfh, dev_name); +	return nfs4_validate_mount_data(options, args, dev_name); +} +#else +static int nfs_validate_mount_data(struct file_system_type *fs_type, +				   void *options, +				   struct nfs_parsed_mount_data *args, +				   struct nfs_fh *mntfh, +				   const char *dev_name) +{ +	return nfs23_validate_mount_data(options, args, mntfh, dev_name); +} +#endif + +static int nfs_validate_text_mount_data(void *options, +					struct nfs_parsed_mount_data *args, +					const char *dev_name) +{ +	int port = 0; +	int max_namelen = PAGE_SIZE; +	int max_pathlen = NFS_MAXPATHLEN; +	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + +	if (nfs_parse_mount_options((char *)options, args) == 0) +		return -EINVAL; + +	if (!nfs_verify_server_address(sap)) +		goto out_no_address; + +	if (args->version == 4) { +#if IS_ENABLED(CONFIG_NFS_V4) +		port = NFS_PORT; +		max_namelen = NFS4_MAXNAMLEN; +		max_pathlen = NFS4_MAXPATHLEN; +		
nfs_validate_transport_protocol(args); +		if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) +			goto out_invalid_transport_udp; +		nfs4_validate_mount_flags(args); +#else +		goto out_v4_not_compiled; +#endif /* CONFIG_NFS_V4 */ +	} else +		nfs_set_mount_transport_protocol(args); + +	nfs_set_port(sap, &args->nfs_server.port, port); + +	return nfs_parse_devname(dev_name, +				   &args->nfs_server.hostname, +				   max_namelen, +				   &args->nfs_server.export_path, +				   max_pathlen); + +#if !IS_ENABLED(CONFIG_NFS_V4) +out_v4_not_compiled: +	dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); +	return -EPROTONOSUPPORT; +#else +out_invalid_transport_udp: +	dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); +	return -EINVAL; +#endif /* !CONFIG_NFS_V4 */ + +out_no_address: +	dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); +	return -EINVAL; +} + +#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ +		| NFS_MOUNT_SECURE \ +		| NFS_MOUNT_TCP \ +		| NFS_MOUNT_VER3 \ +		| NFS_MOUNT_KERBEROS \ +		| NFS_MOUNT_NONLM \ +		| NFS_MOUNT_BROKEN_SUID \ +		| NFS_MOUNT_STRICTLOCK \ +		| NFS_MOUNT_UNSHARED \ +		| NFS_MOUNT_NORESVPORT \ +		| NFS_MOUNT_LEGACY_INTERFACE) +  static int  nfs_compare_remount_data(struct nfs_server *nfss,  			 struct nfs_parsed_mount_data *data)  { -	if (data->flags != nfss->flags || +	if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||  	    data->rsize != nfss->rsize ||  	    data->wsize != nfss->wsize || +	    data->version != nfss->nfs_client->rpc_ops->version || +	    data->minorversion != nfss->nfs_client->cl_minorversion ||  	    data->retrans != nfss->client->cl_timeout->to_retries || -	    data->auth_flavors[0] != nfss->client->cl_auth->au_flavor || +	    data->selected_flavor != nfss->client->cl_auth->au_flavor ||  	    data->acregmin != nfss->acregmin / HZ ||  	    data->acregmax != nfss->acregmax / HZ ||  	    data->acdirmin != nfss->acdirmin / HZ || @@ -2028,7 +2217,7 @@ 
nfs_compare_remount_data(struct nfs_server *nfss,  	return 0;  } -static int +int  nfs_remount(struct super_block *sb, int *flags, char *raw_data)  {  	int error; @@ -2038,6 +2227,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)  	struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;  	u32 nfsvers = nfss->nfs_client->rpc_ops->version; +	sync_filesystem(sb); +  	/*  	 * Userspace mount programs that send binary options generally send  	 * them populated with default values. We have no way to know which @@ -2058,7 +2249,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)  	data->rsize = nfss->rsize;  	data->wsize = nfss->wsize;  	data->retrans = nfss->client->cl_timeout->to_retries; -	data->auth_flavors[0] = nfss->client->cl_auth->au_flavor; +	data->selected_flavor = nfss->client->cl_auth->au_flavor; +	data->auth_info = nfss->auth_info;  	data->acregmin = nfss->acregmin / HZ;  	data->acregmax = nfss->acregmax / HZ;  	data->acdirmin = nfss->acdirmin / HZ; @@ -2066,25 +2258,38 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)  	data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;  	data->nfs_server.port = nfss->port;  	data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; +	data->version = nfsvers; +	data->minorversion = nfss->nfs_client->cl_minorversion; +	data->net = current->nsproxy->net_ns;  	memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,  		data->nfs_server.addrlen);  	/* overwrite those values with any that were specified */ -	error = nfs_parse_mount_options((char *)options, data); -	if (error < 0) +	error = -EINVAL; +	if (!nfs_parse_mount_options((char *)options, data))  		goto out; +	/* +	 * noac is a special case. It implies -o sync, but that's not +	 * necessarily reflected in the mtab options. do_remount_sb +	 * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the +	 * remount options, so we have to explicitly reset it. 
+	 */ +	if (data->flags & NFS_MOUNT_NOAC) +		*flags |= MS_SYNCHRONOUS; +  	/* compare new mount options with old ones */  	error = nfs_compare_remount_data(nfss, data);  out:  	kfree(data);  	return error;  } +EXPORT_SYMBOL_GPL(nfs_remount);  /*   * Initialise the common bits of the superblock   */ -static inline void nfs_initialise_sb(struct super_block *sb) +inline void nfs_initialise_sb(struct super_block *sb)  {  	struct nfs_server *server = NFS_SB(sb); @@ -2092,15 +2297,12 @@ static inline void nfs_initialise_sb(struct super_block *sb)  	/* We probably want something more informative here */  	snprintf(sb->s_id, sizeof(sb->s_id), -		 "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); +		 "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));  	if (sb->s_blocksize == 0)  		sb->s_blocksize = nfs_block_bits(server->wsize,  						 &sb->s_blocksize_bits); -	if (server->flags & NFS_MOUNT_NOAC) -		sb->s_flags |= MS_SYNCHRONOUS; -  	sb->s_bdi = &server->backing_dev_info;  	nfs_super_set_maxbytes(sb, server->maxfilesize); @@ -2109,17 +2311,19 @@ static inline void nfs_initialise_sb(struct super_block *sb)  /*   * Finish setting up an NFS2/3 superblock   */ -static void nfs_fill_super(struct super_block *sb, -			   struct nfs_parsed_mount_data *data) +void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)  { +	struct nfs_parsed_mount_data *data = mount_info->parsed;  	struct nfs_server *server = NFS_SB(sb);  	sb->s_blocksize_bits = 0;  	sb->s_blocksize = 0; -	if (data->bsize) +	sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr; +	sb->s_op = server->nfs_client->cl_nfs_mod->sops; +	if (data && data->bsize)  		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); -	if (server->nfs_client->rpc_ops->version == 3) { +	if (server->nfs_client->rpc_ops->version != 2) {  		/* The VFS shouldn't apply the umask to mode bits. We will do  		 * so ourselves when necessary.  		 
*/ @@ -2127,31 +2331,32 @@ static void nfs_fill_super(struct super_block *sb,  		sb->s_time_gran = 1;  	} -	sb->s_op = &nfs_sops;   	nfs_initialise_sb(sb);  } +EXPORT_SYMBOL_GPL(nfs_fill_super);  /* - * Finish setting up a cloned NFS2/3 superblock + * Finish setting up a cloned NFS2/3/4 superblock   */ -static void nfs_clone_super(struct super_block *sb, -			    const struct super_block *old_sb) +void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)  { +	const struct super_block *old_sb = mount_info->cloned->sb;  	struct nfs_server *server = NFS_SB(sb);  	sb->s_blocksize_bits = old_sb->s_blocksize_bits;  	sb->s_blocksize = old_sb->s_blocksize;  	sb->s_maxbytes = old_sb->s_maxbytes; +	sb->s_xattr = old_sb->s_xattr; +	sb->s_op = old_sb->s_op; +	sb->s_time_gran = 1; -	if (server->nfs_client->rpc_ops->version == 3) { +	if (server->nfs_client->rpc_ops->version != 2) {  		/* The VFS shouldn't apply the umask to mode bits. We will do  		 * so ourselves when necessary.  		 
*/  		sb->s_flags |= MS_POSIXACL; -		sb->s_time_gran = 1;  	} -	sb->s_op = old_sb->s_op;   	nfs_initialise_sb(sb);  } @@ -2165,7 +2370,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n  		goto Ebusy;  	if (a->nfs_client != b->nfs_client)  		goto Ebusy; -	if (a->flags != b->flags) +	if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK)  		goto Ebusy;  	if (a->wsize != b->wsize)  		goto Ebusy; @@ -2179,7 +2384,8 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n  		goto Ebusy;  	if (a->acdirmax != b->acdirmax)  		goto Ebusy; -	if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) +	if (b->auth_info.flavor_len > 0 && +	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)  		goto Ebusy;  	return 1;  Ebusy: @@ -2199,6 +2405,7 @@ static int nfs_set_super(struct super_block *s, void *data)  	s->s_flags = sb_mntdata->mntflags;  	s->s_fs_info = server; +	s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;  	ret = set_anon_super(s, server);  	if (ret == 0)  		server->s_dev = s->s_dev; @@ -2258,61 +2465,111 @@ static int nfs_compare_super(struct super_block *sb, void *data)  	return nfs_compare_mount_options(sb, server, mntflags);  } +#ifdef CONFIG_NFS_FSCACHE +static void nfs_get_cache_cookie(struct super_block *sb, +				 struct nfs_parsed_mount_data *parsed, +				 struct nfs_clone_mount *cloned) +{ +	struct nfs_server *nfss = NFS_SB(sb); +	char *uniq = NULL; +	int ulen = 0; + +	nfss->fscache_key = NULL; +	nfss->fscache = NULL; + +	if (parsed) { +		if (!(parsed->options & NFS_OPTION_FSCACHE)) +			return; +		if (parsed->fscache_uniq) { +			uniq = parsed->fscache_uniq; +			ulen = strlen(parsed->fscache_uniq); +		} +	} else if (cloned) { +		struct nfs_server *mnt_s = NFS_SB(cloned->sb); +		if (!(mnt_s->options & NFS_OPTION_FSCACHE)) +			return; +		if (mnt_s->fscache_key) { +			uniq = mnt_s->fscache_key->key.uniquifier; +			ulen = mnt_s->fscache_key->key.uniq_len; +		}; +	} else +		return; + +	
nfs_fscache_get_super_cookie(sb, uniq, ulen); +} +#else +static void nfs_get_cache_cookie(struct super_block *sb, +				 struct nfs_parsed_mount_data *parsed, +				 struct nfs_clone_mount *cloned) +{ +} +#endif +  static int nfs_bdi_register(struct nfs_server *server)  {  	return bdi_register_dev(&server->backing_dev_info, server->s_dev);  } -static int nfs_get_sb(struct file_system_type *fs_type, -	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, +			struct nfs_mount_info *mount_info) +{ +	int error; +	unsigned long kflags = 0, kflags_out = 0; +	if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) +		kflags |= SECURITY_LSM_NATIVE_LABELS; + +	error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts, +						kflags, &kflags_out); +	if (error) +		goto err; + +	if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && +		!(kflags_out & SECURITY_LSM_NATIVE_LABELS)) +		NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; +err: +	return error; +} +EXPORT_SYMBOL_GPL(nfs_set_sb_security); + +int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, +			  struct nfs_mount_info *mount_info) +{ +	/* clone any lsm security options from the parent to the new sb */ +	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) +		return -ESTALE; +	return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); +} +EXPORT_SYMBOL_GPL(nfs_clone_sb_security); + +struct dentry *nfs_fs_mount_common(struct nfs_server *server, +				   int flags, const char *dev_name, +				   struct nfs_mount_info *mount_info, +				   struct nfs_subversion *nfs_mod)  { -	struct nfs_server *server = NULL;  	struct super_block *s; -	struct nfs_parsed_mount_data *data; -	struct nfs_fh *mntfh; -	struct dentry *mntroot; +	struct dentry *mntroot = ERR_PTR(-ENOMEM);  	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;  	struct nfs_sb_mountdata sb_mntdata = {  		.mntflags = flags, +		
.server = server,  	}; -	int error = -ENOMEM; - -	data = nfs_alloc_parsed_mount_data(3); -	mntfh = nfs_alloc_fhandle(); -	if (data == NULL || mntfh == NULL) -		goto out_free_fh; - -	security_init_mnt_opts(&data->lsm_opts); - -	/* Validate the mount data */ -	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); -	if (error < 0) -		goto out; - -#ifdef CONFIG_NFS_V4 -	if (data->version == 4) { -		error = nfs4_try_mount(flags, dev_name, data, mnt); -		kfree(data->client_address); -		kfree(data->nfs_server.export_path); -		goto out; -	} -#endif	/* CONFIG_NFS_V4 */ - -	/* Get a volume representation */ -	server = nfs_create_server(data, mntfh); -	if (IS_ERR(server)) { -		error = PTR_ERR(server); -		goto out; -	} -	sb_mntdata.server = server; +	int error;  	if (server->flags & NFS_MOUNT_UNSHARED)  		compare_super = NULL; +	/* -o noac implies -o sync */ +	if (server->flags & NFS_MOUNT_NOAC) +		sb_mntdata.mntflags |= MS_SYNCHRONOUS; + +	if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) +		if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) +			sb_mntdata.mntflags |= MS_SYNCHRONOUS; +  	/* Get a superblock - note that we may end up sharing one that already exists */ -	s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); +	s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata);  	if (IS_ERR(s)) { -		error = PTR_ERR(s); +		mntroot = ERR_CAST(s);  		goto out_err_nosb;  	} @@ -2321,41 +2578,31 @@ static int nfs_get_sb(struct file_system_type *fs_type,  		server = NULL;  	} else {  		error = nfs_bdi_register(server); -		if (error) +		if (error) { +			mntroot = ERR_PTR(error);  			goto error_splat_bdi; +		} +		server->super = s;  	}  	if (!s->s_root) {  		/* initial superblock/root creation */ -		nfs_fill_super(s, data); -		nfs_fscache_get_super_cookie( -			s, data ? 
data->fscache_uniq : NULL, NULL); +		mount_info->fill_super(s, mount_info); +		nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);  	} -	mntroot = nfs_get_root(s, mntfh); -	if (IS_ERR(mntroot)) { -		error = PTR_ERR(mntroot); +	mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); +	if (IS_ERR(mntroot))  		goto error_splat_super; -	} -	error = security_sb_set_mnt_opts(s, &data->lsm_opts); +	error = mount_info->set_security(s, mntroot, mount_info);  	if (error)  		goto error_splat_root;  	s->s_flags |= MS_ACTIVE; -	mnt->mnt_sb = s; -	mnt->mnt_root = mntroot; -	error = 0;  out: -	kfree(data->nfs_server.hostname); -	kfree(data->mount_server.hostname); -	kfree(data->fscache_uniq); -	security_free_mnt_opts(&data->lsm_opts); -out_free_fh: -	nfs_free_fhandle(mntfh); -	kfree(data); -	return error; +	return mntroot;  out_err_nosb:  	nfs_free_server(server); @@ -2363,6 +2610,7 @@ out_err_nosb:  error_splat_root:  	dput(mntroot); +	mntroot = ERR_PTR(error);  error_splat_super:  	if (server && !s->s_root)  		bdi_unregister(&server->backing_dev_info); @@ -2370,22 +2618,65 @@ error_splat_bdi:  	deactivate_locked_super(s);  	goto out;  } +EXPORT_SYMBOL_GPL(nfs_fs_mount_common); + +struct dentry *nfs_fs_mount(struct file_system_type *fs_type, +	int flags, const char *dev_name, void *raw_data) +{ +	struct nfs_mount_info mount_info = { +		.fill_super = nfs_fill_super, +		.set_security = nfs_set_sb_security, +	}; +	struct dentry *mntroot = ERR_PTR(-ENOMEM); +	struct nfs_subversion *nfs_mod; +	int error; + +	mount_info.parsed = nfs_alloc_parsed_mount_data(); +	mount_info.mntfh = nfs_alloc_fhandle(); +	if (mount_info.parsed == NULL || mount_info.mntfh == NULL) +		goto out; + +	/* Validate the mount data */ +	error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name); +	if (error == NFS_TEXT_DATA) +		error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name); +	if (error < 0) { +		mntroot = ERR_PTR(error); +		goto 
out; +	} + +	nfs_mod = get_nfs_version(mount_info.parsed->version); +	if (IS_ERR(nfs_mod)) { +		mntroot = ERR_CAST(nfs_mod); +		goto out; +	} + +	mntroot = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod); + +	put_nfs_version(nfs_mod); +out: +	nfs_free_parsed_mount_data(mount_info.parsed); +	nfs_free_fhandle(mount_info.mntfh); +	return mntroot; +} +EXPORT_SYMBOL_GPL(nfs_fs_mount);  /*   * Ensure that we unregister the bdi before kill_anon_super   * releases the device name   */ -static void nfs_put_super(struct super_block *s) +void nfs_put_super(struct super_block *s)  {  	struct nfs_server *server = NFS_SB(s);  	bdi_unregister(&server->backing_dev_info);  } +EXPORT_SYMBOL_GPL(nfs_put_super);  /*   * Destroy an NFS2/3 superblock   */ -static void nfs_kill_super(struct super_block *s) +void nfs_kill_super(struct super_block *s)  {  	struct nfs_server *server = NFS_SB(s); @@ -2393,118 +2684,44 @@ static void nfs_kill_super(struct super_block *s)  	nfs_fscache_release_super_cookie(s);  	nfs_free_server(server);  } +EXPORT_SYMBOL_GPL(nfs_kill_super);  /* - * Clone an NFS2/3 server record on xdev traversal (FSID-change) + * Clone an NFS2/3/4 server record on xdev traversal (FSID-change)   */  static struct dentry *  nfs_xdev_mount(struct file_system_type *fs_type, int flags,  		const char *dev_name, void *raw_data)  {  	struct nfs_clone_mount *data = raw_data; -	struct super_block *s; -	struct nfs_server *server; -	struct dentry *mntroot; -	int (*compare_super)(struct super_block *, void *) = nfs_compare_super; -	struct nfs_sb_mountdata sb_mntdata = { -		.mntflags = flags, +	struct nfs_mount_info mount_info = { +		.fill_super = nfs_clone_super, +		.set_security = nfs_clone_sb_security, +		.cloned = data,  	}; -	int error; +	struct nfs_server *server; +	struct dentry *mntroot = ERR_PTR(-ENOMEM); +	struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod;  	dprintk("--> nfs_xdev_mount()\n"); -	/* create a new volume representation */ -	
server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); -	if (IS_ERR(server)) { -		error = PTR_ERR(server); -		goto out_err_noserver; -	} -	sb_mntdata.server = server; +	mount_info.mntfh = mount_info.cloned->fh; -	if (server->flags & NFS_MOUNT_UNSHARED) -		compare_super = NULL; - -	/* Get a superblock - note that we may end up sharing one that already exists */ -	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); -	if (IS_ERR(s)) { -		error = PTR_ERR(s); -		goto out_err_nosb; -	} - -	if (s->s_fs_info != server) { -		nfs_free_server(server); -		server = NULL; -	} else { -		error = nfs_bdi_register(server); -		if (error) -			goto error_splat_bdi; -	} - -	if (!s->s_root) { -		/* initial superblock/root creation */ -		nfs_clone_super(s, data->sb); -		nfs_fscache_get_super_cookie(s, NULL, data); -	} - -	mntroot = nfs_get_root(s, data->fh); -	if (IS_ERR(mntroot)) { -		error = PTR_ERR(mntroot); -		goto error_splat_super; -	} -	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { -		dput(mntroot); -		error = -ESTALE; -		goto error_splat_super; -	} - -	s->s_flags |= MS_ACTIVE; +	/* create a new volume representation */ +	server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); -	/* clone any lsm security options from the parent to the new sb */ -	security_sb_clone_mnt_opts(data->sb, s); +	if (IS_ERR(server)) +		mntroot = ERR_CAST(server); +	else +		mntroot = nfs_fs_mount_common(server, flags, +				dev_name, &mount_info, nfs_mod); -	dprintk("<-- nfs_xdev_mount() = 0\n"); +	dprintk("<-- nfs_xdev_mount() = %ld\n", +			IS_ERR(mntroot) ? 
PTR_ERR(mntroot) : 0L);  	return mntroot; - -out_err_nosb: -	nfs_free_server(server); -out_err_noserver: -	dprintk("<-- nfs_xdev_mount() = %d [error]\n", error); -	return ERR_PTR(error); - -error_splat_super: -	if (server && !s->s_root) -		bdi_unregister(&server->backing_dev_info); -error_splat_bdi: -	deactivate_locked_super(s); -	dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error); -	return ERR_PTR(error); -} - -#ifdef CONFIG_NFS_V4 - -/* - * Finish setting up a cloned NFS4 superblock - */ -static void nfs4_clone_super(struct super_block *sb, -			    const struct super_block *old_sb) -{ -	sb->s_blocksize_bits = old_sb->s_blocksize_bits; -	sb->s_blocksize = old_sb->s_blocksize; -	sb->s_maxbytes = old_sb->s_maxbytes; -	sb->s_time_gran = 1; -	sb->s_op = old_sb->s_op; - 	nfs_initialise_sb(sb);  } -/* - * Set up an NFS4 superblock - */ -static void nfs4_fill_super(struct super_block *sb) -{ -	sb->s_time_gran = 1; -	sb->s_op = &nfs4_sops; -	nfs_initialise_sb(sb); -} +#if IS_ENABLED(CONFIG_NFS_V4)  static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)  { @@ -2512,43 +2729,6 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)  			 NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL);  } -static int nfs4_validate_text_mount_data(void *options, -					 struct nfs_parsed_mount_data *args, -					 const char *dev_name) -{ -	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - -	nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); - -	nfs_validate_transport_protocol(args); - -	nfs4_validate_mount_flags(args); - -	if (args->version != 4) { -		dfprintk(MOUNT, -			 "NFS4: Illegal mount version\n"); -		return -EINVAL; -	} - -	if (args->auth_flavor_len > 1) { -		dfprintk(MOUNT, -			 "NFS4: Too many RPC auth flavours specified\n"); -		return -EINVAL; -	} - -	if (args->client_address == NULL) { -		dfprintk(MOUNT, -			 "NFS4: mount program didn't pass callback address\n"); -		return -EINVAL; -	} - -	return 
nfs_parse_devname(dev_name, -				   &args->nfs_server.hostname, -				   NFS4_MAXNAMLEN, -				   &args->nfs_server.export_path, -				   NFS4_MAXPATHLEN); -} -  /*   * Validate NFSv4 mount options   */ @@ -2563,6 +2743,8 @@ static int nfs4_validate_mount_data(void *options,  	if (data == NULL)  		goto out_no_data; +	args->version = 4; +  	switch (data->version) {  	case 1:  		if (data->host_addrlen > sizeof(args->nfs_server.address)) @@ -2574,15 +2756,19 @@ static int nfs4_validate_mount_data(void *options,  			return -EFAULT;  		if (!nfs_verify_server_address(sap))  			goto out_no_address; +		args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);  		if (data->auth_flavourlen) { +			rpc_authflavor_t pseudoflavor;  			if (data->auth_flavourlen > 1)  				goto out_inval_auth; -			if (copy_from_user(&args->auth_flavors[0], +			if (copy_from_user(&pseudoflavor,  					   data->auth_flavours, -					   sizeof(args->auth_flavors[0]))) +					   sizeof(pseudoflavor)))  				return -EFAULT; -		} +			args->selected_flavor = pseudoflavor; +		} else +			args->selected_flavor = RPC_AUTH_UNIX;  		c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);  		if (IS_ERR(c)) @@ -2616,16 +2802,12 @@ static int nfs4_validate_mount_data(void *options,  		args->acdirmax	= data->acdirmax;  		args->nfs_server.protocol = data->proto;  		nfs_validate_transport_protocol(args); +		if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) +			goto out_invalid_transport_udp;  		break;  	default: -		if (nfs_parse_mount_options((char *)options, args) == 0) -			return -EINVAL; - -		if (!nfs_verify_server_address(sap)) -			return -EINVAL; - -		return nfs4_validate_text_mount_data(options, args, dev_name); +		return NFS_TEXT_DATA;  	}  	return 0; @@ -2642,535 +2824,76 @@ out_inval_auth:  out_no_address:  	dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");  	return -EINVAL; + +out_invalid_transport_udp: +	dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); +	return 
-EINVAL;  }  /* - * Get the superblock for the NFS4 root partition + * NFS v4 module parameters need to stay in the + * NFS client for backwards compatibility   */ -static struct dentry * -nfs4_remote_mount(struct file_system_type *fs_type, int flags, -		  const char *dev_name, void *raw_data) -{ -	struct nfs_parsed_mount_data *data = raw_data; -	struct super_block *s; -	struct nfs_server *server; -	struct nfs_fh *mntfh; -	struct dentry *mntroot; -	int (*compare_super)(struct super_block *, void *) = nfs_compare_super; -	struct nfs_sb_mountdata sb_mntdata = { -		.mntflags = flags, -	}; -	int error = -ENOMEM; - -	mntfh = nfs_alloc_fhandle(); -	if (data == NULL || mntfh == NULL) -		goto out_free_fh; - -	security_init_mnt_opts(&data->lsm_opts); - -	/* Get a volume representation */ -	server = nfs4_create_server(data, mntfh); -	if (IS_ERR(server)) { -		error = PTR_ERR(server); -		goto out; -	} -	sb_mntdata.server = server; - -	if (server->flags & NFS4_MOUNT_UNSHARED) -		compare_super = NULL; - -	/* Get a superblock - note that we may end up sharing one that already exists */ -	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); -	if (IS_ERR(s)) { -		error = PTR_ERR(s); -		goto out_free; -	} - -	if (s->s_fs_info != server) { -		nfs_free_server(server); -		server = NULL; -	} else { -		error = nfs_bdi_register(server); -		if (error) -			goto error_splat_bdi; -	} - -	if (!s->s_root) { -		/* initial superblock/root creation */ -		nfs4_fill_super(s); -		nfs_fscache_get_super_cookie( -			s, data ? 
data->fscache_uniq : NULL, NULL); -	} - -	mntroot = nfs4_get_root(s, mntfh); -	if (IS_ERR(mntroot)) { -		error = PTR_ERR(mntroot); -		goto error_splat_super; -	} - -	error = security_sb_set_mnt_opts(s, &data->lsm_opts); -	if (error) -		goto error_splat_root; - -	s->s_flags |= MS_ACTIVE; - -	security_free_mnt_opts(&data->lsm_opts); -	nfs_free_fhandle(mntfh); -	return mntroot; - -out: -	security_free_mnt_opts(&data->lsm_opts); -out_free_fh: -	nfs_free_fhandle(mntfh); -	return ERR_PTR(error); - -out_free: -	nfs_free_server(server); -	goto out; - -error_splat_root: -	dput(mntroot); -error_splat_super: -	if (server && !s->s_root) -		bdi_unregister(&server->backing_dev_info); -error_splat_bdi: -	deactivate_locked_super(s); -	goto out; -} - -static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, -		int flags, void *data, const char *hostname) -{ -	struct vfsmount *root_mnt; -	char *root_devname; -	size_t len; - -	len = strlen(hostname) + 3; -	root_devname = kmalloc(len, GFP_KERNEL); -	if (root_devname == NULL) -		return ERR_PTR(-ENOMEM); -	snprintf(root_devname, len, "%s:/", hostname); -	root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); -	kfree(root_devname); -	return root_mnt; -} - -static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt) -{ -	char *page = (char *) __get_free_page(GFP_KERNEL); -	char *devname, *tmp; - -	if (page == NULL) -		return; -	devname = nfs_path(path->mnt->mnt_devname, -			path->mnt->mnt_root, path->dentry, -			page, PAGE_SIZE); -	if (IS_ERR(devname)) -		goto out_freepage; -	tmp = kstrdup(devname, GFP_KERNEL); -	if (tmp == NULL) -		goto out_freepage; -	kfree(mnt->mnt_devname); -	mnt->mnt_devname = tmp; -out_freepage: -	free_page((unsigned long)page); -} - -struct nfs_referral_count { -	struct list_head list; -	const struct task_struct *task; -	unsigned int referral_count; -}; - -static LIST_HEAD(nfs_referral_count_list); -static DEFINE_SPINLOCK(nfs_referral_count_list_lock); - -static struct 
nfs_referral_count *nfs_find_referral_count(void) -{ -	struct nfs_referral_count *p; - -	list_for_each_entry(p, &nfs_referral_count_list, list) { -		if (p->task == current) -			return p; -	} -	return NULL; -} - -#define NFS_MAX_NESTED_REFERRALS 2 - -static int nfs_referral_loop_protect(void) +unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_tcpport; +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600; +/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ +bool nfs4_disable_idmapping = true; +unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +unsigned short send_implementation_id = 1; +char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; +bool recover_lost_locks = false; + +EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); +EXPORT_SYMBOL_GPL(nfs_callback_tcpport); +EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); +EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); +EXPORT_SYMBOL_GPL(max_session_slots); +EXPORT_SYMBOL_GPL(send_implementation_id); +EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); +EXPORT_SYMBOL_GPL(recover_lost_locks); + +#define NFS_CALLBACK_MAXPORTNR (65535U) + +static int param_set_portnr(const char *val, const struct kernel_param *kp)  { -	struct nfs_referral_count *p, *new; -	int ret = -ENOMEM; - -	new = kmalloc(sizeof(*new), GFP_KERNEL); -	if (!new) -		goto out; -	new->task = current; -	new->referral_count = 1; - -	ret = 0; -	spin_lock(&nfs_referral_count_list_lock); -	p = nfs_find_referral_count(); -	if (p != NULL) { -		if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) -			ret = -ELOOP; -		else -			p->referral_count++; -	} else { -		list_add(&new->list, &nfs_referral_count_list); -		new = NULL; -	} -	spin_unlock(&nfs_referral_count_list_lock); -	kfree(new); -out: -	return ret; -} - -static void nfs_referral_loop_unprotect(void) -{ -	struct nfs_referral_count *p; - -	spin_lock(&nfs_referral_count_list_lock); -	p = nfs_find_referral_count(); -	p->referral_count--; -	if (p->referral_count 
== 0) -		list_del(&p->list); -	else -		p = NULL; -	spin_unlock(&nfs_referral_count_list_lock); -	kfree(p); -} - -static int nfs_follow_remote_path(struct vfsmount *root_mnt, -		const char *export_path, struct vfsmount *mnt_target) -{ -	struct nameidata *nd = NULL; -	struct mnt_namespace *ns_private; -	struct super_block *s; +	unsigned long num;  	int ret; -	nd = kmalloc(sizeof(*nd), GFP_KERNEL); -	if (nd == NULL) -		return -ENOMEM; - -	ns_private = create_mnt_ns(root_mnt); -	ret = PTR_ERR(ns_private); -	if (IS_ERR(ns_private)) -		goto out_mntput; - -	ret = nfs_referral_loop_protect(); -	if (ret != 0) -		goto out_put_mnt_ns; - -	ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, -			export_path, LOOKUP_FOLLOW, nd); - -	nfs_referral_loop_unprotect(); -	put_mnt_ns(ns_private); - -	if (ret != 0) -		goto out_err; - -	s = nd->path.mnt->mnt_sb; -	atomic_inc(&s->s_active); -	mnt_target->mnt_sb = s; -	mnt_target->mnt_root = dget(nd->path.dentry); - -	/* Correct the device pathname */ -	nfs_fix_devname(&nd->path, mnt_target); - -	path_put(&nd->path); -	kfree(nd); -	down_write(&s->s_umount); +	if (!val) +		return -EINVAL; +	ret = kstrtoul(val, 0, &num); +	if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) +		return -EINVAL; +	*((unsigned int *)kp->arg) = num;  	return 0; -out_put_mnt_ns: -	put_mnt_ns(ns_private); -out_mntput: -	mntput(root_mnt); -out_err: -	kfree(nd); -	return ret; -} - -static int nfs4_try_mount(int flags, const char *dev_name, -			 struct nfs_parsed_mount_data *data, -			 struct vfsmount *mnt) -{ -	char *export_path; -	struct vfsmount *root_mnt; -	int error; - -	dfprintk(MOUNT, "--> nfs4_try_mount()\n"); - -	export_path = data->nfs_server.export_path; -	data->nfs_server.export_path = "/"; -	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, -			data->nfs_server.hostname); -	data->nfs_server.export_path = export_path; - -	error = PTR_ERR(root_mnt); -	if (IS_ERR(root_mnt)) -		goto out; - -	error = nfs_follow_remote_path(root_mnt, export_path, 
mnt); - -out: -	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, -			error != 0 ? " [error]" : ""); -	return error;  } +static struct kernel_param_ops param_ops_portnr = { +	.set = param_set_portnr, +	.get = param_get_uint, +}; +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +module_param(nfs_idmap_cache_timeout, int, 0644); +module_param(nfs4_disable_idmapping, bool, 0644); +module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, +			NFS4_CLIENT_ID_UNIQ_LEN, 0600); +MODULE_PARM_DESC(nfs4_disable_idmapping, +		"Turn off NFSv4 idmapping when using 'sec=sys'"); +module_param(max_session_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " +		"requests the client will negotiate"); +module_param(send_implementation_id, ushort, 0644); +MODULE_PARM_DESC(send_implementation_id, +		"Send implementation ID with NFSv4.1 exchange_id"); +MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); + +module_param(recover_lost_locks, bool, 0644); +MODULE_PARM_DESC(recover_lost_locks, +		 "If the server reports that a lock might be lost, " +		 "try to recover it risking data corruption."); -/* - * Get the superblock for an NFS4 mountpoint - */ -static int nfs4_get_sb(struct file_system_type *fs_type, -	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) -{ -	struct nfs_parsed_mount_data *data; -	int error = -ENOMEM; - -	data = nfs_alloc_parsed_mount_data(4); -	if (data == NULL) -		goto out_free_data; - -	/* Validate the mount data */ -	error = nfs4_validate_mount_data(raw_data, data, dev_name); -	if (error < 0) -		goto out; - -	error = nfs4_try_mount(flags, dev_name, data, mnt); - -out: -	kfree(data->client_address); -	kfree(data->nfs_server.export_path); -	kfree(data->nfs_server.hostname); -	kfree(data->fscache_uniq); -out_free_data: -	kfree(data); -	dprintk("<-- nfs4_get_sb() = 
%d%s\n", error, -			error != 0 ? " [error]" : ""); -	return error; -} - -static void nfs4_kill_super(struct super_block *sb) -{ -	struct nfs_server *server = NFS_SB(sb); - -	dprintk("--> %s\n", __func__); -	nfs_super_return_all_delegations(sb); -	kill_anon_super(sb); -	nfs_fscache_release_super_cookie(sb); -	nfs_free_server(server); -	dprintk("<-- %s\n", __func__); -} - -/* - * Clone an NFS4 server record on xdev traversal (FSID-change) - */ -static struct dentry * -nfs4_xdev_mount(struct file_system_type *fs_type, int flags, -		 const char *dev_name, void *raw_data) -{ -	struct nfs_clone_mount *data = raw_data; -	struct super_block *s; -	struct nfs_server *server; -	struct dentry *mntroot; -	int (*compare_super)(struct super_block *, void *) = nfs_compare_super; -	struct nfs_sb_mountdata sb_mntdata = { -		.mntflags = flags, -	}; -	int error; - -	dprintk("--> nfs4_xdev_mount()\n"); - -	/* create a new volume representation */ -	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); -	if (IS_ERR(server)) { -		error = PTR_ERR(server); -		goto out_err_noserver; -	} -	sb_mntdata.server = server; - -	if (server->flags & NFS4_MOUNT_UNSHARED) -		compare_super = NULL; - -	/* Get a superblock - note that we may end up sharing one that already exists */ -	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); -	if (IS_ERR(s)) { -		error = PTR_ERR(s); -		goto out_err_nosb; -	} - -	if (s->s_fs_info != server) { -		nfs_free_server(server); -		server = NULL; -	} else { -		error = nfs_bdi_register(server); -		if (error) -			goto error_splat_bdi; -	} - -	if (!s->s_root) { -		/* initial superblock/root creation */ -		nfs4_clone_super(s, data->sb); -		nfs_fscache_get_super_cookie(s, NULL, data); -	} - -	mntroot = nfs4_get_root(s, data->fh); -	if (IS_ERR(mntroot)) { -		error = PTR_ERR(mntroot); -		goto error_splat_super; -	} -	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { -		dput(mntroot); -		error = -ESTALE; -		goto 
error_splat_super; -	} - -	s->s_flags |= MS_ACTIVE; - -	security_sb_clone_mnt_opts(data->sb, s); - -	dprintk("<-- nfs4_xdev_mount() = 0\n"); -	return mntroot; - -out_err_nosb: -	nfs_free_server(server); -out_err_noserver: -	dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error); -	return ERR_PTR(error); - -error_splat_super: -	if (server && !s->s_root) -		bdi_unregister(&server->backing_dev_info); -error_splat_bdi: -	deactivate_locked_super(s); -	dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error); -	return ERR_PTR(error); -} - -static struct dentry * -nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, -			   const char *dev_name, void *raw_data) -{ -	struct nfs_clone_mount *data = raw_data; -	struct super_block *s; -	struct nfs_server *server; -	struct dentry *mntroot; -	struct nfs_fh *mntfh; -	int (*compare_super)(struct super_block *, void *) = nfs_compare_super; -	struct nfs_sb_mountdata sb_mntdata = { -		.mntflags = flags, -	}; -	int error = -ENOMEM; - -	dprintk("--> nfs4_referral_get_sb()\n"); - -	mntfh = nfs_alloc_fhandle(); -	if (mntfh == NULL) -		goto out_err_nofh; - -	/* create a new volume representation */ -	server = nfs4_create_referral_server(data, mntfh); -	if (IS_ERR(server)) { -		error = PTR_ERR(server); -		goto out_err_noserver; -	} -	sb_mntdata.server = server; - -	if (server->flags & NFS4_MOUNT_UNSHARED) -		compare_super = NULL; - -	/* Get a superblock - note that we may end up sharing one that already exists */ -	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); -	if (IS_ERR(s)) { -		error = PTR_ERR(s); -		goto out_err_nosb; -	} - -	if (s->s_fs_info != server) { -		nfs_free_server(server); -		server = NULL; -	} else { -		error = nfs_bdi_register(server); -		if (error) -			goto error_splat_bdi; -	} - -	if (!s->s_root) { -		/* initial superblock/root creation */ -		nfs4_fill_super(s); -		nfs_fscache_get_super_cookie(s, NULL, data); -	} - -	mntroot = nfs4_get_root(s, mntfh); -	if (IS_ERR(mntroot)) { -		
error = PTR_ERR(mntroot); -		goto error_splat_super; -	} -	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { -		dput(mntroot); -		error = -ESTALE; -		goto error_splat_super; -	} - -	s->s_flags |= MS_ACTIVE; - -	security_sb_clone_mnt_opts(data->sb, s); - -	nfs_free_fhandle(mntfh); -	dprintk("<-- nfs4_referral_get_sb() = 0\n"); -	return mntroot; - -out_err_nosb: -	nfs_free_server(server); -out_err_noserver: -	nfs_free_fhandle(mntfh); -out_err_nofh: -	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); -	return ERR_PTR(error); - -error_splat_super: -	if (server && !s->s_root) -		bdi_unregister(&server->backing_dev_info); -error_splat_bdi: -	deactivate_locked_super(s); -	nfs_free_fhandle(mntfh); -	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); -	return ERR_PTR(error); -} - -/* - * Create an NFS4 server record on referral traversal - */ -static int nfs4_referral_get_sb(struct file_system_type *fs_type, -		int flags, const char *dev_name, void *raw_data, -		struct vfsmount *mnt) -{ -	struct nfs_clone_mount *data = raw_data; -	char *export_path; -	struct vfsmount *root_mnt; -	int error; - -	dprintk("--> nfs4_referral_get_sb()\n"); - -	export_path = data->mnt_path; -	data->mnt_path = "/"; - -	root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, -			flags, data, data->hostname); -	data->mnt_path = export_path; - -	error = PTR_ERR(root_mnt); -	if (IS_ERR(root_mnt)) -		goto out; - -	error = nfs_follow_remote_path(root_mnt, export_path, mnt); -out: -	dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error, -			error != 0 ? 
" [error]" : ""); -	return error; -}  #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 978aaeb8a09..bb6ed810fa6 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -9,39 +9,11 @@  #include <linux/fs.h>  #include <linux/sysctl.h>  #include <linux/module.h> -#include <linux/nfs4.h> -#include <linux/nfs_idmap.h>  #include <linux/nfs_fs.h> -#include "callback.h" - -#ifdef CONFIG_NFS_V4 -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; -#endif  static struct ctl_table_header *nfs_callback_sysctl_table; -static ctl_table nfs_cb_sysctls[] = { -#ifdef CONFIG_NFS_V4 -	{ -		.procname = "nfs_callback_tcpport", -		.data = &nfs_callback_set_tcpport, -		.maxlen = sizeof(int), -		.mode = 0644, -		.proc_handler = proc_dointvec_minmax, -		.extra1 = (int *)&nfs_set_port_min, -		.extra2 = (int *)&nfs_set_port_max, -	}, -#ifndef CONFIG_NFS_USE_NEW_IDMAPPER -	{ -		.procname = "idmap_cache_timeout", -		.data = &nfs_idmap_cache_timeout, -		.maxlen = sizeof(int), -		.mode = 0644, -		.proc_handler = proc_dointvec_jiffies, -	}, -#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ -#endif +static struct ctl_table nfs_cb_sysctls[] = {  	{  		.procname	= "nfs_mountpoint_timeout",  		.data		= &nfs_mountpoint_expiry_timeout, @@ -59,7 +31,7 @@ static ctl_table nfs_cb_sysctls[] = {  	{ }  }; -static ctl_table nfs_cb_sysctl_dir[] = { +static struct ctl_table nfs_cb_sysctl_dir[] = {  	{  		.procname = "nfs",  		.mode = 0555, @@ -68,7 +40,7 @@ static ctl_table nfs_cb_sysctl_dir[] = {  	{ }  }; -static ctl_table nfs_cb_sysctl_root[] = { +static struct ctl_table nfs_cb_sysctl_root[] = {  	{  		.procname = "fs",  		.mode = 0555, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 7bdec853140..de54129336c 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -14,20 +14,14 @@  #include <linux/sched.h>  #include <linux/wait.h>  #include <linux/namei.h> +#include <linux/fsnotify.h>  #include "internal.h"  #include "nfs4_fs.h"  #include "iostat.h"  
#include "delegation.h" -struct nfs_unlinkdata { -	struct hlist_node list; -	struct nfs_removeargs args; -	struct nfs_removeres res; -	struct inode *dir; -	struct rpc_cred	*cred; -	struct nfs_fattr dir_attr; -}; +#include "nfstrace.h"  /**   * nfs_free_unlinkdata - release data from a sillydelete operation. @@ -86,8 +80,9 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)  	struct nfs_unlinkdata *data = calldata;  	struct inode *dir = data->dir; +	trace_nfs_sillyrename_unlink(data, task->tk_status);  	if (!NFS_PROTO(dir)->unlink_done(task, dir)) -		nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); +		rpc_restart_call_prepare(task);  }  /** @@ -107,25 +102,16 @@ static void nfs_async_unlink_release(void *calldata)  	nfs_sb_deactive(sb);  } -#if defined(CONFIG_NFS_V4_1) -void nfs_unlink_prepare(struct rpc_task *task, void *calldata) +static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs_unlinkdata *data = calldata; -	struct nfs_server *server = NFS_SERVER(data->dir); - -	if (nfs4_setup_sequence(server, &data->args.seq_args, -				&data->res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +	NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);  } -#endif /* CONFIG_NFS_V4_1 */  static const struct rpc_call_ops nfs_unlink_ops = {  	.rpc_call_done = nfs_async_unlink_done,  	.rpc_release = nfs_async_unlink_release, -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_unlink_prepare, -#endif /* CONFIG_NFS_V4_1 */  };  static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) @@ -147,23 +133,33 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n  	alias = d_lookup(parent, &data->args.name);  	if (alias != NULL) { -		int ret = 0; +		int ret; +		void *devname_garbage = NULL;  		/*  		 * Hey, we raced with lookup... See if we need to transfer  		 * the sillyrename information to the aliased dentry.  		 
*/  		nfs_free_dname(data); +		ret = nfs_copy_dname(alias, data);  		spin_lock(&alias->d_lock); -		if (alias->d_inode != NULL && +		if (ret == 0 && alias->d_inode != NULL &&  		    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { +			devname_garbage = alias->d_fsdata;  			alias->d_fsdata = data;  			alias->d_flags |= DCACHE_NFSFS_RENAMED;  			ret = 1; -		} +		} else +			ret = 0;  		spin_unlock(&alias->d_lock);  		nfs_dec_sillycount(dir);  		dput(alias); +		/* +		 * If we'd displaced old cached devname, free it.  At that +		 * point dentry is definitely not a root, so we won't need +		 * that anymore. +		 */ +		kfree(devname_garbage);  		return ret;  	}  	data->dir = igrab(dir); @@ -180,7 +176,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n  	task_setup_data.rpc_client = NFS_CLIENT(dir);  	task = rpc_run_task(&task_setup_data);  	if (!IS_ERR(task)) -		rpc_put_task(task); +		rpc_put_task_async(task);  	return 1;  } @@ -195,8 +191,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)  	if (parent == NULL)  		goto out_free;  	dir = parent->d_inode; -	if (nfs_copy_dname(dentry, data) != 0) -		goto out_dput;  	/* Non-exclusive lock protects against concurrent lookup() calls */  	spin_lock(&dir->i_lock);  	if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -214,6 +208,13 @@ out_free:  	return ret;  } +void nfs_wait_on_sillyrename(struct dentry *dentry) +{ +	struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + +	wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1); +} +  void nfs_block_sillyrename(struct dentry *dentry)  {  	struct nfs_inode *nfsi = NFS_I(dentry->d_inode); @@ -252,6 +253,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)  {  	struct nfs_unlinkdata *data;  	int status = -ENOMEM; +	void *devname_garbage = NULL;  	data = kzalloc(sizeof(*data), GFP_KERNEL);  	if (data == NULL) @@ -269,8 +271,15 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)  	if 
(dentry->d_flags & DCACHE_NFSFS_RENAMED)  		goto out_unlock;  	dentry->d_flags |= DCACHE_NFSFS_RENAMED; +	devname_garbage = dentry->d_fsdata;  	dentry->d_fsdata = data;  	spin_unlock(&dentry->d_lock); +	/* +	 * If we'd displaced old cached devname, free it.  At that +	 * point dentry is definitely not a root, so we won't need +	 * that anymore. +	 */ +	kfree(devname_garbage);  	return 0;  out_unlock:  	spin_unlock(&dentry->d_lock); @@ -299,6 +308,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)  	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {  		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;  		data = dentry->d_fsdata; +		dentry->d_fsdata = NULL;  	}  	spin_unlock(&dentry->d_lock); @@ -315,6 +325,7 @@ nfs_cancel_async_unlink(struct dentry *dentry)  		struct nfs_unlinkdata *data = dentry->d_fsdata;  		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; +		dentry->d_fsdata = NULL;  		spin_unlock(&dentry->d_lock);  		nfs_free_unlinkdata(data);  		return; @@ -322,18 +333,6 @@ nfs_cancel_async_unlink(struct dentry *dentry)  	spin_unlock(&dentry->d_lock);  } -struct nfs_renamedata { -	struct nfs_renameargs	args; -	struct nfs_renameres	res; -	struct rpc_cred		*cred; -	struct inode		*old_dir; -	struct dentry		*old_dentry; -	struct nfs_fattr	old_fattr; -	struct inode		*new_dir; -	struct dentry		*new_dentry; -	struct nfs_fattr	new_fattr; -}; -  /**   * nfs_async_rename_done - Sillyrename post-processing   * @task: rpc_task of the sillyrename @@ -346,19 +345,17 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)  	struct nfs_renamedata *data = calldata;  	struct inode *old_dir = data->old_dir;  	struct inode *new_dir = data->new_dir; +	struct dentry *old_dentry = data->old_dentry; +	trace_nfs_sillyrename_rename(old_dir, old_dentry, +			new_dir, data->new_dentry, task->tk_status);  	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { -		nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); +		rpc_restart_call_prepare(task);  		return;  
	} -	if (task->tk_status != 0) { -		nfs_cancel_async_unlink(data->old_dentry); -		return; -	} - -	nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); -	d_move(data->old_dentry, data->new_dentry); +	if (data->complete) +		data->complete(task, data);  }  /** @@ -382,25 +379,16 @@ static void nfs_async_rename_release(void *calldata)  	kfree(data);  } -#if defined(CONFIG_NFS_V4_1)  static void nfs_rename_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs_renamedata *data = calldata; -	struct nfs_server *server = NFS_SERVER(data->old_dir); - -	if (nfs4_setup_sequence(server, &data->args.seq_args, -				&data->res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +	NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data);  } -#endif /* CONFIG_NFS_V4_1 */  static const struct rpc_call_ops nfs_rename_ops = {  	.rpc_call_done = nfs_async_rename_done,  	.rpc_release = nfs_async_rename_release, -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_rename_prepare, -#endif /* CONFIG_NFS_V4_1 */  };  /** @@ -412,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = {   *   * It's expected that valid references to the dentries and inodes are held   */ -static struct rpc_task * +struct rpc_task *  nfs_async_rename(struct inode *old_dir, struct inode *new_dir, -		 struct dentry *old_dentry, struct dentry *new_dentry) +		 struct dentry *old_dentry, struct dentry *new_dentry, +		 void (*complete)(struct rpc_task *, struct nfs_renamedata *))  {  	struct nfs_renamedata *data;  	struct rpc_message msg = { }; @@ -429,7 +418,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,  	data = kzalloc(sizeof(*data), GFP_KERNEL);  	if (data == NULL)  		return ERR_PTR(-ENOMEM); -	task_setup_data.callback_data = data, +	task_setup_data.callback_data = data;  	data->cred = rpc_lookup_cred();  	if (IS_ERR(data->cred)) { @@ -451,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,  	data->new_dentry = dget(new_dentry);  	
nfs_fattr_init(&data->old_fattr);  	nfs_fattr_init(&data->new_fattr); +	data->complete = complete;  	/* set up nfs_renameargs */  	data->args.old_dir = NFS_FH(old_dir); @@ -469,6 +459,35 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,  	return rpc_run_task(&task_setup_data);  } +/* + * Perform tasks needed when a sillyrename is done such as cancelling the + * queued async unlink if it failed. + */ +static void +nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data) +{ +	struct dentry *dentry = data->old_dentry; + +	if (task->tk_status != 0) { +		nfs_cancel_async_unlink(dentry); +		return; +	} + +	/* +	 * vfs_unlink and the like do not issue this when a file is +	 * sillyrenamed, so do it here. +	 */ +	fsnotify_nameremove(dentry, 0); +} + +#define SILLYNAME_PREFIX ".nfs" +#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) +#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) +#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1) +#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \ +		SILLYNAME_FILEID_LEN + \ +		SILLYNAME_COUNTER_LEN) +  /**   * nfs_sillyrename - Perform a silly-rename of a dentry   * @dir: inode of directory that contains dentry @@ -481,48 +500,52 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,   * and only performs the unlink once the last reference to it is put.   *   * The final cleanup is done during dentry_iput. + * + * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server + * could take responsibility for keeping open files referenced.  The server + * would also need to ensure that opened-but-deleted files were kept over + * reboots.  However, we may not assume a server does so.  
(RFC 5661 + * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can + * use to advertise that it does this; some day we may take advantage of + * it.))   */  int  nfs_sillyrename(struct inode *dir, struct dentry *dentry)  {  	static unsigned int sillycounter; -	const int      fileidsize  = sizeof(NFS_FILEID(dentry->d_inode))*2; -	const int      countersize = sizeof(sillycounter)*2; -	const int      slen        = sizeof(".nfs")+fileidsize+countersize-1; -	char           silly[slen+1]; +	unsigned char silly[SILLYNAME_LEN + 1]; +	unsigned long long fileid;  	struct dentry *sdentry;  	struct rpc_task *task; -	int            error = -EIO; +	int            error = -EBUSY; -	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", -		dentry->d_parent->d_name.name, dentry->d_name.name, -		atomic_read(&dentry->d_count)); +	dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n", +		dentry, d_count(dentry));  	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);  	/*  	 * We don't allow a dentry to be silly-renamed twice.  	 
*/ -	error = -EBUSY;  	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)  		goto out; -	sprintf(silly, ".nfs%*.*Lx", -		fileidsize, fileidsize, -		(unsigned long long)NFS_FILEID(dentry->d_inode)); +	fileid = NFS_FILEID(dentry->d_inode);  	/* Return delegation in anticipation of the rename */ -	nfs_inode_return_delegation(dentry->d_inode); +	NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode);  	sdentry = NULL;  	do { -		char *suffix = silly + slen - countersize; - +		int slen;  		dput(sdentry);  		sillycounter++; -		sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); +		slen = scnprintf(silly, sizeof(silly), +				SILLYNAME_PREFIX "%0*llx%0*x", +				SILLYNAME_FILEID_LEN, fileid, +				SILLYNAME_COUNTER_LEN, sillycounter); -		dfprintk(VFS, "NFS: trying to rename %s to %s\n", -				dentry->d_name.name, silly); +		dfprintk(VFS, "NFS: trying to rename %pd to %s\n", +				dentry, silly);  		sdentry = lookup_one_len(silly, dentry->d_parent, slen);  		/* @@ -540,8 +563,17 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)  	if (error)  		goto out_dput; +	/* populate unlinkdata with the right dname */ +	error = nfs_copy_dname(sdentry, +				(struct nfs_unlinkdata *)dentry->d_fsdata); +	if (error) { +		nfs_cancel_async_unlink(dentry); +		goto out_dput; +	} +  	/* run the rename task, undo unlink if it fails */ -	task = nfs_async_rename(dir, dir, dentry, sdentry); +	task = nfs_async_rename(dir, dir, dentry, sdentry, +					nfs_complete_sillyrename);  	if (IS_ERR(task)) {  		error = -EBUSY;  		nfs_cancel_async_unlink(dentry); @@ -552,6 +584,18 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)  	error = rpc_wait_for_completion_task(task);  	if (error == 0)  		error = task->tk_status; +	switch (error) { +	case 0: +		/* The rename succeeded */ +		nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); +		d_move(dentry, sdentry); +		break; +	case -ERESTARTSYS: +		/* The result of the rename is unknown. 
Play it safe by +		 * forcing a new lookup */ +		d_drop(dentry); +		d_drop(sdentry); +	}  	rpc_put_task(task);  out_dput:  	dput(sdentry); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4c14c17a527..5e2f1030454 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -20,6 +20,7 @@  #include <linux/nfs_mount.h>  #include <linux/nfs_page.h>  #include <linux/backing-dev.h> +#include <linux/export.h>  #include <asm/uaccess.h> @@ -28,6 +29,9 @@  #include "iostat.h"  #include "nfs4_fs.h"  #include "fscache.h" +#include "pnfs.h" + +#include "nfstrace.h"  #define NFSDBG_FACILITY		NFSDBG_PAGECACHE @@ -37,20 +41,21 @@  /*   * Local function declarations   */ -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, -				  struct inode *inode, int ioflags);  static void nfs_redirty_request(struct nfs_page *req); -static const struct rpc_call_ops nfs_write_partial_ops; -static const struct rpc_call_ops nfs_write_full_ops;  static const struct rpc_call_ops nfs_commit_ops; +static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; +static const struct nfs_commit_completion_ops nfs_commit_completion_ops; +static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req);  static struct kmem_cache *nfs_wdata_cachep;  static mempool_t *nfs_wdata_mempool; +static struct kmem_cache *nfs_cdata_cachep;  static mempool_t *nfs_commit_mempool; -struct nfs_write_data *nfs_commitdata_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(void)  { -	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); +	struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);  	if (p) {  		memset(p, 0, sizeof(*p)); @@ -58,46 +63,26 @@ struct nfs_write_data *nfs_commitdata_alloc(void)  	}  	return p;  } +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); -void nfs_commit_free(struct nfs_write_data *p) +void nfs_commit_free(struct nfs_commit_data *p)  { -	if (p && (p->pagevec != &p->page_array[0])) -		
kfree(p->pagevec);  	mempool_free(p, nfs_commit_mempool);  } +EXPORT_SYMBOL_GPL(nfs_commit_free); -struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) +static struct nfs_rw_header *nfs_writehdr_alloc(void)  { -	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); +	struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); -	if (p) { +	if (p)  		memset(p, 0, sizeof(*p)); -		INIT_LIST_HEAD(&p->pages); -		p->npages = pagecount; -		if (pagecount <= ARRAY_SIZE(p->page_array)) -			p->pagevec = p->page_array; -		else { -			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); -			if (!p->pagevec) { -				mempool_free(p, nfs_wdata_mempool); -				p = NULL; -			} -		} -	}  	return p;  } -void nfs_writedata_free(struct nfs_write_data *p) -{ -	if (p && (p->pagevec != &p->page_array[0])) -		kfree(p->pagevec); -	mempool_free(p, nfs_wdata_mempool); -} - -static void nfs_writedata_release(struct nfs_write_data *wdata) +static void nfs_writehdr_free(struct nfs_rw_header *whdr)  { -	put_nfs_open_context(wdata->args.context); -	nfs_writedata_free(wdata); +	mempool_free(whdr, nfs_wdata_mempool);  }  static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) @@ -107,25 +92,53 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)  	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);  } -static struct nfs_page *nfs_page_find_request_locked(struct page *page) +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found. 
+ */ +static struct nfs_page * +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)  {  	struct nfs_page *req = NULL; -	if (PagePrivate(page)) { +	if (PagePrivate(page))  		req = (struct nfs_page *)page_private(page); -		if (req != NULL) -			kref_get(&req->wb_kref); +	else if (unlikely(PageSwapCache(page))) { +		struct nfs_page *freq, *t; + +		/* Linearly search the commit list for the correct req */ +		list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { +			if (freq->wb_page == page) { +				req = freq->wb_head; +				break; +			} +		} +	} + +	if (req) { +		WARN_ON_ONCE(req->wb_head != req); + +		kref_get(&req->wb_kref);  	} +  	return req;  } -static struct nfs_page *nfs_page_find_request(struct page *page) +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. + */ +static struct nfs_page *nfs_page_find_head_request(struct page *page)  { -	struct inode *inode = page->mapping->host; +	struct inode *inode = page_file_mapping(page)->host;  	struct nfs_page *req = NULL;  	spin_lock(&inode->i_lock); -	req = nfs_page_find_request_locked(page); +	req = nfs_page_find_head_request_locked(NFS_I(inode), page);  	spin_unlock(&inode->i_lock);  	return req;  } @@ -133,16 +146,16 @@ static struct nfs_page *nfs_page_find_request(struct page *page)  /* Adjust the file length if we're writing beyond the end */  static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)  { -	struct inode *inode = page->mapping->host; +	struct inode *inode = page_file_mapping(page)->host;  	loff_t end, i_size;  	pgoff_t end_index;  	spin_lock(&inode->i_lock);  	i_size = i_size_read(inode);  	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; -	if (i_size > 0 && page->index < end_index) +	if (i_size > 0 && page_file_index(page) < end_index)  		goto out; -	end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); +	end = 
page_file_offset(page) + ((loff_t)offset+count);  	if (i_size >= end)  		goto out;  	i_size_write(inode, end); @@ -154,22 +167,81 @@ out:  /* A writeback failed: mark the page as bad, and invalidate the page cache */  static void nfs_set_pageerror(struct page *page)  { -	SetPageError(page); -	nfs_zap_mapping(page->mapping->host, page->mapping); +	nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); +} + +/* + * nfs_page_group_search_locked + * @head - head request of page group + * @page_offset - offset into page + * + * Search page group with head @head to find a request that contains the + * page offset @page_offset. + * + * Returns a pointer to the first matching nfs request, or NULL if no + * match is found. + * + * Must be called with the page group lock held + */ +static struct nfs_page * +nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) +{ +	struct nfs_page *req; + +	WARN_ON_ONCE(head != head->wb_head); +	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); + +	req = head; +	do { +		if (page_offset >= req->wb_pgbase && +		    page_offset < (req->wb_pgbase + req->wb_bytes)) +			return req; + +		req = req->wb_this_page; +	} while (req != head); + +	return NULL; +} + +/* + * nfs_page_group_covers_page + * @head - head request of page group + * + * Return true if the page group with head @head covers the whole page, + * returns false otherwise + */ +static bool nfs_page_group_covers_page(struct nfs_page *req) +{ +	struct nfs_page *tmp; +	unsigned int pos = 0; +	unsigned int len = nfs_page_length(req->wb_page); + +	nfs_page_group_lock(req); + +	do { +		tmp = nfs_page_group_search_locked(req->wb_head, pos); +		if (tmp) { +			/* no way this should happen */ +			WARN_ON_ONCE(tmp->wb_pgbase != pos); +			pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); +		} +	} while (tmp && pos < len); + +	nfs_page_group_unlock(req); +	WARN_ON_ONCE(pos > len); +	return pos == len;  }  /* We can set the PG_uptodate flag if we 
see that a write request   * covers the full page.   */ -static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +static void nfs_mark_uptodate(struct nfs_page *req)  { -	if (PageUptodate(page)) -		return; -	if (base != 0) +	if (PageUptodate(req->wb_page))  		return; -	if (count != nfs_page_length(page)) +	if (!nfs_page_group_covers_page(req))  		return; -	SetPageUptodate(page); +	SetPageUptodate(req->wb_page);  }  static int wb_priority(struct writeback_control *wbc) @@ -177,8 +249,8 @@ static int wb_priority(struct writeback_control *wbc)  	if (wbc->for_reclaim)  		return FLUSH_HIGHPRI | FLUSH_STABLE;  	if (wbc->for_kupdate || wbc->for_background) -		return FLUSH_LOWPRI; -	return 0; +		return FLUSH_LOWPRI | FLUSH_COND_STABLE; +	return FLUSH_COND_STABLE;  }  /* @@ -191,65 +263,273 @@ int nfs_congestion_kb;  #define NFS_CONGESTION_OFF_THRESH	\  	(NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) -static int nfs_set_page_writeback(struct page *page) +static void nfs_set_page_writeback(struct page *page)  { +	struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);  	int ret = test_set_page_writeback(page); -	if (!ret) { -		struct inode *inode = page->mapping->host; -		struct nfs_server *nfss = NFS_SERVER(inode); +	WARN_ON_ONCE(ret != 0); -		page_cache_get(page); -		if (atomic_long_inc_return(&nfss->writeback) > -				NFS_CONGESTION_ON_THRESH) { -			set_bdi_congested(&nfss->backing_dev_info, -						BLK_RW_ASYNC); -		} +	if (atomic_long_inc_return(&nfss->writeback) > +			NFS_CONGESTION_ON_THRESH) { +		set_bdi_congested(&nfss->backing_dev_info, +					BLK_RW_ASYNC);  	} -	return ret;  } -static void nfs_end_page_writeback(struct page *page) +static void nfs_end_page_writeback(struct nfs_page *req)  { -	struct inode *inode = page->mapping->host; +	struct inode *inode = page_file_mapping(req->wb_page)->host;  	struct nfs_server *nfss = NFS_SERVER(inode); -	end_page_writeback(page); -	page_cache_release(page); +	if 
(!nfs_page_group_sync_on_bit(req, PG_WB_END)) +		return; + +	end_page_writeback(req->wb_page);  	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)  		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);  } -static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) + +/* nfs_page_group_clear_bits + *   @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req)  { -	struct inode *inode = page->mapping->host; -	struct nfs_page *req; +	clear_bit(PG_TEARDOWN, &req->wb_flags); +	clear_bit(PG_UNLOCKPAGE, &req->wb_flags); +	clear_bit(PG_UPTODATE, &req->wb_flags); +	clear_bit(PG_WB_END, &req->wb_flags); +	clear_bit(PG_REMOVE, &req->wb_flags); +} + + +/* + * nfs_unroll_locks_and_wait -  unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head  - head request of page group, must be holding head lock + * @req   - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + *       and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. 
+ */ +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, +			  struct nfs_page *req, bool nonblock) +	__releases(&inode->i_lock) +{ +	struct nfs_page *tmp; +	int ret; + +	/* relinquish all the locks successfully grabbed this run */ +	for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) +		nfs_unlock_request(tmp); + +	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + +	/* grab a ref on the request that will be waited on */ +	kref_get(&req->wb_kref); + +	nfs_page_group_unlock(head); +	spin_unlock(&inode->i_lock); + +	/* release ref from nfs_page_find_head_request_locked */ +	nfs_release_request(head); + +	if (!nonblock) +		ret = nfs_wait_on_request(req); +	else +		ret = -EAGAIN; +	nfs_release_request(req); + +	return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. + */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, +				 struct nfs_page *old_head) +{ +	while (destroy_list) { +		struct nfs_page *subreq = destroy_list; + +		destroy_list = (subreq->wb_this_page == old_head) ? +				   NULL : subreq->wb_this_page; + +		WARN_ON_ONCE(old_head != subreq->wb_head); + +		/* make sure old group is not used */ +		subreq->wb_head = subreq; +		subreq->wb_this_page = subreq; + +		nfs_clear_request_commit(subreq); + +		/* subreq is now totally disconnected from page group or any +		 * write / commit lists. 
last chance to wake any waiters */ +		nfs_unlock_request(subreq); + +		if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { +			/* release ref on old head request */ +			nfs_release_request(old_head); + +			nfs_page_group_clear_bits(subreq); + +			/* release the PG_INODE_REF reference */ +			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) +				nfs_release_request(subreq); +			else +				WARN_ON_ONCE(1); +		} else { +			WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); +			/* zombie requests have already released the last +			 * reference and were waiting on the rest of the +			 * group to complete. Since it's no longer part of a +			 * group, simply free the request */ +			nfs_page_group_clear_bits(subreq); +			nfs_free_request(subreq); +		} +	} +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + *                              a locked reference, cancelling any pending + *                              operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group.  All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. 
+ */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock) +{ +	struct inode *inode = page_file_mapping(page)->host; +	struct nfs_page *head, *subreq; +	struct nfs_page *destroy_list = NULL; +	unsigned int total_bytes;  	int ret; +try_again: +	total_bytes = 0; + +	WARN_ON_ONCE(destroy_list); +  	spin_lock(&inode->i_lock); -	for (;;) { -		req = nfs_page_find_request_locked(page); -		if (req == NULL) -			break; -		if (nfs_set_page_tag_locked(req)) -			break; -		/* Note: If we hold the page lock, as is the case in nfs_writepage, -		 *	 then the call to nfs_set_page_tag_locked() will always -		 *	 succeed provided that someone hasn't already marked the -		 *	 request as dirty (in which case we don't care). -		 */ + +	/* +	 * A reference is taken only on the head request which acts as a +	 * reference to the whole page group - the group will not be destroyed +	 * until the head reference is released. +	 */ +	head = nfs_page_find_head_request_locked(NFS_I(inode), page); + +	if (!head) {  		spin_unlock(&inode->i_lock); -		if (!nonblock) -			ret = nfs_wait_on_request(req); -		else -			ret = -EAGAIN; -		nfs_release_request(req); -		if (ret != 0) +		return NULL; +	} + +	/* lock each request in the page group */ +	nfs_page_group_lock(head); +	subreq = head; +	do { +		/* +		 * Subrequests are always contiguous, non overlapping +		 * and in order. If not, it's a programming error. 
+		 */ +		WARN_ON_ONCE(subreq->wb_offset != +		     (head->wb_offset + total_bytes)); + +		/* keep track of how many bytes this group covers */ +		total_bytes += subreq->wb_bytes; + +		if (!nfs_lock_request(subreq)) { +			/* releases page group bit lock and +			 * inode spin lock and all references */ +			ret = nfs_unroll_locks_and_wait(inode, head, +				subreq, nonblock); + +			if (ret == 0) +				goto try_again; +  			return ERR_PTR(ret); -		spin_lock(&inode->i_lock); +		} + +		subreq = subreq->wb_this_page; +	} while (subreq != head); + +	/* Now that all requests are locked, make sure they aren't on any list. +	 * Commit list removal accounting is done after locks are dropped */ +	subreq = head; +	do { +		nfs_list_remove_request(subreq); +		subreq = subreq->wb_this_page; +	} while (subreq != head); + +	/* unlink subrequests from head, destroy them later */ +	if (head->wb_this_page != head) { +		/* destroy list will be terminated by head */ +		destroy_list = head->wb_this_page; +		head->wb_this_page = head; + +		/* change head request to cover whole range that +		 * the former page group covered */ +		head->wb_bytes = total_bytes;  	} + +	/* +	 * prepare head request to be added to new pgio descriptor +	 */ +	nfs_page_group_clear_bits(head); + +	/* +	 * some part of the group was still on the inode list - otherwise +	 * the group wouldn't be involved in async write. +	 * grab a reference for the head request, iff it needs one. 
+	 */ +	if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) +		kref_get(&head->wb_kref); + +	nfs_page_group_unlock(head); + +	/* drop lock to clear_request_commit the head req and clean up +	 * requests on destroy list */  	spin_unlock(&inode->i_lock); -	return req; + +	nfs_destroy_unlinked_subrequests(destroy_list, head); + +	/* clean up commit list state */ +	nfs_clear_request_commit(head); + +	/* still holds ref on head from nfs_page_find_head_request_locked +	 * and still has lock on head from lock loop */ +	return head;  }  /* @@ -262,17 +542,17 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,  	struct nfs_page *req;  	int ret = 0; -	req = nfs_find_and_lock_request(page, nonblock); +	req = nfs_lock_and_join_requests(page, nonblock);  	if (!req)  		goto out;  	ret = PTR_ERR(req);  	if (IS_ERR(req))  		goto out; -	ret = nfs_set_page_writeback(page); -	BUG_ON(ret != 0); -	BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); +	nfs_set_page_writeback(page); +	WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); +	ret = 0;  	if (!nfs_pageio_add_request(pgio, req)) {  		nfs_redirty_request(req);  		ret = pgio->pg_error; @@ -283,13 +563,13 @@ out:  static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)  { -	struct inode *inode = page->mapping->host; +	struct inode *inode = page_file_mapping(page)->host;  	int ret;  	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);  	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); -	nfs_pageio_cond_complete(pgio, page->index); +	nfs_pageio_cond_complete(pgio, page_file_index(page));  	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);  	if (ret == -EAGAIN) {  		redirty_page_for_writepage(wbc, page); @@ -306,7 +586,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc  	struct nfs_pageio_descriptor pgio;  	int err; -	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); +	nfs_pageio_init_write(&pgio, 
page->mapping->host, wb_priority(wbc), +				false, &nfs_async_write_completion_ops);  	err = nfs_do_writepage(page, wbc, &pgio);  	nfs_pageio_complete(&pgio);  	if (err < 0) @@ -349,12 +630,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)  	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); -	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); +	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, +				&nfs_async_write_completion_ops);  	err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);  	nfs_pageio_complete(&pgio);  	clear_bit_unlock(NFS_INO_FLUSHING, bitlock); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(bitlock, NFS_INO_FLUSHING);  	if (err < 0) @@ -370,36 +652,33 @@ out_err:  /*   * Insert a write request into an inode   */ -static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)  {  	struct nfs_inode *nfsi = NFS_I(inode); -	int error; -	error = radix_tree_preload(GFP_NOFS); -	if (error != 0) -		goto out; +	WARN_ON_ONCE(req->wb_this_page != req);  	/* Lock the request! */ -	nfs_lock_request_dontget(req); +	nfs_lock_request(req);  	spin_lock(&inode->i_lock); -	error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); -	BUG_ON(error); -	if (!nfsi->npages) { -		igrab(inode); -		if (nfs_have_delegation(inode, FMODE_WRITE)) -			nfsi->change_attr++; +	if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) +		inode->i_version++; +	/* +	 * Swap-space should not get truncated. Hence no need to plug the race +	 * with invalidate/truncate. 
+	 */ +	if (likely(!PageSwapCache(req->wb_page))) { +		set_bit(PG_MAPPED, &req->wb_flags); +		SetPagePrivate(req->wb_page); +		set_page_private(req->wb_page, (unsigned long)req);  	} -	SetPagePrivate(req->wb_page); -	set_page_private(req->wb_page, (unsigned long)req);  	nfsi->npages++; +	/* this a head request for a page group - mark it as having an +	 * extra reference so sub groups can follow suit */ +	WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));  	kref_get(&req->wb_kref); -	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, -				NFS_PAGE_TAG_LOCKED);  	spin_unlock(&inode->i_lock); -	radix_tree_preload_end(); -out: -	return error;  }  /* @@ -407,151 +686,288 @@ out:   */  static void nfs_inode_remove_request(struct nfs_page *req)  { -	struct inode *inode = req->wb_context->path.dentry->d_inode; +	struct inode *inode = req->wb_context->dentry->d_inode;  	struct nfs_inode *nfsi = NFS_I(inode); +	struct nfs_page *head; -	BUG_ON (!NFS_WBACK_BUSY(req)); +	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { +		head = req->wb_head; -	spin_lock(&inode->i_lock); -	set_page_private(req->wb_page, 0); -	ClearPagePrivate(req->wb_page); -	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); -	nfsi->npages--; -	if (!nfsi->npages) { -		spin_unlock(&inode->i_lock); -		iput(inode); -	} else +		spin_lock(&inode->i_lock); +		if (likely(!PageSwapCache(head->wb_page))) { +			set_page_private(head->wb_page, 0); +			ClearPagePrivate(head->wb_page); +			clear_bit(PG_MAPPED, &head->wb_flags); +		} +		nfsi->npages--;  		spin_unlock(&inode->i_lock); -	nfs_clear_request(req); -	nfs_release_request(req); +	} + +	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) +		nfs_release_request(req);  }  static void  nfs_mark_request_dirty(struct nfs_page *req)  {  	__set_page_dirty_nobuffers(req->wb_page); -	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);  } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || 
IS_ENABLED(CONFIG_NFS_V4) +/** + * nfs_request_add_commit_list - add request to a commit list + * @req: pointer to a struct nfs_page + * @dst: commit list head + * @cinfo: holds list lock and accounting info + * + * This sets the PG_CLEAN bit, updates the cinfo count of + * number of outstanding requests requiring a commit as well as + * the MM page stats. + * + * The caller must _not_ hold the cinfo->lock, but must be + * holding the nfs_page lock. + */ +void +nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, +			    struct nfs_commit_info *cinfo) +{ +	set_bit(PG_CLEAN, &(req)->wb_flags); +	spin_lock(cinfo->lock); +	nfs_list_add_request(req, dst); +	cinfo->mds->ncommit++; +	spin_unlock(cinfo->lock); +	if (!cinfo->dreq) { +		inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +		inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, +			     BDI_RECLAIMABLE); +		__mark_inode_dirty(req->wb_context->dentry->d_inode, +				   I_DIRTY_DATASYNC); +	} +} +EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); + +/** + * nfs_request_remove_commit_list - Remove request from a commit list + * @req: pointer to a nfs_page + * @cinfo: holds list lock and accounting info + * + * This clears the PG_CLEAN bit, and updates the cinfo's count of + * number of outstanding requests requiring a commit + * It does not update the MM page stats. + * + * The caller _must_ hold the cinfo->lock and the nfs_page lock. 
+ */ +void +nfs_request_remove_commit_list(struct nfs_page *req, +			       struct nfs_commit_info *cinfo) +{ +	if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) +		return; +	nfs_list_remove_request(req); +	cinfo->mds->ncommit--; +} +EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); + +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, +				      struct inode *inode) +{ +	cinfo->lock = &inode->i_lock; +	cinfo->mds = &NFS_I(inode)->commit_info; +	cinfo->ds = pnfs_get_ds_info(inode); +	cinfo->dreq = NULL; +	cinfo->completion_ops = &nfs_commit_completion_ops; +} + +void nfs_init_cinfo(struct nfs_commit_info *cinfo, +		    struct inode *inode, +		    struct nfs_direct_req *dreq) +{ +	if (dreq) +		nfs_init_cinfo_from_dreq(cinfo, dreq); +	else +		nfs_init_cinfo_from_inode(cinfo, inode); +} +EXPORT_SYMBOL_GPL(nfs_init_cinfo); +  /*   * Add a request to the inode's commit list.   */ -static void -nfs_mark_request_commit(struct nfs_page *req) +void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, +			struct nfs_commit_info *cinfo)  { -	struct inode *inode = req->wb_context->path.dentry->d_inode; -	struct nfs_inode *nfsi = NFS_I(inode); +	if (pnfs_mark_request_commit(req, lseg, cinfo)) +		return; +	nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); +} -	spin_lock(&inode->i_lock); -	set_bit(PG_CLEAN, &(req)->wb_flags); -	radix_tree_tag_set(&nfsi->nfs_page_tree, -			req->wb_index, -			NFS_PAGE_TAG_COMMIT); -	nfsi->ncommit++; -	spin_unlock(&inode->i_lock); -	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); -	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); -	__mark_inode_dirty(inode, I_DIRTY_DATASYNC); +static void +nfs_clear_page_commit(struct page *page) +{ +	dec_zone_page_state(page, NR_UNSTABLE_NFS); +	dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);  } -static int +static void  nfs_clear_request_commit(struct nfs_page *req)  { -	struct page *page = 
req->wb_page; +	if (test_bit(PG_CLEAN, &req->wb_flags)) { +		struct inode *inode = req->wb_context->dentry->d_inode; +		struct nfs_commit_info cinfo; -	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { -		dec_zone_page_state(page, NR_UNSTABLE_NFS); -		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); -		return 1; +		nfs_init_cinfo_from_inode(&cinfo, inode); +		if (!pnfs_clear_request_commit(req, &cinfo)) { +			spin_lock(cinfo.lock); +			nfs_request_remove_commit_list(req, &cinfo); +			spin_unlock(cinfo.lock); +		} +		nfs_clear_page_commit(req->wb_page);  	} -	return 0;  }  static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data)  { +	if (data->verf.committed == NFS_DATA_SYNC) +		return data->header->lseg == NULL;  	return data->verf.committed != NFS_FILE_SYNC;  } -static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +#else +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, +				      struct inode *inode)  { -	if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { -		nfs_mark_request_commit(req); -		return 1; -	} -	if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { -		nfs_mark_request_dirty(req); -		return 1; -	} -	return 0;  } -#else -static inline void -nfs_mark_request_commit(struct nfs_page *req) + +void nfs_init_cinfo(struct nfs_commit_info *cinfo, +		    struct inode *inode, +		    struct nfs_direct_req *dreq)  {  } -static inline int -nfs_clear_request_commit(struct nfs_page *req) +void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, +			struct nfs_commit_info *cinfo)  { -	return 0;  } -static inline -int nfs_write_need_commit(struct nfs_write_data *data) +static void +nfs_clear_request_commit(struct nfs_page *req)  { -	return 0;  }  static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_write_need_commit(struct nfs_pgio_data *data)  {  	return 0;  } +  #endif -#if 
defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -static int -nfs_need_commit(struct nfs_inode *nfsi) +static void nfs_write_completion(struct nfs_pgio_header *hdr) +{ +	struct nfs_commit_info cinfo; +	unsigned long bytes = 0; + +	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) +		goto out; +	nfs_init_cinfo_from_inode(&cinfo, hdr->inode); +	while (!list_empty(&hdr->pages)) { +		struct nfs_page *req = nfs_list_entry(hdr->pages.next); + +		bytes += req->wb_bytes; +		nfs_list_remove_request(req); +		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && +		    (hdr->good_bytes < bytes)) { +			nfs_set_pageerror(req->wb_page); +			nfs_context_set_write_error(req->wb_context, hdr->error); +			goto remove_req; +		} +		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { +			nfs_mark_request_dirty(req); +			goto next; +		} +		if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { +			memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); +			nfs_mark_request_commit(req, hdr->lseg, &cinfo); +			goto next; +		} +remove_req: +		nfs_inode_remove_request(req); +next: +		nfs_unlock_request(req); +		nfs_end_page_writeback(req); +		nfs_release_request(req); +	} +out: +	hdr->release(hdr); +} + +#if  IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +unsigned long +nfs_reqs_to_commit(struct nfs_commit_info *cinfo) +{ +	return cinfo->mds->ncommit; +} + +/* cinfo->lock held by caller */ +int +nfs_scan_commit_list(struct list_head *src, struct list_head *dst, +		     struct nfs_commit_info *cinfo, int max)  { -	return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); +	struct nfs_page *req, *tmp; +	int ret = 0; + +	list_for_each_entry_safe(req, tmp, src, wb_list) { +		if (!nfs_lock_request(req)) +			continue; +		kref_get(&req->wb_kref); +		if (cond_resched_lock(cinfo->lock)) +			list_safe_reset_next(req, tmp, wb_list); +		nfs_request_remove_commit_list(req, cinfo); +		nfs_list_add_request(req, dst); +		ret++; +		if ((ret == max) && !cinfo->dreq) +			break; +	} +	return ret;  
}  /*   * nfs_scan_commit - Scan an inode for commit requests   * @inode: NFS inode to scan - * @dst: destination list - * @idx_start: lower bound of page->index to scan. - * @npages: idx_start + npages sets the upper bound to scan. + * @dst: mds destination list + * @cinfo: mds and ds lists of reqs ready to commit   *   * Moves requests from the inode's 'commit' request list.   * The requests are *not* checked to ensure that they form a contiguous set.   */ -static int -nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +int +nfs_scan_commit(struct inode *inode, struct list_head *dst, +		struct nfs_commit_info *cinfo)  { -	struct nfs_inode *nfsi = NFS_I(inode); -	int ret; +	int ret = 0; -	if (!nfs_need_commit(nfsi)) -		return 0; +	spin_lock(cinfo->lock); +	if (cinfo->mds->ncommit > 0) { +		const int max = INT_MAX; -	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); -	if (ret > 0) -		nfsi->ncommit -= ret; -	if (nfs_need_commit(NFS_I(inode))) -		__mark_inode_dirty(inode, I_DIRTY_DATASYNC); +		ret = nfs_scan_commit_list(&cinfo->mds->list, dst, +					   cinfo, max); +		ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); +	} +	spin_unlock(cinfo->lock);  	return ret;  } +  #else -static inline int nfs_need_commit(struct nfs_inode *nfsi) +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)  {  	return 0;  } -static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +int nfs_scan_commit(struct inode *inode, struct list_head *dst, +		    struct nfs_commit_info *cinfo)  {  	return 0;  } @@ -581,10 +997,14 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,  	spin_lock(&inode->i_lock);  	for (;;) { -		req = nfs_page_find_request_locked(page); +		req = nfs_page_find_head_request_locked(NFS_I(inode), page);  		if (req == NULL)  			goto out_unlock; +		/* should be handled by nfs_flush_incompatible */ +		
WARN_ON_ONCE(req->wb_head != req); +		WARN_ON_ONCE(req->wb_this_page != req); +  		rqend = req->wb_offset + req->wb_bytes;  		/*  		 * Tell the caller to flush out the request if @@ -596,7 +1016,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,  		    || end < req->wb_offset)  			goto out_flushme; -		if (nfs_set_page_tag_locked(req)) +		if (nfs_lock_request(req))  			break;  		/* The request is locked, so wait and then retry */ @@ -608,11 +1028,6 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,  		spin_lock(&inode->i_lock);  	} -	if (nfs_clear_request_commit(req) && -			radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, -				req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) -		NFS_I(inode)->ncommit--; -  	/* Okay, the request matches. Update the region */  	if (offset < req->wb_offset) {  		req->wb_offset = offset; @@ -624,6 +1039,8 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,  		req->wb_bytes = rqend - req->wb_offset;  out_unlock:  	spin_unlock(&inode->i_lock); +	if (req) +		nfs_clear_request_commit(req);  	return req;  out_flushme:  	spin_unlock(&inode->i_lock); @@ -643,21 +1060,16 @@ out_err:  static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,  		struct page *page, unsigned int offset, unsigned int bytes)  { -	struct inode *inode = page->mapping->host; +	struct inode *inode = page_file_mapping(page)->host;  	struct nfs_page	*req; -	int error;  	req = nfs_try_to_update_request(inode, page, offset, bytes);  	if (req != NULL)  		goto out; -	req = nfs_create_request(ctx, inode, page, offset, bytes); +	req = nfs_create_request(ctx, page, NULL, offset, bytes);  	if (IS_ERR(req))  		goto out; -	error = nfs_inode_add_request(inode, req); -	if (error != 0) { -		nfs_release_request(req); -		req = ERR_PTR(error); -	} +	nfs_inode_add_request(inode, req);  out:  	return req;  } @@ -670,18 +1082,18 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page 
*page,  	req = nfs_setup_write_request(ctx, page, offset, count);  	if (IS_ERR(req))  		return PTR_ERR(req); -	nfs_mark_request_dirty(req);  	/* Update file length */  	nfs_grow_file(page, offset, count); -	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); +	nfs_mark_uptodate(req);  	nfs_mark_request_dirty(req); -	nfs_clear_page_tag_locked(req); +	nfs_unlock_and_release_request(req);  	return 0;  }  int nfs_flush_incompatible(struct file *file, struct page *page)  {  	struct nfs_open_context *ctx = nfs_file_open_context(file); +	struct nfs_lock_context *l_ctx;  	struct nfs_page	*req;  	int do_flush, status;  	/* @@ -693,29 +1105,95 @@ int nfs_flush_incompatible(struct file *file, struct page *page)  	 * dropped page.  	 */  	do { -		req = nfs_page_find_request(page); +		req = nfs_page_find_head_request(page);  		if (req == NULL)  			return 0; -		do_flush = req->wb_page != page || req->wb_context != ctx || -			req->wb_lock_context->lockowner != current->files || -			req->wb_lock_context->pid != current->tgid; +		l_ctx = req->wb_lock_context; +		do_flush = req->wb_page != page || req->wb_context != ctx; +		/* for now, flush if more than 1 request in page_group */ +		do_flush |= req->wb_this_page != req; +		if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { +			do_flush |= l_ctx->lockowner.l_owner != current->files +				|| l_ctx->lockowner.l_pid != current->tgid; +		}  		nfs_release_request(req);  		if (!do_flush)  			return 0; -		status = nfs_wb_page(page->mapping->host, page); +		status = nfs_wb_page(page_file_mapping(page)->host, page);  	} while (status == 0);  	return status;  }  /* + * Avoid buffered writes when a open context credential's key would + * expire soon. + * + * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL. + * + * Return 0 and set a credential flag which triggers the inode to flush + * and performs  NFS_FILE_SYNC writes if the key will expired within + * RPC_KEY_EXPIRE_TIMEO. 
+ */ +int +nfs_key_timeout_notify(struct file *filp, struct inode *inode) +{ +	struct nfs_open_context *ctx = nfs_file_open_context(filp); +	struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + +	return rpcauth_key_timeout_notify(auth, ctx->cred); +} + +/* + * Test if the open context credential key is marked to expire soon. + */ +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) +{ +	return rpcauth_cred_key_to_expire(ctx->cred); +} + +/*   * If the page cache is marked as unsafe or invalid, then we can't rely on   * the PageUptodate() flag. In this case, we will need to turn off   * write optimisations that depend on the page contents being correct.   */ -static int nfs_write_pageuptodate(struct page *page, struct inode *inode) +static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)  { -	return PageUptodate(page) && -		!(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); +	struct nfs_inode *nfsi = NFS_I(inode); + +	if (nfs_have_delegated_attributes(inode)) +		goto out; +	if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) +		return false; +	smp_rmb(); +	if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) +		return false; +out: +	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) +		return false; +	return PageUptodate(page) != 0; +} + +/* If we know the page is up to date, and we're not using byte range locks (or + * if we have the whole file locked for writing), it may be more efficient to + * extend the write to cover the entire page in order to avoid fragmentation + * inefficiencies. + * + * If the file is opened for synchronous writes then we can just skip the rest + * of the checks. 
+ */ +static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) +{ +	if (file->f_flags & O_DSYNC) +		return 0; +	if (!nfs_write_pageuptodate(page, inode)) +		return 0; +	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) +		return 1; +	if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && +			inode->i_flock->fl_end == OFFSET_MAX && +			inode->i_flock->fl_type != F_RDLCK)) +		return 1; +	return 0;  }  /* @@ -728,24 +1206,15 @@ int nfs_updatepage(struct file *file, struct page *page,  		unsigned int offset, unsigned int count)  {  	struct nfs_open_context *ctx = nfs_file_open_context(file); -	struct inode	*inode = page->mapping->host; +	struct inode	*inode = page_file_mapping(page)->host;  	int		status = 0;  	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); -	dprintk("NFS:       nfs_updatepage(%s/%s %d@%lld)\n", -		file->f_path.dentry->d_parent->d_name.name, -		file->f_path.dentry->d_name.name, count, -		(long long)(page_offset(page) + offset)); +	dprintk("NFS:       nfs_updatepage(%pD2 %d@%lld)\n", +		file, count, (long long)(page_file_offset(page) + offset)); -	/* If we're not using byte range locks, and we know the page -	 * is up to date, it may be more efficient to extend the write -	 * to cover the entire page in order to avoid fragmentation -	 * inefficiencies. 
-	 */ -	if (nfs_write_pageuptodate(page, inode) && -			inode->i_flock == NULL && -			!(file->f_flags & O_DSYNC)) { +	if (nfs_can_extend_write(file, page, inode)) {  		count = max(count + offset, nfs_page_length(page));  		offset = 0;  	} @@ -753,22 +1222,14 @@ int nfs_updatepage(struct file *file, struct page *page,  	status = nfs_writepage_setup(ctx, page, offset, count);  	if (status < 0)  		nfs_set_pageerror(page); +	else +		__set_page_dirty_nobuffers(page);  	dprintk("NFS:       nfs_updatepage returns %d (isize %lld)\n",  			status, (long long)i_size_read(inode));  	return status;  } -static void nfs_writepage_release(struct nfs_page *req) -{ -	struct page *page = req->wb_page; - -	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) -		nfs_inode_remove_request(req); -	nfs_clear_page_tag_locked(req); -	nfs_end_page_writeback(page); -} -  static int flush_task_priority(int how)  {  	switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { @@ -780,85 +1241,17 @@ static int flush_task_priority(int how)  	return RPC_PRIORITY_NORMAL;  } -/* - * Set up the argument/result storage required for the RPC call. 
- */ -static int nfs_write_rpcsetup(struct nfs_page *req, -		struct nfs_write_data *data, -		const struct rpc_call_ops *call_ops, -		unsigned int count, unsigned int offset, -		int how) +static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, +			       struct rpc_task_setup *task_setup_data, int how)  { -	struct inode *inode = req->wb_context->path.dentry->d_inode; +	struct inode *inode = data->header->inode;  	int priority = flush_task_priority(how); -	struct rpc_task *task; -	struct rpc_message msg = { -		.rpc_argp = &data->args, -		.rpc_resp = &data->res, -		.rpc_cred = req->wb_context->cred, -	}; -	struct rpc_task_setup task_setup_data = { -		.rpc_client = NFS_CLIENT(inode), -		.task = &data->task, -		.rpc_message = &msg, -		.callback_ops = call_ops, -		.callback_data = data, -		.workqueue = nfsiod_workqueue, -		.flags = RPC_TASK_ASYNC, -		.priority = priority, -	}; -	int ret = 0; -	/* Set up the RPC argument and reply structs -	 * NB: take care not to mess about with data->commit et al. */ - -	data->req = req; -	data->inode = inode = req->wb_context->path.dentry->d_inode; -	data->cred = msg.rpc_cred; - -	data->args.fh     = NFS_FH(inode); -	data->args.offset = req_offset(req) + offset; -	data->args.pgbase = req->wb_pgbase + offset; -	data->args.pages  = data->pagevec; -	data->args.count  = count; -	data->args.context = get_nfs_open_context(req->wb_context); -	data->args.lock_context = req->wb_lock_context; -	data->args.stable  = NFS_UNSTABLE; -	if (how & FLUSH_STABLE) { -		data->args.stable = NFS_DATA_SYNC; -		if (!nfs_need_commit(NFS_I(inode))) -			data->args.stable = NFS_FILE_SYNC; -	} - -	data->res.fattr   = &data->fattr; -	data->res.count   = count; -	data->res.verf    = &data->verf; -	nfs_fattr_init(&data->fattr); - -	/* Set up the initial task struct.  
*/ -	NFS_PROTO(inode)->write_setup(data, &msg); +	task_setup_data->priority = priority; +	NFS_PROTO(inode)->write_setup(data, msg); -	dprintk("NFS: %5u initiated write call " -		"(req %s/%lld, %u bytes @ offset %llu)\n", -		data->task.tk_pid, -		inode->i_sb->s_id, -		(long long)NFS_FILEID(inode), -		count, -		(unsigned long long)data->args.offset); - -	task = rpc_run_task(&task_setup_data); -	if (IS_ERR(task)) { -		ret = PTR_ERR(task); -		goto out; -	} -	if (how & FLUSH_SYNC) { -		ret = rpc_wait_for_completion_task(task); -		if (ret == 0) -			ret = task->tk_status; -	} -	rpc_put_task(task); -out: -	return ret; +	nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, +				 &task_setup_data->rpc_client, msg, data);  }  /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -867,280 +1260,109 @@ out:   */  static void nfs_redirty_request(struct nfs_page *req)  { -	struct page *page = req->wb_page; -  	nfs_mark_request_dirty(req); -	nfs_clear_page_tag_locked(req); -	nfs_end_page_writeback(page); -} - -/* - * Generate multiple small requests to write out a single - * contiguous dirty area on one page. 
- */ -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) -{ -	struct nfs_page *req = nfs_list_entry(head->next); -	struct page *page = req->wb_page; -	struct nfs_write_data *data; -	size_t wsize = NFS_SERVER(inode)->wsize, nbytes; -	unsigned int offset; -	int requests = 0; -	int ret = 0; -	LIST_HEAD(list); - -	nfs_list_remove_request(req); - -	nbytes = count; -	do { -		size_t len = min(nbytes, wsize); - -		data = nfs_writedata_alloc(1); -		if (!data) -			goto out_bad; -		list_add(&data->pages, &list); -		requests++; -		nbytes -= len; -	} while (nbytes != 0); -	atomic_set(&req->wb_complete, requests); - -	ClearPageError(page); -	offset = 0; -	nbytes = count; -	do { -		int ret2; - -		data = list_entry(list.next, struct nfs_write_data, pages); -		list_del_init(&data->pages); - -		data->pagevec[0] = page; - -		if (nbytes < wsize) -			wsize = nbytes; -		ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, -				   wsize, offset, how); -		if (ret == 0) -			ret = ret2; -		offset += wsize; -		nbytes -= wsize; -	} while (nbytes != 0); - -	return ret; - -out_bad: -	while (!list_empty(&list)) { -		data = list_entry(list.next, struct nfs_write_data, pages); -		list_del(&data->pages); -		nfs_writedata_release(data); -	} -	nfs_redirty_request(req); -	return -ENOMEM; +	nfs_unlock_request(req); +	nfs_end_page_writeback(req); +	nfs_release_request(req);  } -/* - * Create an RPC task for the given write request and kick it. - * The page must have been locked by the caller. - * - * It may happen that the page we're passed is not marked dirty. - * This is the case if nfs_updatepage detects a conflicting request - * that has been written but not committed. 
- */ -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static void nfs_async_write_error(struct list_head *head)  { -	struct nfs_page		*req; -	struct page		**pages; -	struct nfs_write_data	*data; - -	data = nfs_writedata_alloc(npages); -	if (!data) -		goto out_bad; - -	pages = data->pagevec; -	while (!list_empty(head)) { -		req = nfs_list_entry(head->next); -		nfs_list_remove_request(req); -		nfs_list_add_request(req, &data->pages); -		ClearPageError(req->wb_page); -		*pages++ = req->wb_page; -	} -	req = nfs_list_entry(data->pages.next); +	struct nfs_page	*req; -	/* Set up the argument struct */ -	return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); - out_bad:  	while (!list_empty(head)) {  		req = nfs_list_entry(head->next);  		nfs_list_remove_request(req);  		nfs_redirty_request(req);  	} -	return -ENOMEM;  } -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, -				  struct inode *inode, int ioflags) -{ -	size_t wsize = NFS_SERVER(inode)->wsize; - -	if (wsize < PAGE_CACHE_SIZE) -		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); -	else -		nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); -} +static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { +	.error_cleanup = nfs_async_write_error, +	.completion = nfs_write_completion, +}; -/* - * Handle a write reply that flushed part of a page. 
- */ -static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) +void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, +			       struct inode *inode, int ioflags, bool force_mds, +			       const struct nfs_pgio_completion_ops *compl_ops)  { -	struct nfs_write_data	*data = calldata; - -	dprintk("NFS: %5u write(%s/%lld %d@%lld)", -		task->tk_pid, -		data->req->wb_context->path.dentry->d_inode->i_sb->s_id, -		(long long) -		  NFS_FILEID(data->req->wb_context->path.dentry->d_inode), -		data->req->wb_bytes, (long long)req_offset(data->req)); +	struct nfs_server *server = NFS_SERVER(inode); +	const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; -	nfs_writeback_done(task, data); +#ifdef CONFIG_NFS_V4_1 +	if (server->pnfs_curr_ld && !force_mds) +		pg_ops = server->pnfs_curr_ld->pg_write_ops; +#endif +	nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, +			server->wsize, ioflags);  } +EXPORT_SYMBOL_GPL(nfs_pageio_init_write); -static void nfs_writeback_release_partial(void *calldata) +void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)  { -	struct nfs_write_data	*data = calldata; -	struct nfs_page		*req = data->req; -	struct page		*page = req->wb_page; -	int status = data->task.tk_status; - -	if (status < 0) { -		nfs_set_pageerror(page); -		nfs_context_set_write_error(req->wb_context, status); -		dprintk(", error = %d\n", status); -		goto out; -	} +	pgio->pg_ops = &nfs_pgio_rw_ops; +	pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -	if (nfs_write_need_commit(data)) { -		struct inode *inode = page->mapping->host; -		spin_lock(&inode->i_lock); -		if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { -			/* Do nothing we need to resend the writes */ -		} else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { -			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); -			dprintk(" defer commit\n"); -		} else if (memcmp(&req->wb_verf, &data->verf, 
sizeof(req->wb_verf))) { -			set_bit(PG_NEED_RESCHED, &req->wb_flags); -			clear_bit(PG_NEED_COMMIT, &req->wb_flags); -			dprintk(" server reboot detected\n"); -		} -		spin_unlock(&inode->i_lock); -	} else -		dprintk(" OK\n"); +void nfs_commit_prepare(struct rpc_task *task, void *calldata) +{ +	struct nfs_commit_data *data = calldata; -out: -	if (atomic_dec_and_test(&req->wb_complete)) -		nfs_writepage_release(req); -	nfs_writedata_release(calldata); +	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);  } -#if defined(CONFIG_NFS_V4_1) -void nfs_write_prepare(struct rpc_task *task, void *calldata) +static void nfs_writeback_release_common(struct nfs_pgio_data *data)  { -	struct nfs_write_data *data = calldata; +	struct nfs_pgio_header *hdr = data->header; +	int status = data->task.tk_status; -	if (nfs4_setup_sequence(NFS_SERVER(data->inode), -				&data->args.seq_args, -				&data->res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +	if ((status >= 0) && nfs_write_need_commit(data)) { +		spin_lock(&hdr->lock); +		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) +			; /* Do nothing */ +		else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) +			memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf)); +		else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf))) +			set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); +		spin_unlock(&hdr->lock); +	}  } -#endif /* CONFIG_NFS_V4_1 */ - -static const struct rpc_call_ops nfs_write_partial_ops = { -#if defined(CONFIG_NFS_V4_1) -	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ -	.rpc_call_done = nfs_writeback_done_partial, -	.rpc_release = nfs_writeback_release_partial, -};  /* - * Handle a write reply that flushes a whole page. - * - * FIXME: There is an inherent race with invalidate_inode_pages and - *	  writebacks since the page->count is kept > 1 for as long - *	  as the page has a write request pending. + * Special version of should_remove_suid() that ignores capabilities.   
*/ -static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) -{ -	struct nfs_write_data	*data = calldata; - -	nfs_writeback_done(task, data); -} - -static void nfs_writeback_release_full(void *calldata) +static int nfs_should_remove_suid(const struct inode *inode)  { -	struct nfs_write_data	*data = calldata; -	int status = data->task.tk_status; - -	/* Update attributes as result of writeback. */ -	while (!list_empty(&data->pages)) { -		struct nfs_page *req = nfs_list_entry(data->pages.next); -		struct page *page = req->wb_page; +	umode_t mode = inode->i_mode; +	int kill = 0; -		nfs_list_remove_request(req); +	/* suid always must be killed */ +	if (unlikely(mode & S_ISUID)) +		kill = ATTR_KILL_SUID; -		dprintk("NFS: %5u write (%s/%lld %d@%lld)", -			data->task.tk_pid, -			req->wb_context->path.dentry->d_inode->i_sb->s_id, -			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), -			req->wb_bytes, -			(long long)req_offset(req)); +	/* +	 * sgid without any exec bits is just a mandatory locking mark; leave +	 * it alone.  If some exec bits are set, it's a real sgid; kill it. 
+	 */ +	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) +		kill |= ATTR_KILL_SGID; -		if (status < 0) { -			nfs_set_pageerror(page); -			nfs_context_set_write_error(req->wb_context, status); -			dprintk(", error = %d\n", status); -			goto remove_request; -		} +	if (unlikely(kill && S_ISREG(mode))) +		return kill; -		if (nfs_write_need_commit(data)) { -			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); -			nfs_mark_request_commit(req); -			dprintk(" marked for commit\n"); -			goto next; -		} -		dprintk(" OK\n"); -remove_request: -		nfs_inode_remove_request(req); -	next: -		nfs_clear_page_tag_locked(req); -		nfs_end_page_writeback(page); -	} -	nfs_writedata_release(calldata); +	return 0;  } -static const struct rpc_call_ops nfs_write_full_ops = { -#if defined(CONFIG_NFS_V4_1) -	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ -	.rpc_call_done = nfs_writeback_done_full, -	.rpc_release = nfs_writeback_release_full, -}; - -  /*   * This function is called when the WRITE call is complete.   */ -int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, +			      struct inode *inode)  { -	struct nfs_writeargs	*argp = &data->args; -	struct nfs_writeres	*resp = &data->res; -	struct nfs_server	*server = NFS_SERVER(data->inode);  	int status; -	dprintk("NFS: %5u nfs_writeback_done (status %d)\n", -		task->tk_pid, task->tk_status); -  	/*  	 * ->write_done will attempt to use post-op attributes to detect  	 * conflicting writes by other clients.  A strict interpretation @@ -1148,13 +1370,13 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)  	 * another writer had changed the file, but some applications  	 * depend on tighter cache coherency when writing.  	 
*/ -	status = NFS_PROTO(data->inode)->write_done(task, data); +	status = NFS_PROTO(inode)->write_done(task, data);  	if (status != 0)  		return status; -	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); +	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -	if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +	if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {  		/* We tried a write call, but the server did not  		 * commit data to stable storage even though we  		 * requested it. @@ -1165,105 +1387,146 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)  		 */  		static unsigned long    complain; +		/* Note this will print the MDS for a DS write */  		if (time_before(complain, jiffies)) {  			dprintk("NFS:       faulty NFS server %s:"  				" (committed = %d) != (stable = %d)\n", -				server->nfs_client->cl_hostname, -				resp->verf->committed, argp->stable); +				NFS_SERVER(inode)->nfs_client->cl_hostname, +				data->res.verf->committed, data->args.stable);  			complain = jiffies + 300 * HZ;  		}  	}  #endif -	/* Is this a short write? */ -	if (task->tk_status >= 0 && resp->count < argp->count) { + +	/* Deal with the suid/sgid bit corner case */ +	if (nfs_should_remove_suid(inode)) +		nfs_mark_for_revalidate(inode); +	return 0; +} + +/* + * This function is called when the WRITE call is complete. + */ +static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) +{ +	struct nfs_pgio_args	*argp = &data->args; +	struct nfs_pgio_res	*resp = &data->res; + +	if (resp->count < argp->count) {  		static unsigned long    complain; -		nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); +		/* This a short write! */ +		nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);  		/* Has the server at least made some progress? 
*/ -		if (resp->count != 0) { -			/* Was this an NFSv2 write or an NFSv3 stable write? */ -			if (resp->verf->committed != NFS_UNSTABLE) { -				/* Resend from where the server left off */ -				argp->offset += resp->count; -				argp->pgbase += resp->count; -				argp->count -= resp->count; -			} else { -				/* Resend as a stable write in order to avoid -				 * headaches in the case of a server crash. -				 */ -				argp->stable = NFS_FILE_SYNC; +		if (resp->count == 0) { +			if (time_before(complain, jiffies)) { +				printk(KERN_WARNING +				       "NFS: Server wrote zero bytes, expected %u.\n", +				       argp->count); +				complain = jiffies + 300 * HZ;  			} -			nfs_restart_rpc(task, server->nfs_client); -			return -EAGAIN; +			nfs_set_pgio_error(data->header, -EIO, argp->offset); +			task->tk_status = -EIO; +			return;  		} -		if (time_before(complain, jiffies)) { -			printk(KERN_WARNING -			       "NFS: Server wrote zero bytes, expected %u.\n", -					argp->count); -			complain = jiffies + 300 * HZ; +		/* Was this an NFSv2 write or an NFSv3 stable write? */ +		if (resp->verf->committed != NFS_UNSTABLE) { +			/* Resend from where the server left off */ +			data->mds_offset += resp->count; +			argp->offset += resp->count; +			argp->pgbase += resp->count; +			argp->count -= resp->count; +		} else { +			/* Resend as a stable write in order to avoid +			 * headaches in the case of a server crash. +			 */ +			argp->stable = NFS_FILE_SYNC;  		} -		/* Can't do anything about it except throw an error. 
*/ -		task->tk_status = -EIO; +		rpc_restart_call_prepare(task);  	} -	return 0;  } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)  static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)  { +	int ret; +  	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))  		return 1; -	if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, -				NFS_INO_COMMIT, nfs_wait_bit_killable, -				TASK_KILLABLE)) -		return 1; -	return 0; +	if (!may_wait) +		return 0; +	ret = out_of_line_wait_on_bit_lock(&nfsi->flags, +				NFS_INO_COMMIT, +				nfs_wait_bit_killable, +				TASK_KILLABLE); +	return (ret < 0) ? ret : 1;  }  static void nfs_commit_clear_lock(struct nfs_inode *nfsi)  {  	clear_bit(NFS_INO_COMMIT, &nfsi->flags); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);  } - -static void nfs_commitdata_release(void *data) +void nfs_commitdata_release(struct nfs_commit_data *data)  { -	struct nfs_write_data *wdata = data; - -	put_nfs_open_context(wdata->args.context); -	nfs_commit_free(wdata); +	put_nfs_open_context(data->context); +	nfs_commit_free(data);  } +EXPORT_SYMBOL_GPL(nfs_commitdata_release); -/* - * Set up the argument/result storage required for the RPC call. 
- */ -static int nfs_commit_rpcsetup(struct list_head *head, -		struct nfs_write_data *data, -		int how) +int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, +			const struct rpc_call_ops *call_ops, +			int how, int flags)  { -	struct nfs_page *first = nfs_list_entry(head->next); -	struct inode *inode = first->wb_context->path.dentry->d_inode; -	int priority = flush_task_priority(how);  	struct rpc_task *task; +	int priority = flush_task_priority(how);  	struct rpc_message msg = {  		.rpc_argp = &data->args,  		.rpc_resp = &data->res, -		.rpc_cred = first->wb_context->cred, +		.rpc_cred = data->cred,  	};  	struct rpc_task_setup task_setup_data = {  		.task = &data->task, -		.rpc_client = NFS_CLIENT(inode), +		.rpc_client = clnt,  		.rpc_message = &msg, -		.callback_ops = &nfs_commit_ops, +		.callback_ops = call_ops,  		.callback_data = data,  		.workqueue = nfsiod_workqueue, -		.flags = RPC_TASK_ASYNC, +		.flags = RPC_TASK_ASYNC | flags,  		.priority = priority,  	}; +	/* Set up the initial task struct.  */ +	NFS_PROTO(data->inode)->commit_setup(data, &msg); + +	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + +	nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, +		NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); + +	task = rpc_run_task(&task_setup_data); +	if (IS_ERR(task)) +		return PTR_ERR(task); +	if (how & FLUSH_SYNC) +		rpc_wait_for_completion_task(task); +	rpc_put_task(task); +	return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_commit); + +/* + * Set up the argument/result storage required for the RPC call. + */ +void nfs_init_commit(struct nfs_commit_data *data, +		     struct list_head *head, +		     struct pnfs_layout_segment *lseg, +		     struct nfs_commit_info *cinfo) +{ +	struct nfs_page *first = nfs_list_entry(head->next); +	struct inode *inode = first->wb_context->dentry->d_inode;  	/* Set up the RPC argument and reply structs  	 * NB: take care not to mess about with data->commit et al. 
*/ @@ -1271,38 +1534,51 @@ static int nfs_commit_rpcsetup(struct list_head *head,  	list_splice_init(head, &data->pages);  	data->inode	  = inode; -	data->cred	  = msg.rpc_cred; +	data->cred	  = first->wb_context->cred; +	data->lseg	  = lseg; /* reference transferred */ +	data->mds_ops     = &nfs_commit_ops; +	data->completion_ops = cinfo->completion_ops; +	data->dreq	  = cinfo->dreq;  	data->args.fh     = NFS_FH(data->inode);  	/* Note: we always request a commit of the entire inode */  	data->args.offset = 0;  	data->args.count  = 0; -	data->args.context = get_nfs_open_context(first->wb_context); -	data->res.count   = 0; +	data->context     = get_nfs_open_context(first->wb_context);  	data->res.fattr   = &data->fattr;  	data->res.verf    = &data->verf;  	nfs_fattr_init(&data->fattr); +} +EXPORT_SYMBOL_GPL(nfs_init_commit); -	/* Set up the initial task struct.  */ -	NFS_PROTO(inode)->commit_setup(data, &msg); - -	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +void nfs_retry_commit(struct list_head *page_list, +		      struct pnfs_layout_segment *lseg, +		      struct nfs_commit_info *cinfo) +{ +	struct nfs_page *req; -	task = rpc_run_task(&task_setup_data); -	if (IS_ERR(task)) -		return PTR_ERR(task); -	rpc_put_task(task); -	return 0; +	while (!list_empty(page_list)) { +		req = nfs_list_entry(page_list->next); +		nfs_list_remove_request(req); +		nfs_mark_request_commit(req, lseg, cinfo); +		if (!cinfo->dreq) { +			dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +			dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, +				     BDI_RECLAIMABLE); +		} +		nfs_unlock_and_release_request(req); +	}  } +EXPORT_SYMBOL_GPL(nfs_retry_commit);  /*   * Commit dirty pages   */  static int -nfs_commit_list(struct inode *inode, struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how, +		struct nfs_commit_info *cinfo)  { -	struct nfs_write_data	*data; -	struct nfs_page         *req; +	struct 
nfs_commit_data	*data;  	data = nfs_commitdata_alloc(); @@ -1310,18 +1586,13 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)  		goto out_bad;  	/* Set up the argument struct */ -	return nfs_commit_rpcsetup(head, data, how); +	nfs_init_commit(data, head, NULL, cinfo); +	atomic_inc(&cinfo->mds->rpcs_out); +	return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, +				   how, 0);   out_bad: -	while (!list_empty(head)) { -		req = nfs_list_entry(head->next); -		nfs_list_remove_request(req); -		nfs_mark_request_commit(req); -		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); -		dec_bdi_stat(req->wb_page->mapping->backing_dev_info, -				BDI_RECLAIMABLE); -		nfs_clear_page_tag_locked(req); -	} -	nfs_commit_clear_lock(NFS_I(inode)); +	nfs_retry_commit(head, NULL, cinfo); +	cinfo->completion_ops->error_cleanup(NFS_I(inode));  	return -ENOMEM;  } @@ -1330,30 +1601,29 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)   */  static void nfs_commit_done(struct rpc_task *task, void *calldata)  { -	struct nfs_write_data	*data = calldata; +	struct nfs_commit_data	*data = calldata;          dprintk("NFS: %5u nfs_commit_done (status %d)\n",                                  task->tk_pid, task->tk_status);  	/* Call the NFS version-specific code */ -	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) -		return; +	NFS_PROTO(data->inode)->commit_done(task, data);  } -static void nfs_commit_release(void *calldata) +static void nfs_commit_release_pages(struct nfs_commit_data *data)  { -	struct nfs_write_data	*data = calldata; -	struct nfs_page		*req; +	struct nfs_page	*req;  	int status = data->task.tk_status; +	struct nfs_commit_info cinfo;  	while (!list_empty(&data->pages)) {  		req = nfs_list_entry(data->pages.next);  		nfs_list_remove_request(req); -		nfs_clear_request_commit(req); +		nfs_clear_page_commit(req->wb_page); -		dprintk("NFS:       commit (%s/%lld %d@%lld)", -			
req->wb_context->path.dentry->d_inode->i_sb->s_id, -			(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), +		dprintk("NFS:       commit (%s/%llu %d@%lld)", +			req->wb_context->dentry->d_sb->s_id, +			(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),  			req->wb_bytes,  			(long long)req_offset(req));  		if (status < 0) { @@ -1365,7 +1635,7 @@ static void nfs_commit_release(void *calldata)  		/* Okay, COMMIT succeeded, apparently. Check the verifier  		 * returned by the server against all stored verfs. */ -		if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { +		if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) {  			/* We have a match */  			nfs_inode_remove_request(req);  			dprintk(" OK\n"); @@ -1374,42 +1644,71 @@ static void nfs_commit_release(void *calldata)  		/* We have a mismatch. Write the page again */  		dprintk(" mismatch\n");  		nfs_mark_request_dirty(req); +		set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);  	next: -		nfs_clear_page_tag_locked(req); +		nfs_unlock_and_release_request(req);  	} -	nfs_commit_clear_lock(NFS_I(data->inode)); +	nfs_init_cinfo(&cinfo, data->inode, data->dreq); +	if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) +		nfs_commit_clear_lock(NFS_I(data->inode)); +} + +static void nfs_commit_release(void *calldata) +{ +	struct nfs_commit_data *data = calldata; + +	data->completion_ops->completion(data);  	nfs_commitdata_release(calldata);  }  static const struct rpc_call_ops nfs_commit_ops = { -#if defined(CONFIG_NFS_V4_1) -	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */ +	.rpc_call_prepare = nfs_commit_prepare,  	.rpc_call_done = nfs_commit_done,  	.rpc_release = nfs_commit_release,  }; +static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { +	.completion = nfs_commit_release_pages, +	.error_cleanup = nfs_commit_clear_lock, +}; + +int nfs_generic_commit_list(struct inode *inode, struct 
list_head *head, +			    int how, struct nfs_commit_info *cinfo) +{ +	int status; + +	status = pnfs_commit_list(inode, head, how, cinfo); +	if (status == PNFS_NOT_ATTEMPTED) +		status = nfs_commit_list(inode, head, how, cinfo); +	return status; +} +  int nfs_commit_inode(struct inode *inode, int how)  {  	LIST_HEAD(head); +	struct nfs_commit_info cinfo;  	int may_wait = how & FLUSH_SYNC; -	int res = 0; +	int res; -	if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) +	res = nfs_commit_set_lock(NFS_I(inode), may_wait); +	if (res <= 0)  		goto out_mark_dirty; -	spin_lock(&inode->i_lock); -	res = nfs_scan_commit(inode, &head, 0, 0); -	spin_unlock(&inode->i_lock); +	nfs_init_cinfo_from_inode(&cinfo, inode); +	res = nfs_scan_commit(inode, &head, &cinfo);  	if (res) { -		int error = nfs_commit_list(inode, &head, how); +		int error; + +		error = nfs_generic_commit_list(inode, &head, how, &cinfo);  		if (error < 0)  			return error; -		if (may_wait) -			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, -					nfs_wait_bit_killable, -					TASK_KILLABLE); -		else +		if (!may_wait)  			goto out_mark_dirty; +		error = wait_on_bit(&NFS_I(inode)->flags, +				NFS_INO_COMMIT, +				nfs_wait_bit_killable, +				TASK_KILLABLE); +		if (error < 0) +			return error;  	} else  		nfs_commit_clear_lock(NFS_I(inode));  	return res; @@ -1429,11 +1728,15 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr  	int flags = FLUSH_SYNC;  	int ret = 0; +	/* no commits means nothing needs to be done */ +	if (!nfsi->commit_info.ncommit) +		return ret; +  	if (wbc->sync_mode == WB_SYNC_NONE) {  		/* Don't commit yet if this is a non-blocking flush and there  		 * are a lot of outstanding writes for this mapping.  		 
*/ -		if (nfsi->ncommit <= (nfsi->npages >> 1)) +		if (nfsi->commit_info.ncommit <= (nfsi->npages >> 1))  			goto out_mark_dirty;  		/* don't wait for the COMMIT response */ @@ -1465,6 +1768,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)  {  	return nfs_commit_unstable_pages(inode, wbc);  } +EXPORT_SYMBOL_GPL(nfs_write_inode);  /*   * flush the inode to disk. @@ -1477,36 +1781,44 @@ int nfs_wb_all(struct inode *inode)  		.range_start = 0,  		.range_end = LLONG_MAX,  	}; +	int ret; + +	trace_nfs_writeback_inode_enter(inode); + +	ret = sync_inode(inode, &wbc); -	return sync_inode(inode, &wbc); +	trace_nfs_writeback_inode_exit(inode, ret); +	return ret;  } +EXPORT_SYMBOL_GPL(nfs_wb_all);  int nfs_wb_page_cancel(struct inode *inode, struct page *page)  {  	struct nfs_page *req;  	int ret = 0; -	BUG_ON(!PageLocked(page)); -	for (;;) { -		wait_on_page_writeback(page); -		req = nfs_page_find_request(page); -		if (req == NULL) -			break; -		if (nfs_lock_request_dontget(req)) { -			nfs_inode_remove_request(req); -			/* -			 * In case nfs_inode_remove_request has marked the -			 * page as being dirty -			 */ -			cancel_dirty_page(page, PAGE_CACHE_SIZE); -			nfs_unlock_request(req); -			break; -		} -		ret = nfs_wait_on_request(req); -		nfs_release_request(req); -		if (ret < 0) -			break; +	wait_on_page_writeback(page); + +	/* blocking call to cancel all requests and join to a single (head) +	 * request */ +	req = nfs_lock_and_join_requests(page, false); + +	if (IS_ERR(req)) { +		ret = PTR_ERR(req); +	} else if (req) { +		/* all requests from this page have been cancelled by +		 * nfs_lock_and_join_requests, so just remove the head +		 * request from the inode / page_private pointer and +		 * release it */ +		nfs_inode_remove_request(req); +		/* +		 * In case nfs_inode_remove_request has marked the +		 * page as being dirty +		 */ +		cancel_dirty_page(page, PAGE_CACHE_SIZE); +		nfs_unlock_and_release_request(req);  	} +  	return ret;  } @@ -1515,7 
+1827,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)   */  int nfs_wb_page(struct inode *inode, struct page *page)  { -	loff_t range_start = page_offset(page); +	loff_t range_start = page_file_offset(page);  	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);  	struct writeback_control wbc = {  		.sync_mode = WB_SYNC_ALL, @@ -1525,6 +1837,8 @@ int nfs_wb_page(struct inode *inode, struct page *page)  	};  	int ret; +	trace_nfs_writeback_page_enter(inode); +  	for (;;) {  		wait_on_page_writeback(page);  		if (clear_page_dirty_for_io(page)) { @@ -1533,56 +1847,44 @@ int nfs_wb_page(struct inode *inode, struct page *page)  				goto out_error;  			continue;  		} +		ret = 0;  		if (!PagePrivate(page))  			break;  		ret = nfs_commit_inode(inode, FLUSH_SYNC);  		if (ret < 0)  			goto out_error;  	} -	return 0;  out_error: +	trace_nfs_writeback_page_exit(inode, ret);  	return ret;  }  #ifdef CONFIG_MIGRATION  int nfs_migrate_page(struct address_space *mapping, struct page *newpage, -		struct page *page) +		struct page *page, enum migrate_mode mode)  { -	struct nfs_page *req; -	int ret; - -	nfs_fscache_release_page(page, GFP_KERNEL); +	/* +	 * If PagePrivate is set, then the page is currently associated with +	 * an in-progress read or write request. Don't try to migrate it. +	 * +	 * FIXME: we could do this in principle, but we'll need a way to ensure +	 *        that we can safely release the inode reference while holding +	 *        the page lock. 
+	 */ +	if (PagePrivate(page)) +		return -EBUSY; -	req = nfs_find_and_lock_request(page, false); -	ret = PTR_ERR(req); -	if (IS_ERR(req)) -		goto out; +	if (!nfs_fscache_release_page(page, GFP_KERNEL)) +		return -EBUSY; -	ret = migrate_page(mapping, newpage, page); -	if (!req) -		goto out; -	if (ret) -		goto out_unlock; -	page_cache_get(newpage); -	spin_lock(&mapping->host->i_lock); -	req->wb_page = newpage; -	SetPagePrivate(newpage); -	set_page_private(newpage, (unsigned long)req); -	ClearPagePrivate(page); -	set_page_private(page, 0); -	spin_unlock(&mapping->host->i_lock); -	page_cache_release(page); -out_unlock: -	nfs_clear_page_tag_locked(req); -out: -	return ret; +	return migrate_page(mapping, newpage, page, mode);  }  #endif  int __init nfs_init_writepagecache(void)  {  	nfs_wdata_cachep = kmem_cache_create("nfs_write_data", -					     sizeof(struct nfs_write_data), +					     sizeof(struct nfs_rw_header),  					     0, SLAB_HWCACHE_ALIGN,  					     NULL);  	if (nfs_wdata_cachep == NULL) @@ -1591,12 +1893,19 @@ int __init nfs_init_writepagecache(void)  	nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,  						     nfs_wdata_cachep);  	if (nfs_wdata_mempool == NULL) -		return -ENOMEM; +		goto out_destroy_write_cache; + +	nfs_cdata_cachep = kmem_cache_create("nfs_commit_data", +					     sizeof(struct nfs_commit_data), +					     0, SLAB_HWCACHE_ALIGN, +					     NULL); +	if (nfs_cdata_cachep == NULL) +		goto out_destroy_write_mempool;  	nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, -						      nfs_wdata_cachep); +						      nfs_cdata_cachep);  	if (nfs_commit_mempool == NULL) -		return -ENOMEM; +		goto out_destroy_commit_cache;  	/*  	 * NFS congestion size, scale with available memory. 
@@ -1619,12 +1928,30 @@ int __init nfs_init_writepagecache(void)  		nfs_congestion_kb = 256*1024;  	return 0; + +out_destroy_commit_cache: +	kmem_cache_destroy(nfs_cdata_cachep); +out_destroy_write_mempool: +	mempool_destroy(nfs_wdata_mempool); +out_destroy_write_cache: +	kmem_cache_destroy(nfs_wdata_cachep); +	return -ENOMEM;  }  void nfs_destroy_writepagecache(void)  {  	mempool_destroy(nfs_commit_mempool); +	kmem_cache_destroy(nfs_cdata_cachep);  	mempool_destroy(nfs_wdata_mempool);  	kmem_cache_destroy(nfs_wdata_cachep);  } +static const struct nfs_rw_ops nfs_rw_write_ops = { +	.rw_mode		= FMODE_WRITE, +	.rw_alloc_header	= nfs_writehdr_alloc, +	.rw_free_header		= nfs_writehdr_free, +	.rw_release		= nfs_writeback_release_common, +	.rw_done		= nfs_writeback_done, +	.rw_result		= nfs_writeback_result, +	.rw_initiate		= nfs_initiate_write, +};  | 
