diff options
Diffstat (limited to 'fs/nfs')
79 files changed, 46512 insertions, 12796 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig new file mode 100644 index 00000000000..3dece03f2fc --- /dev/null +++ b/fs/nfs/Kconfig @@ -0,0 +1,197 @@ +config NFS_FS + tristate "NFS client support" + depends on INET && FILE_LOCKING + select LOCKD + select SUNRPC + select NFS_ACL_SUPPORT if NFS_V3_ACL + help + Choose Y here if you want to access files residing on other + computers using Sun's Network File System protocol. To compile + this file system support as a module, choose M here: the module + will be called nfs. + + To mount file systems exported by NFS servers, you also need to + install the user space mount.nfs command which can be found in + the Linux nfs-utils package, available from http://linux-nfs.org/. + Information about using the mount command is available in the + mount(8) man page. More detail about the Linux NFS client + implementation is available via the nfs(5) man page. + + Below you can choose which versions of the NFS protocol are + available in the kernel to mount NFS servers. Support for NFS + version 2 (RFC 1094) is always available when NFS_FS is selected. + + To configure a system which mounts its root file system via NFS + at boot time, say Y here, select "Kernel level IP + autoconfiguration" in the NETWORK menu, and select "Root file + system on NFS" below. You cannot compile this file system as a + module in this case. + + If unsure, say N. + +config NFS_V2 + tristate "NFS client support for NFS version 2" + depends on NFS_FS + default y + help + This option enables support for version 2 of the NFS protocol + (RFC 1094) in the kernel's NFS client. + + If unsure, say Y. + +config NFS_V3 + tristate "NFS client support for NFS version 3" + depends on NFS_FS + default y + help + This option enables support for version 3 of the NFS protocol + (RFC 1813) in the kernel's NFS client. + + If unsure, say Y. 
+ +config NFS_V3_ACL + bool "NFS client support for the NFSv3 ACL protocol extension" + depends on NFS_V3 + help + Some NFS servers support an auxiliary NFSv3 ACL protocol that + Sun added to Solaris but never became an official part of the + NFS version 3 protocol. This protocol extension allows + applications on NFS clients to manipulate POSIX Access Control + Lists on files residing on NFS servers. NFS servers enforce + ACLs on local files whether this protocol is available or not. + + Choose Y here if your NFS server supports the Solaris NFSv3 ACL + protocol extension and you want your NFS client to allow + applications to access and modify ACLs on files on the server. + + Most NFS servers don't support the Solaris NFSv3 ACL protocol + extension. You can choose N here or specify the "noacl" mount + option to prevent your NFS client from trying to use the NFSv3 + ACL protocol. + + If unsure, say N. + +config NFS_V4 + tristate "NFS client support for NFS version 4" + depends on NFS_FS + select SUNRPC_GSS + select KEYS + help + This option enables support for version 4 of the NFS protocol + (RFC 3530) in the kernel's NFS client. + + To mount NFS servers using NFSv4, you also need to install user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say Y. + +config NFS_SWAP + bool "Provide swap over NFS support" + default n + depends on NFS_FS + select SUNRPC_SWAP + help + This option enables swapon to work on files located on NFS mounts. + +config NFS_V4_1 + bool "NFS client support for NFSv4.1" + depends on NFS_V4 + select SUNRPC_BACKCHANNEL + help + This option enables support for minor version 1 of the NFSv4 protocol + (RFC 5661) in the kernel's NFS client. + + If unsure, say N. + +config NFS_V4_2 + bool "NFS client support for NFSv4.2" + depends on NFS_V4_1 + help + This option enables support for minor version 2 of the NFSv4 protocol + in the kernel's NFS client. + + If unsure, say N. 
+ +config PNFS_FILE_LAYOUT + tristate + depends on NFS_V4_1 + default NFS_V4 + +config PNFS_BLOCK + tristate + depends on NFS_V4_1 && BLK_DEV_DM + default NFS_V4 + +config PNFS_OBJLAYOUT + tristate + depends on NFS_V4_1 && SCSI_OSD_ULD + default NFS_V4 + +config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN + string "NFSv4.1 Implementation ID Domain" + depends on NFS_V4_1 + default "kernel.org" + help + This option defines the domain portion of the implementation ID that + may be sent in the NFS exchange_id operation. The value must be in + the format of a DNS domain name and should be set to the DNS domain + name of the distribution. + If the NFS client is unchanged from the upstream kernel, this + option should be set to the default "kernel.org". + +config NFS_V4_1_MIGRATION + bool "NFSv4.1 client support for migration" + depends on NFS_V4_1 + default n + help + This option makes the NFS client advertise to NFSv4.1 servers that + it can support NFSv4 migration. + + The NFSv4.1 pieces of the Linux NFSv4 migration implementation are + still experimental. If you are not an NFSv4 developer, say N here. + +config NFS_V4_SECURITY_LABEL + bool + depends on NFS_V4_2 && SECURITY + default y + +config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP + help + If you want your system to mount its root file system via NFS, + choose Y here. This is common practice for managing systems + without local permanent storage. For details, read + <file:Documentation/filesystems/nfs/nfsroot.txt>. + + Most people say N here. + +config NFS_FSCACHE + bool "Provide NFS client caching support" + depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + help + Say Y here if you want NFS data to be cached locally on disc through + the general filesystem cache manager + +config NFS_USE_LEGACY_DNS + bool "Use the legacy NFS DNS resolver" + depends on NFS_V4 + help + The kernel now provides a method for translating a host name into an + IP address. 
Select Y here if you would rather use your own DNS + resolver script. + + If unsure, say N + +config NFS_USE_KERNEL_DNS + bool + depends on NFS_V4 && !NFS_USE_LEGACY_DNS + select DNS_RESOLVER + default y + +config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 + default y diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index df0f41e0988..4782e0840dc 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -4,15 +4,31 @@ obj-$(CONFIG_NFS_FS) += nfs.o -nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ - pagelist.o proc.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o +CFLAGS_nfstrace.o += -I$(src) +nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ + direct.o pagelist.o read.o symlink.o unlink.o \ + write.o namespace.o mount_clnt.o nfstrace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o -nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o -nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o -nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ - delegation.o idmap.o \ - callback.o callback_xdr.o callback_proc.o \ - nfs4namespace.o -nfs-$(CONFIG_NFS_DIRECTIO) += direct.o -nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_SYSCTL) += sysctl.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_NFS_V2) += nfsv2.o +nfsv2-y := nfs2super.o proc.o nfs2xdr.o + +obj-$(CONFIG_NFS_V3) += nfsv3.o +nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o +nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o + +obj-$(CONFIG_NFS_V4) += nfsv4.o +CFLAGS_nfs4trace.o += -I$(src) +nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ + delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ + dns_resolve.o nfs4trace.o +nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o +nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o +nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ +obj-$(CONFIG_PNFS_OBJLAYOUT) += 
objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile new file mode 100644 index 00000000000..d5815505c02 --- /dev/null +++ b/fs/nfs/blocklayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS block layout driver kernel module +# +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o +blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c new file mode 100644 index 00000000000..9b431f44fad --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.c @@ -0,0 +1,1458 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. 
the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/bio.h> /* struct bio */ +#include <linux/buffer_head.h> /* various write calls */ +#include <linux/prefetch.h> +#include <linux/pagevec.h> + +#include "../pnfs.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); +MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); + +static void print_page(struct page *page) +{ + dprintk("PRINTPAGE page %p\n", page); + dprintk(" PagePrivate %d\n", PagePrivate(page)); + dprintk(" PageUptodate %d\n", PageUptodate(page)); + dprintk(" PageError %d\n", PageError(page)); + dprintk(" PageDirty %d\n", PageDirty(page)); + dprintk(" PageReferenced %d\n", PageReferenced(page)); + dprintk(" PageLocked %d\n", PageLocked(page)); + dprintk(" PageWriteback %d\n", PageWriteback(page)); + dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); + dprintk("\n"); +} + +/* Given the be associated with isect, determine if page data needs to be + * initialized. + */ +static int is_hole(struct pnfs_block_extent *be, sector_t isect) +{ + if (be->be_state == PNFS_BLOCK_NONE_DATA) + return 1; + else if (be->be_state != PNFS_BLOCK_INVALID_DATA) + return 0; + else + return !bl_is_sector_init(be->be_inval, isect); +} + +/* Given the be associated with isect, determine if page data can be + * written to disk. 
+ */ +static int is_writable(struct pnfs_block_extent *be, sector_t isect) +{ + return (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA); +} + +/* The data we are handed might be spread across several bios. We need + * to track when the last one is finished. + */ +struct parallel_io { + struct kref refcnt; + void (*pnfs_callback) (void *data, int num_se); + void *data; + int bse_count; +}; + +static inline struct parallel_io *alloc_parallel(void *data) +{ + struct parallel_io *rv; + + rv = kmalloc(sizeof(*rv), GFP_NOFS); + if (rv) { + rv->data = data; + kref_init(&rv->refcnt); + rv->bse_count = 0; + } + return rv; +} + +static inline void get_parallel(struct parallel_io *p) +{ + kref_get(&p->refcnt); +} + +static void destroy_parallel(struct kref *kref) +{ + struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); + + dprintk("%s enter\n", __func__); + p->pnfs_callback(p->data, p->bse_count); + kfree(p); +} + +static inline void put_parallel(struct parallel_io *p) +{ + kref_put(&p->refcnt, destroy_parallel); +} + +static struct bio * +bl_submit_bio(int rw, struct bio *bio) +{ + if (bio) { + get_parallel(bio->bi_private); + dprintk("%s submitting %s bio %u@%llu\n", __func__, + rw == READ ? 
"read" : "write", bio->bi_iter.bi_size, + (unsigned long long)bio->bi_iter.bi_sector); + submit_bio(rw, bio); + } + return NULL; +} + +static struct bio *bl_alloc_init_bio(int npg, sector_t isect, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ + struct bio *bio; + + npg = min(npg, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, npg); + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (npg /= 2)) + bio = bio_alloc(GFP_NOIO, npg); + } + + if (bio) { + bio->bi_iter.bi_sector = isect - be->be_f_offset + + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = end_io; + bio->bi_private = par; + } + return bio; +} + +static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, + sector_t isect, struct page *page, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par, + unsigned int offset, int len) +{ + isect = isect + (offset >> SECTOR_SHIFT); + dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, + npg, rw, (unsigned long long)isect, offset, len); +retry: + if (!bio) { + bio = bl_alloc_init_bio(npg, isect, be, end_io, par); + if (!bio) + return ERR_PTR(-ENOMEM); + } + if (bio_add_page(bio, page, len, offset) < len) { + bio = bl_submit_bio(rw, bio); + goto retry; + } + return bio; +} + +static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, + sector_t isect, struct page *page, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ + return do_add_page_to_bio(bio, npg, rw, isect, page, be, + end_io, par, 0, PAGE_CACHE_SIZE); +} + +/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_read(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + struct bio_vec *bvec; + int i; + + if (!err) + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); + + if (err) { + struct nfs_pgio_data *rdata = par->data; + 
struct nfs_pgio_header *header = rdata->header; + + if (!header->pnfs_error) + header->pnfs_error = -EIO; + pnfs_set_lo_fail(header->lseg); + } + bio_put(bio); + put_parallel(par); +} + +static void bl_read_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_pgio_data *rdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_pgio_data, task); + pnfs_ld_read_done(rdata); +} + +static void +bl_end_par_io_read(void *data, int unused) +{ + struct nfs_pgio_data *rdata = data; + + rdata->task.tk_status = rdata->header->pnfs_error; + INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); + schedule_work(&rdata->task.u.tk_work); +} + +static enum pnfs_try_status +bl_read_pagelist(struct nfs_pgio_data *rdata) +{ + struct nfs_pgio_header *header = rdata->header; + int i, hole; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, extent_length = 0; + struct parallel_io *par; + loff_t f_offset = rdata->args.offset; + size_t bytes_left = rdata->args.count; + unsigned int pg_offset, pg_len; + struct page **pages = rdata->args.pages; + int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; + const bool is_dio = (header->dreq != NULL); + + dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, + rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); + + par = alloc_parallel(rdata); + if (!par) + goto use_mds; + par->pnfs_callback = bl_end_par_io_read; + /* At this point, we can no longer jump to use_mds */ + + isect = (sector_t) (f_offset >> SECTOR_SHIFT); + /* Code assumes extents are page-aligned */ + for (i = pg_index; i < rdata->pages.npages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); + bl_put_extent(cow_read); + bio = bl_submit_bio(READ, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), + isect, &cow_read); + if (!be) { + 
header->pnfs_error = -EIO; + goto out; + } + extent_length = be->be_length - + (isect - be->be_f_offset); + if (cow_read) { + sector_t cow_length = cow_read->be_length - + (isect - cow_read->be_f_offset); + extent_length = min(extent_length, cow_length); + } + } + + if (is_dio) { + pg_offset = f_offset & ~PAGE_CACHE_MASK; + if (pg_offset + bytes_left > PAGE_CACHE_SIZE) + pg_len = PAGE_CACHE_SIZE - pg_offset; + else + pg_len = bytes_left; + + f_offset += pg_len; + bytes_left -= pg_len; + isect += (pg_offset >> SECTOR_SHIFT); + } else { + pg_offset = 0; + pg_len = PAGE_CACHE_SIZE; + } + + hole = is_hole(be, isect); + if (hole && !cow_read) { + bio = bl_submit_bio(READ, bio); + /* Fill hole w/ zeroes w/o accessing device */ + dprintk("%s Zeroing page for hole\n", __func__); + zero_user_segment(pages[i], pg_offset, pg_len); + print_page(pages[i]); + SetPageUptodate(pages[i]); + } else { + struct pnfs_block_extent *be_read; + + be_read = (hole && cow_read) ? cow_read : be; + bio = do_add_page_to_bio(bio, rdata->pages.npages - i, + READ, + isect, pages[i], be_read, + bl_end_io_read, par, + pg_offset, pg_len); + if (IS_ERR(bio)) { + header->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + } + isect += (pg_len >> SECTOR_SHIFT); + extent_length -= PAGE_CACHE_SECTORS; + } + if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { + rdata->res.eof = 1; + rdata->res.count = header->inode->i_size - rdata->args.offset; + } else { + rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset; + } +out: + bl_put_extent(be); + bl_put_extent(cow_read); + bl_submit_bio(READ, bio); + put_parallel(par); + return PNFS_ATTEMPTED; + + use_mds: + dprintk("Giving up and using normal NFS\n"); + return PNFS_NOT_ATTEMPTED; +} + +static void mark_extents_written(struct pnfs_block_layout *bl, + __u64 offset, __u32 count) +{ + sector_t isect, end; + struct pnfs_block_extent *be; + struct pnfs_block_short_extent *se; + + dprintk("%s(%llu, %u)\n", __func__, offset, count); + if (count 
== 0) + return; + isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; + end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); + end >>= SECTOR_SHIFT; + while (isect < end) { + sector_t len; + be = bl_find_get_extent(bl, isect, NULL); + BUG_ON(!be); /* FIXME */ + len = min(end, be->be_f_offset + be->be_length) - isect; + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + se = bl_pop_one_short_extent(be->be_inval); + BUG_ON(!se); + bl_mark_for_commit(be, isect, len, se); + } + isect += len; + bl_put_extent(be); + } +} + +static void bl_end_io_write_zero(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + /* This is the zeroing page we added */ + end_page_writeback(bvec->bv_page); + page_cache_release(bvec->bv_page); + } + + if (unlikely(err)) { + struct nfs_pgio_data *data = par->data; + struct nfs_pgio_header *header = data->header; + + if (!header->pnfs_error) + header->pnfs_error = -EIO; + pnfs_set_lo_fail(header->lseg); + } + bio_put(bio); + put_parallel(par); +} + +static void bl_end_io_write(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nfs_pgio_data *data = par->data; + struct nfs_pgio_header *header = data->header; + + if (!uptodate) { + if (!header->pnfs_error) + header->pnfs_error = -EIO; + pnfs_set_lo_fail(header->lseg); + } + bio_put(bio); + put_parallel(par); +} + +/* Function scheduled for call during bl_end_par_io_write, + * it marks sectors as written and extends the commitlist. 
+ */ +static void bl_write_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_pgio_data *wdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_pgio_data, task); + if (likely(!wdata->header->pnfs_error)) { + /* Marks for LAYOUTCOMMIT */ + mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), + wdata->args.offset, wdata->args.count); + } + pnfs_ld_write_done(wdata); +} + +/* Called when last of bios associated with a bl_write_pagelist call finishes */ +static void bl_end_par_io_write(void *data, int num_se) +{ + struct nfs_pgio_data *wdata = data; + + if (unlikely(wdata->header->pnfs_error)) { + bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, + num_se); + } + + wdata->task.tk_status = wdata->header->pnfs_error; + wdata->verf.committed = NFS_FILE_SYNC; + INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); + schedule_work(&wdata->task.u.tk_work); +} + +/* FIXME STUB - mark intersection of layout and page as bad, so is not + * used again. 
+ */ +static void mark_bad_read(void) +{ + return; +} + +/* + * map_block: map a requested I/0 block (isect) into an offset in the LVM + * block_device + */ +static void +map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) +{ + dprintk("%s enter be=%p\n", __func__, be); + + set_buffer_mapped(bh); + bh->b_bdev = be->be_mdev; + bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> + (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); + + dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", + __func__, (unsigned long long)isect, (long)bh->b_blocknr, + bh->b_size); + return; +} + +static void +bl_read_single_end_io(struct bio *bio, int error) +{ + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + + /* Only one page in bvec */ + unlock_page(page); +} + +static int +bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, + unsigned int offset, unsigned int len) +{ + struct bio *bio; + struct page *shadow_page; + sector_t isect; + char *kaddr, *kshadow_addr; + int ret = 0; + + dprintk("%s: offset %u len %u\n", __func__, offset, len); + + shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (shadow_page == NULL) + return -ENOMEM; + + bio = bio_alloc(GFP_NOIO, 1); + if (bio == NULL) + return -ENOMEM; + + isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + + (offset / SECTOR_SIZE); + + bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = bl_read_single_end_io; + + lock_page(shadow_page); + if (bio_add_page(bio, shadow_page, + SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) { + unlock_page(shadow_page); + bio_put(bio); + return -EIO; + } + + submit_bio(READ, bio); + wait_on_page_locked(shadow_page); + if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) { + ret = -EIO; + } else { + kaddr = kmap_atomic(page); + kshadow_addr = kmap_atomic(shadow_page); + memcpy(kaddr + offset, kshadow_addr + offset, 
len); + kunmap_atomic(kshadow_addr); + kunmap_atomic(kaddr); + } + __free_page(shadow_page); + bio_put(bio); + + return ret; +} + +static int +bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be, + unsigned int dirty_offset, unsigned int dirty_len, + bool full_page) +{ + int ret = 0; + unsigned int start, end; + + if (full_page) { + start = 0; + end = PAGE_CACHE_SIZE; + } else { + start = round_down(dirty_offset, SECTOR_SIZE); + end = round_up(dirty_offset + dirty_len, SECTOR_SIZE); + } + + dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len); + if (!be) { + zero_user_segments(page, start, dirty_offset, + dirty_offset + dirty_len, end); + if (start == 0 && end == PAGE_CACHE_SIZE && + trylock_page(page)) { + SetPageUptodate(page); + unlock_page(page); + } + return ret; + } + + if (start != dirty_offset) + ret = bl_do_readpage_sync(page, be, start, dirty_offset - start); + + if (!ret && (dirty_offset + dirty_len < end)) + ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len, + end - dirty_offset - dirty_len); + + return ret; +} + +/* Given an unmapped page, zero it or read in page for COW, page is locked + * by caller. + */ +static int +init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) +{ + struct buffer_head *bh = NULL; + int ret = 0; + sector_t isect; + + dprintk("%s enter, %p\n", __func__, page); + BUG_ON(PageUptodate(page)); + if (!cow_read) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + goto cleanup; + } + + bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); + if (!bh) { + ret = -ENOMEM; + goto cleanup; + } + + isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; + map_block(bh, isect, cow_read); + if (!bh_uptodate_or_lock(bh)) + ret = bh_submit_read(bh); + if (ret) + goto cleanup; + SetPageUptodate(page); + +cleanup: + if (bh) + free_buffer_head(bh); + if (ret) { + /* Need to mark layout with bad read...should now + * just use nfs4 for reads and writes. 
+ */ + mark_bad_read(); + } + return ret; +} + +/* Find or create a zeroing page marked being writeback. + * Return ERR_PTR on error, NULL to indicate skip this page and page itself + * to indicate write out. + */ +static struct page * +bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, + struct pnfs_block_extent *cow_read) +{ + struct page *page; + int locked = 0; + page = find_get_page(inode->i_mapping, index); + if (page) + goto check_page; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s oom\n", __func__); + return ERR_PTR(-ENOMEM); + } + locked = 1; + +check_page: + /* PageDirty: Other will write this out + * PageWriteback: Other is writing this out + * PageUptodate: It was read before + */ + if (PageDirty(page) || PageWriteback(page)) { + print_page(page); + if (locked) + unlock_page(page); + page_cache_release(page); + return NULL; + } + + if (!locked) { + lock_page(page); + locked = 1; + goto check_page; + } + if (!PageUptodate(page)) { + /* New page, readin or zero it */ + init_page_for_write(page, cow_read); + } + set_page_writeback(page); + unlock_page(page); + + return page; +} + +static enum pnfs_try_status +bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) +{ + struct nfs_pgio_header *header = wdata->header; + int i, ret, npg_zero, pg_index, last = 0; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, last_isect = 0, extent_length = 0; + struct parallel_io *par = NULL; + loff_t offset = wdata->args.offset; + size_t count = wdata->args.count; + unsigned int pg_offset, pg_len, saved_len; + struct page **pages = wdata->args.pages; + struct page *page; + pgoff_t index; + u64 temp; + int npg_per_block = + NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + + dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + + if (header->dreq != NULL && + (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) || + 
!IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) { + dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n"); + goto out_mds; + } + /* At this point, wdata->pages is a (sequential) list of nfs_pages. + * We want to write each, and if there is an error set pnfs_error + * to have it redone using nfs. + */ + par = alloc_parallel(wdata); + if (!par) + goto out_mds; + par->pnfs_callback = bl_end_par_io_write; + /* At this point, have to be more careful with error handling */ + + isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); + be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read); + if (!be || !is_writable(be, isect)) { + dprintk("%s no matching extents!\n", __func__); + goto out_mds; + } + + /* First page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else + goto out_mds; + temp = offset >> PAGE_CACHE_SHIFT; + npg_zero = do_div(temp, npg_per_block); + isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & + (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); + extent_length = be->be_length - (isect - be->be_f_offset); + +fill_invalid_ext: + dprintk("%s need to zero %d pages\n", __func__, npg_zero); + for (;npg_zero > 0; npg_zero--) { + if (bl_is_sector_init(be->be_inval, isect)) { + dprintk("isect %llu already init\n", + (unsigned long long)isect); + goto next_page; + } + /* page ref released in bl_end_io_write_zero */ + index = isect >> PAGE_CACHE_SECTOR_SHIFT; + dprintk("%s zero %dth page: index %lu isect %llu\n", + __func__, npg_zero, index, + (unsigned long long)isect); + page = bl_find_get_zeroing_page(header->inode, index, + cow_read); + if (unlikely(IS_ERR(page))) { + header->pnfs_error = PTR_ERR(page); + goto out; + } else if (page == NULL) + goto next_page; + + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS); + if (unlikely(ret)) { + dprintk("%s bl_mark_sectors_init fail %d\n", + 
__func__, ret); + end_page_writeback(page); + page_cache_release(page); + header->pnfs_error = ret; + goto out; + } + if (likely(!bl_push_one_short_extent(be->be_inval))) + par->bse_count++; + else { + end_page_writeback(page); + page_cache_release(page); + header->pnfs_error = -ENOMEM; + goto out; + } + /* FIXME: This should be done in bi_end_io */ + mark_extents_written(BLK_LSEG2EXT(header->lseg), + page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE); + + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, + isect, page, be, + bl_end_io_write_zero, par); + if (IS_ERR(bio)) { + header->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } +next_page: + isect += PAGE_CACHE_SECTORS; + extent_length -= PAGE_CACHE_SECTORS; + } + if (last) + goto write_done; + } + bio = bl_submit_bio(WRITE, bio); + + /* Middle pages */ + pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; + for (i = pg_index; i < wdata->pages.npages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); + bl_put_extent(cow_read); + bio = bl_submit_bio(WRITE, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), + isect, &cow_read); + if (!be || !is_writable(be, isect)) { + header->pnfs_error = -EINVAL; + goto out; + } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (likely(!bl_push_one_short_extent( + be->be_inval))) + par->bse_count++; + else { + header->pnfs_error = -ENOMEM; + goto out; + } + } + extent_length = be->be_length - + (isect - be->be_f_offset); + } + + dprintk("%s offset %lld count %Zu\n", __func__, offset, count); + pg_offset = offset & ~PAGE_CACHE_MASK; + if (pg_offset + count > PAGE_CACHE_SIZE) + pg_len = PAGE_CACHE_SIZE - pg_offset; + else + pg_len = count; + + saved_len = pg_len; + if (be->be_state == PNFS_BLOCK_INVALID_DATA && + !bl_is_sector_init(be->be_inval, isect)) { + ret = bl_read_partial_page_sync(pages[i], cow_read, + pg_offset, pg_len, true); + if (ret) { + dprintk("%s bl_read_partial_page_sync fail 
%d\n", + __func__, ret); + header->pnfs_error = ret; + goto out; + } + + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS); + if (unlikely(ret)) { + dprintk("%s bl_mark_sectors_init fail %d\n", + __func__, ret); + header->pnfs_error = ret; + goto out; + } + + /* Expand to full page write */ + pg_offset = 0; + pg_len = PAGE_CACHE_SIZE; + } else if ((pg_offset & (SECTOR_SIZE - 1)) || + (pg_len & (SECTOR_SIZE - 1))){ + /* ahh, nasty case. We have to do sync full sector + * read-modify-write cycles. + */ + unsigned int saved_offset = pg_offset; + ret = bl_read_partial_page_sync(pages[i], be, pg_offset, + pg_len, false); + pg_offset = round_down(pg_offset, SECTOR_SIZE); + pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE) + - pg_offset; + } + + + bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, + isect, pages[i], be, + bl_end_io_write, par, + pg_offset, pg_len); + if (IS_ERR(bio)) { + header->pnfs_error = PTR_ERR(bio); + bio = NULL; + goto out; + } + offset += saved_len; + count -= saved_len; + isect += PAGE_CACHE_SECTORS; + last_isect = isect; + extent_length -= PAGE_CACHE_SECTORS; + } + + /* Last page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + bio = bl_submit_bio(WRITE, bio); + temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; + npg_zero = npg_per_block - do_div(temp, npg_per_block); + if (npg_zero < npg_per_block) { + last = 1; + goto fill_invalid_ext; + } + } + +write_done: + wdata->res.count = wdata->args.count; +out: + bl_put_extent(be); + bl_put_extent(cow_read); + bl_submit_bio(WRITE, bio); + put_parallel(par); + return PNFS_ATTEMPTED; +out_mds: + bl_put_extent(be); + bl_put_extent(cow_read); + kfree(par); + return PNFS_NOT_ATTEMPTED; +} + +/* FIXME - range ignored */ +static void +release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) +{ + int i; + struct pnfs_block_extent *be; + + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + while 
(!list_empty(&bl->bl_extents[i])) { + be = list_first_entry(&bl->bl_extents[i], + struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + bl_put_extent(be); + } + } + spin_unlock(&bl->bl_ext_lock); +} + +static void +release_inval_marks(struct pnfs_inval_markings *marks) +{ + struct pnfs_inval_tracking *pos, *temp; + struct pnfs_block_short_extent *se, *stemp; + + list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { + list_del(&pos->it_link); + kfree(pos); + } + + list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + } + return; +} + +static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + + dprintk("%s enter\n", __func__); + release_extents(bl, NULL); + release_inval_marks(&bl->bl_inval); + kfree(bl); +} + +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, + gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl; + + dprintk("%s enter\n", __func__); + bl = kzalloc(sizeof(*bl), gfp_flags); + if (!bl) + return NULL; + spin_lock_init(&bl->bl_ext_lock); + INIT_LIST_HEAD(&bl->bl_extents[0]); + INIT_LIST_HEAD(&bl->bl_extents[1]); + INIT_LIST_HEAD(&bl->bl_commit); + INIT_LIST_HEAD(&bl->bl_committing); + bl->bl_count = 0; + bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; + BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); + return &bl->bl_layout; +} + +static void bl_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s enter\n", __func__); + kfree(lseg); +} + +/* We pretty much ignore lseg, and store all data layout wide, so we + * can correctly merge. 
+ */ +static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct pnfs_layout_segment *lseg; + int status; + + dprintk("%s enter\n", __func__); + lseg = kzalloc(sizeof(*lseg), gfp_flags); + if (!lseg) + return ERR_PTR(-ENOMEM); + status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); + if (status) { + /* We don't want to call the full-blown bl_free_lseg, + * since on error extents were not touched. + */ + kfree(lseg); + return ERR_PTR(status); + } + return lseg; +} + +static void +bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + dprintk("%s enter\n", __func__); + encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); +} + +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +{ + struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + + dprintk("%s enter\n", __func__); + clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); +} + +static void free_blk_mountid(struct block_mount_id *mid) +{ + if (mid) { + struct pnfs_block_dev *dev, *tmp; + + /* No need to take bm_lock as we are last user freeing bm_devlist */ + list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { + list_del(&dev->bm_node); + bl_free_block_dev(dev); + } + kfree(mid); + } +} + +/* This is mostly copied from the filelayout_get_device_info function. + * It seems much of this should be at the generic pnfs level. 
+ */ +static struct pnfs_block_dev * +nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, + struct nfs4_deviceid *d_id) +{ + struct pnfs_device *dev; + struct pnfs_block_dev *rv; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + int i, rc; + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = nfs_page_array_len(0, max_resp_sz); + dprintk("%s max_resp_sz %u max_pages %d\n", + __func__, max_resp_sz, max_pages); + + dev = kmalloc(sizeof(*dev), GFP_NOFS); + if (!dev) { + dprintk("%s kmalloc failed\n", __func__); + return ERR_PTR(-ENOMEM); + } + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (!pages[i]) { + rv = ERR_PTR(-ENOMEM); + goto out_free; + } + } + + memcpy(&dev->dev_id, d_id, sizeof(*d_id)); + dev->layout_type = LAYOUT_BLOCK_VOLUME; + dev->pages = pages; + dev->pgbase = 0; + dev->pglen = PAGE_SIZE * max_pages; + dev->mincount = 0; + dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; + + dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); + rc = nfs4_proc_getdeviceinfo(server, dev, NULL); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) { + rv = ERR_PTR(rc); + goto out_free; + } + + rv = nfs4_blk_decode_device(server, dev); + out_free: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(dev); + return rv; +} + +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ + struct block_mount_id *b_mt_id = NULL; + struct pnfs_devicelist *dlist = NULL; + struct pnfs_block_dev *bdev; + LIST_HEAD(block_disklist); + int status, i; + + dprintk("%s enter\n", __func__); + + if (server->pnfs_blksize == 0) { + dprintk("%s Server did not return 
blksize\n", __func__); + return -EINVAL; + } + b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); + if (!b_mt_id) { + status = -ENOMEM; + goto out_error; + } + /* Initialize nfs4 block layout mount id */ + spin_lock_init(&b_mt_id->bm_lock); + INIT_LIST_HEAD(&b_mt_id->bm_devlist); + + dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); + if (!dlist) { + status = -ENOMEM; + goto out_error; + } + dlist->eof = 0; + while (!dlist->eof) { + status = nfs4_proc_getdevicelist(server, fh, dlist); + if (status) + goto out_error; + dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", + __func__, dlist->num_devs, dlist->eof); + for (i = 0; i < dlist->num_devs; i++) { + bdev = nfs4_blk_get_deviceinfo(server, fh, + &dlist->dev_id[i]); + if (IS_ERR(bdev)) { + status = PTR_ERR(bdev); + goto out_error; + } + spin_lock(&b_mt_id->bm_lock); + list_add(&bdev->bm_node, &b_mt_id->bm_devlist); + spin_unlock(&b_mt_id->bm_lock); + } + } + dprintk("%s SUCCESS\n", __func__); + server->pnfs_ld_data = b_mt_id; + + out_return: + kfree(dlist); + return status; + + out_error: + free_blk_mountid(b_mt_id); + goto out_return; +} + +static int +bl_clear_layoutdriver(struct nfs_server *server) +{ + struct block_mount_id *b_mt_id = server->pnfs_ld_data; + + dprintk("%s enter\n", __func__); + free_blk_mountid(b_mt_id); + dprintk("%s RETURNS\n", __func__); + return 0; +} + +static bool +is_aligned_req(struct nfs_page *req, unsigned int alignment) +{ + return IS_ALIGNED(req->wb_offset, alignment) && + IS_ALIGNED(req->wb_bytes, alignment); +} + +static void +bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, SECTOR_SIZE)) + nfs_pageio_reset_read_mds(pgio); + else + pnfs_generic_pg_init_read(pgio, req); +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
+ */ +static size_t +bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, SECTOR_SIZE)) + return 0; + + return pnfs_generic_pg_test(pgio, prev, req); +} + +/* + * Return the number of contiguous bytes for a given inode + * starting at page frame idx. + */ +static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t end; + + /* Optimize common case that writes from 0 to end of file */ + end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + if (end != NFS_I(inode)->npages) { + rcu_read_lock(); + end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); + rcu_read_unlock(); + } + + if (!end) + return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT); + else + return (end - idx) << PAGE_CACHE_SHIFT; +} + +static void +bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, PAGE_CACHE_SIZE)) { + nfs_pageio_reset_write_mds(pgio); + } else { + u64 wb_size; + if (pgio->pg_dreq == NULL) + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, + req->wb_index); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); + } +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
+ */ +static size_t +bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, PAGE_CACHE_SIZE)) + return 0; + + return pnfs_generic_pg_test(pgio, prev, req); +} + +static const struct nfs_pageio_ops bl_pg_read_ops = { + .pg_init = bl_pg_init_read, + .pg_test = bl_pg_test_read, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops bl_pg_write_ops = { + .pg_init = bl_pg_init_write, + .pg_test = bl_pg_test_write, + .pg_doio = pnfs_generic_pg_writepages, +}; + +static struct pnfs_layoutdriver_type blocklayout_type = { + .id = LAYOUT_BLOCK_VOLUME, + .name = "LAYOUT_BLOCK_VOLUME", + .owner = THIS_MODULE, + .read_pagelist = bl_read_pagelist, + .write_pagelist = bl_write_pagelist, + .alloc_layout_hdr = bl_alloc_layout_hdr, + .free_layout_hdr = bl_free_layout_hdr, + .alloc_lseg = bl_alloc_lseg, + .free_lseg = bl_free_lseg, + .encode_layoutcommit = bl_encode_layoutcommit, + .cleanup_layoutcommit = bl_cleanup_layoutcommit, + .set_layoutdriver = bl_set_layoutdriver, + .clear_layoutdriver = bl_clear_layoutdriver, + .pg_read_ops = &bl_pg_read_ops, + .pg_write_ops = &bl_pg_write_ops, +}; + +static const struct rpc_pipe_ops bl_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + dput(dir); + return dentry; +} + +static void nfs4blocklayout_unregister_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net 
*net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (nn->bl_device_pipe == NULL) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + nn->bl_device_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (nn->bl_device_pipe->dentry) + nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs4blocklayout_block = { + .notifier_call = rpc_pipefs_event, +}; + +static struct dentry *nfs4blocklayout_register_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + struct dentry *dentry; + + pipefs_sb = rpc_get_sb_net(net); + if (!pipefs_sb) + return NULL; + dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void nfs4blocklayout_unregister_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + } +} + +static int nfs4blocklayout_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + + init_waitqueue_head(&nn->bl_wq); + nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); + if (IS_ERR(nn->bl_device_pipe)) + return PTR_ERR(nn->bl_device_pipe); + dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + rpc_destroy_pipe_data(nn->bl_device_pipe); + return PTR_ERR(dentry); + } + nn->bl_device_pipe->dentry = dentry; + return 0; +} + +static void nfs4blocklayout_net_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, 
nfs_net_id); + + nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); + rpc_destroy_pipe_data(nn->bl_device_pipe); + nn->bl_device_pipe = NULL; +} + +static struct pernet_operations nfs4blocklayout_net_ops = { + .init = nfs4blocklayout_net_init, + .exit = nfs4blocklayout_net_exit, +}; + +static int __init nfs4blocklayout_init(void) +{ + int ret; + + dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); + + ret = pnfs_register_layoutdriver(&blocklayout_type); + if (ret) + goto out; + + ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + if (ret) + goto out_remove; + ret = register_pernet_subsys(&nfs4blocklayout_net_ops); + if (ret) + goto out_notifier; +out: + return ret; + +out_notifier: + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); +out_remove: + pnfs_unregister_layoutdriver(&blocklayout_type); + return ret; +} + +static void __exit nfs4blocklayout_exit(void) +{ + dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", + __func__); + + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); + unregister_pernet_subsys(&nfs4blocklayout_net_ops); + pnfs_unregister_layoutdriver(&blocklayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-3"); + +module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h new file mode 100644 index 00000000000..9838fb02047 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.h @@ -0,0 +1,211 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */ +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H +#define FS_NFS_NFS4BLOCKLAYOUT_H + +#include <linux/device-mapper.h> +#include <linux/nfs_fs.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "../nfs4_fs.h" +#include "../pnfs.h" +#include "../netns.h" + +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) +#define SECTOR_SIZE (1 << SECTOR_SHIFT) + +struct block_mount_id { + spinlock_t bm_lock; /* protects list */ + struct list_head bm_devlist; /* holds pnfs_block_dev */ +}; + +struct pnfs_block_dev { + struct list_head bm_node; + struct nfs4_deviceid bm_mdevid; /* associated devid */ + struct block_device *bm_mdev; /* meta device itself */ + struct net *net; +}; + +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +}; + +#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ + +struct my_tree { + sector_t mtt_step_size; /* Internal sector alignment */ + struct list_head mtt_stub; /* Should be a radix tree */ +}; + +struct pnfs_inval_markings { + spinlock_t im_lock; + struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ + sector_t im_block_size; /* Server blocksize in sectors */ + struct list_head im_extents; /* Short extents for INVAL->RW conversion */ +}; + +struct pnfs_inval_tracking { + struct list_head it_link; + int it_sector; + int it_tags; +}; + +/* sector_t fields are all in 512-byte sectors */ +struct pnfs_block_extent { + struct kref be_refcnt; + struct list_head be_node; /* link into lseg list */ + struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ + struct block_device *be_mdev; + sector_t be_f_offset; /* the starting offset in the file */ + sector_t be_length; /* the size of the extent */ + sector_t be_v_offset; /* the starting offset in the volume */ + enum exstate4 be_state; /* the state 
of this extent */ + struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +}; + +/* Shortened extent used by LAYOUTCOMMIT */ +struct pnfs_block_short_extent { + struct list_head bse_node; + struct nfs4_deviceid bse_devid; + struct block_device *bse_mdev; + sector_t bse_f_offset; /* the starting offset in the file */ + sector_t bse_length; /* the size of the extent */ +}; + +static inline void +BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) +{ + spin_lock_init(&marks->im_lock); + INIT_LIST_HEAD(&marks->im_tree.mtt_stub); + INIT_LIST_HEAD(&marks->im_extents); + marks->im_block_size = blocksize; + marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, + blocksize); +} + +enum extentclass4 { + RW_EXTENT = 0, /* READWRTE and INVAL */ + RO_EXTENT = 1, /* READ and NONE */ + EXTENT_LISTS = 2, +}; + +static inline int bl_choose_list(enum exstate4 state) +{ + if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) + return RO_EXTENT; + else + return RW_EXTENT; +} + +struct pnfs_block_layout { + struct pnfs_layout_hdr bl_layout; + struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ + spinlock_t bl_ext_lock; /* Protects list manipulation */ + struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ + struct list_head bl_commit; /* Needs layout commit */ + struct list_head bl_committing; /* Layout committing */ + unsigned int bl_count; /* entries in bl_commit */ + sector_t bl_blocksize; /* Server blocksize in sectors */ +}; + +#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) + +static inline struct pnfs_block_layout * +BLK_LO2EXT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct pnfs_block_layout, bl_layout); +} + +static inline struct pnfs_block_layout * +BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) +{ + return BLK_LO2EXT(lseg->pls_layout); +} + +struct bl_pipe_msg { + struct rpc_pipe_msg msg; + wait_queue_head_t *bl_wq; +}; + 
+struct bl_msg_hdr { + u8 type; + u16 totallen; /* length of entire message, including hdr itself */ +}; + +#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ +#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ +#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ +#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ +#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ + +/* blocklayoutdev.c */ +ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); +void bl_pipe_destroy_msg(struct rpc_pipe_msg *); +void nfs4_blkdev_put(struct block_device *bdev); +struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, + struct pnfs_device *dev); +int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + +/* blocklayoutdm.c */ +void bl_free_block_dev(struct pnfs_block_dev *bdev); + +/* extents.c */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read); +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length); +void bl_put_extent(struct pnfs_block_extent *be); +struct pnfs_block_extent *bl_alloc_extent(void); +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); +int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg); +void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status); +int bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new); +int bl_push_one_short_extent(struct pnfs_inval_markings *marks); +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks); +void 
bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 00000000000..04303b5c936 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -0,0 +1,384 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c + * + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */ +#include <linux/module.h> +#include <linux/buffer_head.h> /* __bread */ + +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static int decode_sector_number(__be32 **rp, sector_t *sp) +{ + uint64_t s; + + *rp = xdr_decode_hyper(*rp, &s); + if (s & 0x1ff) { + printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); + return -1; + } + *sp = s >> SECTOR_SHIFT; + return 0; +} + +/* + * Release the block device + */ +void nfs4_blkdev_put(struct block_device *bdev) +{ + dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + blkdev_put(bdev, FMODE_READ); +} + +ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, + size_t mlen) +{ + struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, + nfs_net_id); + + if (mlen != sizeof (struct bl_dev_msg)) + return -EINVAL; + + if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) + return -EFAULT; + + wake_up(&nn->bl_wq); + + return mlen; +} + +void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); + + if (msg->errno >= 0) + return; + wake_up(bl_pipe_msg->bl_wq); +} + +/* + * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. 
+ */ +struct pnfs_block_dev * +nfs4_blk_decode_device(struct nfs_server *server, + struct pnfs_device *dev) +{ + struct pnfs_block_dev *rv; + struct block_device *bd = NULL; + struct bl_pipe_msg bl_pipe_msg; + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; + struct bl_msg_hdr bl_msg = { + .type = BL_DEVICE_MOUNT, + .totallen = dev->mincount, + }; + uint8_t *dataptr; + DECLARE_WAITQUEUE(wq, current); + int offset, len, i, rc; + struct net *net = server->nfs_client->cl_net; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct bl_dev_msg *reply = &nn->bl_mount_reply; + + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, + dev->mincount); + + bl_pipe_msg.bl_wq = &nn->bl_wq; + memset(msg, 0, sizeof(*msg)); + msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); + if (!msg->data) { + rv = ERR_PTR(-ENOMEM); + goto out; + } + + memcpy(msg->data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg->data; + len = dev->mincount; + offset = sizeof(bl_msg); + for (i = 0; len > 0; i++) { + memcpy(&dataptr[offset], page_address(dev->pages[i]), + len < PAGE_CACHE_SIZE ? 
len : PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + offset += PAGE_CACHE_SIZE; + } + msg->len = sizeof(bl_msg) + dev->mincount; + + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); + add_wait_queue(&nn->bl_wq, &wq); + rc = rpc_queue_upcall(nn->bl_device_pipe, msg); + if (rc < 0) { + remove_wait_queue(&nn->bl_wq, &wq); + rv = ERR_PTR(rc); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nn->bl_wq, &wq); + + if (reply->status != BL_DEVICE_REQUEST_PROC) { + dprintk("%s failed to open device: %d\n", + __func__, reply->status); + rv = ERR_PTR(-EINVAL); + goto out; + } + + bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), + FMODE_READ, NULL); + if (IS_ERR(bd)) { + dprintk("%s failed to open device : %ld\n", __func__, + PTR_ERR(bd)); + rv = ERR_CAST(bd); + goto out; + } + + rv = kzalloc(sizeof(*rv), GFP_NOFS); + if (!rv) { + rv = ERR_PTR(-ENOMEM); + goto out; + } + + rv->bm_mdev = bd; + memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); + rv->net = net; + dprintk("%s Created device %s with bd_block_size %u\n", + __func__, + bd->bd_disk->disk_name, + bd->bd_block_size); + +out: + kfree(msg->data); + return rv; +} + +/* Map deviceid returned by the server to constructed block_device */ +static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, + struct nfs4_deviceid *id) +{ + struct block_device *rv = NULL; + struct block_mount_id *mid; + struct pnfs_block_dev *dev; + + dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); + mid = BLK_ID(lo); + spin_lock(&mid->bm_lock); + list_for_each_entry(dev, &mid->bm_devlist, bm_node) { + if (memcmp(id->data, dev->bm_mdevid.data, + NFS4_DEVICEID4_SIZE) == 0) { + rv = dev->bm_mdev; + goto out; + } + } + out: + spin_unlock(&mid->bm_lock); + dprintk("%s returning %p\n", __func__, rv); + return rv; +} + +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { + 
u32 mode; /* R or RW */ + u64 start; /* Expected start of next non-COW extent */ + u64 inval; /* Start of INVAL coverage */ + u64 cowread; /* End of COW read coverage */ +}; + +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, + struct layout_verification *lv) +{ + if (lv->mode == IOMODE_READ) { + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA) + return -EIO; + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } + /* lv->mode == IOMODE_RW */ + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + if (lv->cowread > lv->start) + return -EIO; + lv->start += be->be_length; + lv->inval = lv->start; + return 0; + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { + if (be->be_f_offset > lv->start) + return -EIO; + if (be->be_f_offset < lv->inval) + return -EIO; + if (be->be_f_offset < lv->cowread) + return -EIO; + /* It looks like you might want to min this with lv->start, + * but you really don't. 
+ */ + lv->inval = lv->inval + be->be_length; + lv->cowread = be->be_f_offset + be->be_length; + return 0; + } else + return -EIO; +} + +/* XDR decode pnfs_block_layout4 structure */ +int +nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + int i, status = -EIO; + uint32_t count; + struct pnfs_block_extent *be = NULL, *save; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + __be32 *p; + struct layout_verification lv = { + .mode = lgr->range.iomode, + .start = lgr->range.offset >> SECTOR_SHIFT, + .inval = lgr->range.offset >> SECTOR_SHIFT, + .cowread = lgr->range.offset >> SECTOR_SHIFT, + }; + LIST_HEAD(extents); + + dprintk("---> %s\n", __func__); + + scratch = alloc_page(gfp_flags); + if (!scratch) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err; + + count = be32_to_cpup(p++); + + dprintk("%s enter, number of extents %i\n", __func__, count); + p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); + if (unlikely(!p)) + goto out_err; + + /* Decode individual extents, putting them in temporary + * staging area until whole layout is decoded to make error + * recovery easier. 
+ */ + for (i = 0; i < count; i++) { + be = bl_alloc_extent(); + if (!be) { + status = -ENOMEM; + goto out_err; + } + memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + be->be_mdev = translate_devid(lo, &be->be_devid); + if (!be->be_mdev) + goto out_err; + + /* The next three values are read in as bytes, + * but stored as 512-byte sector lengths + */ + if (decode_sector_number(&p, &be->be_f_offset) < 0) + goto out_err; + if (decode_sector_number(&p, &be->be_length) < 0) + goto out_err; + if (decode_sector_number(&p, &be->be_v_offset) < 0) + goto out_err; + be->be_state = be32_to_cpup(p++); + if (be->be_state == PNFS_BLOCK_INVALID_DATA) + be->be_inval = &bl->bl_inval; + if (verify_extent(be, &lv)) { + dprintk("%s verify failed\n", __func__); + goto out_err; + } + list_add_tail(&be->be_node, &extents); + } + if (lgr->range.offset + lgr->range.length != + lv.start << SECTOR_SHIFT) { + dprintk("%s Final length mismatch\n", __func__); + be = NULL; + goto out_err; + } + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + be = NULL; + goto out_err; + } + /* Extents decoded properly, now try to merge them in to + * existing layout extents. + */ + spin_lock(&bl->bl_ext_lock); + list_for_each_entry_safe(be, save, &extents, be_node) { + list_del(&be->be_node); + status = bl_add_merge_extent(bl, be); + if (status) { + spin_unlock(&bl->bl_ext_lock); + /* This is a fairly catastrophic error, as the + * entire layout extent lists are now corrupted. + * We should have some way to distinguish this. 
+ */ + be = NULL; + goto out_err; + } + } + spin_unlock(&bl->bl_ext_lock); + status = 0; + out: + __free_page(scratch); + dprintk("%s returns %i\n", __func__, status); + return status; + + out_err: + bl_put_extent(be); + while (!list_empty(&extents)) { + be = list_first_entry(&extents, struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + bl_put_extent(be); + } + goto out; +} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c new file mode 100644 index 00000000000..8999cfddd86 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -0,0 +1,108 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2007 The Regents of the University of Michigan. + * All rights reserved. + * + * Fred Isaman <iisaman@umich.edu> + * Andy Adamson <andros@citi.umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. 
the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/genhd.h> /* gendisk - used in a dprintk*/ +#include <linux/sched.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void dev_remove(struct net *net, dev_t dev) +{ + struct bl_pipe_msg bl_pipe_msg; + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; + struct bl_dev_msg bl_umount_request; + struct bl_msg_hdr bl_msg = { + .type = BL_DEVICE_UMOUNT, + .totallen = sizeof(bl_umount_request), + }; + uint8_t *dataptr; + DECLARE_WAITQUEUE(wq, current); + struct nfs_net *nn = net_generic(net, nfs_net_id); + + dprintk("Entering %s\n", __func__); + + bl_pipe_msg.bl_wq = &nn->bl_wq; + memset(msg, 0, sizeof(*msg)); + msg->len = sizeof(bl_msg) + bl_msg.totallen; + msg->data = kzalloc(msg->len, GFP_NOFS); + if (!msg->data) + goto out; + + memset(&bl_umount_request, 0, sizeof(bl_umount_request)); + bl_umount_request.major = MAJOR(dev); + bl_umount_request.minor = MINOR(dev); + + memcpy(msg->data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg->data; + memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); + + add_wait_queue(&nn->bl_wq, &wq); + if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { + remove_wait_queue(&nn->bl_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nn->bl_wq, &wq); + +out: + kfree(msg->data); +} + +/* + * Release meta device + */ +static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) +{ + dprintk("%s Releasing\n", __func__); + nfs4_blkdev_put(bdev->bm_mdev); + dev_remove(bdev->net, bdev->bm_mdev->bd_dev); +} + +void 
bl_free_block_dev(struct pnfs_block_dev *bdev) +{ + if (bdev) { + if (bdev->bm_mdev) { + dprintk("%s Removing DM device: %d:%d\n", + __func__, + MAJOR(bdev->bm_mdev->bd_dev), + MINOR(bdev->bm_mdev->bd_dev)); + nfs4_blk_metadev_release(bdev); + } + kfree(bdev); + } +} diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c new file mode 100644 index 00000000000..4d016144256 --- /dev/null +++ b/fs/nfs/blocklayout/extents.c @@ -0,0 +1,908 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */ + +#include "blocklayout.h" +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* Bit numbers */ +#define EXTENT_INITIALIZED 0 +#define EXTENT_WRITTEN 1 +#define EXTENT_IN_COMMIT 2 +#define INTERNAL_EXISTS MY_MAX_TAGS +#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) + +/* Returns largest t<=s s.t. t%base==0 */ +static inline sector_t normalize(sector_t s, int base) +{ + sector_t tmp = s; /* Since do_div modifies its argument */ + return s - sector_div(tmp, base); +} + +static inline sector_t normalize_up(sector_t s, int base) +{ + return normalize(s + base - 1, base); +} + +/* Complete stub using list while determine API wanted */ + +/* Returns tags, or negative */ +static int32_t _find_entry(struct my_tree *tree, u64 s) +{ + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu) enter\n", __func__, s); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) + return pos->it_tags & INTERNAL_MASK; + else + break; + } + return -ENOENT; +} + +static inline +int _has_tag(struct my_tree *tree, u64 s, int32_t tag) +{ + int32_t tags; + + dprintk("%s(%llu, %i) enter\n", __func__, s, tag); + s = normalize(s, tree->mtt_step_size); + tags = _find_entry(tree, s); + if ((tags < 0) || !(tags & (1 << tag))) + return 0; + else + return 1; +} + +/* Creates entry with tag, or if entry already exists, unions tag to it. + * If storage is not NULL, newly created entry will use it. + * Returns number of entries added, or negative on error. 
+ */ +static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, + struct pnfs_inval_tracking *storage) +{ + int found = 0; + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) { + found = 1; + break; + } else + break; + } + if (found) { + pos->it_tags |= (1 << tag); + return 0; + } else { + struct pnfs_inval_tracking *new; + new = storage; + new->it_sector = s; + new->it_tags = (1 << tag); + list_add(&new->it_link, &pos->it_link); + return 1; + } +} + +/* XXXX Really want option to not create */ +/* Over range, unions tag with existing entries, else creates entry with tag */ +static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) +{ + u64 i; + + dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); + for (i = normalize(s, tree->mtt_step_size); i < s + length; + i += tree->mtt_step_size) + if (_add_entry(tree, i, tag, NULL)) + return -ENOMEM; + return 0; +} + +/* Ensure that future operations on given range of tree will not malloc */ +static int _preload_range(struct pnfs_inval_markings *marks, + u64 offset, u64 length) +{ + u64 start, end, s; + int count, i, used = 0, status = -ENOMEM; + struct pnfs_inval_tracking **storage; + struct my_tree *tree = &marks->im_tree; + + dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); + start = normalize(offset, tree->mtt_step_size); + end = normalize_up(offset + length, tree->mtt_step_size); + count = (int)(end - start) / (int)tree->mtt_step_size; + + /* Pre-malloc what memory we might need */ + storage = kcalloc(count, sizeof(*storage), GFP_NOFS); + if (!storage) + return -ENOMEM; + for (i = 0; i < count; i++) { + storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), + GFP_NOFS); + if (!storage[i]) + goto out_cleanup; + } + + spin_lock_bh(&marks->im_lock); + for (s = start; s < end; s += 
tree->mtt_step_size) + used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + spin_unlock_bh(&marks->im_lock); + + status = 0; + + out_cleanup: + for (i = used; i < count; i++) { + if (!storage[i]) + break; + kfree(storage[i]); + } + kfree(storage); + return status; +} + +/* We are relying on page lock to serialize this */ +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) +{ + int rv; + + spin_lock_bh(&marks->im_lock); + rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); + spin_unlock_bh(&marks->im_lock); + return rv; +} + +/* Assume start, end already sector aligned */ +static int +_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) +{ + struct pnfs_inval_tracking *pos; + u64 expect = 0; + + dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector >= end) + continue; + if (!expect) { + if ((pos->it_sector == end - tree->mtt_step_size) && + (pos->it_tags & (1 << tag))) { + expect = pos->it_sector - tree->mtt_step_size; + if (pos->it_sector < tree->mtt_step_size || expect < start) + return 1; + continue; + } else { + return 0; + } + } + if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) + return 0; + expect -= tree->mtt_step_size; + if (expect < start) + return 1; + } + return 0; +} + +static int is_range_written(struct pnfs_inval_markings *marks, + sector_t start, sector_t end) +{ + int rv; + + spin_lock_bh(&marks->im_lock); + rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); + spin_unlock_bh(&marks->im_lock); + return rv; +} + +/* Marks sectors in [offest, offset_length) as having been initialized. + * All lengths are step-aligned, where step is min(pagesize, blocksize). 
+ * Currently assumes offset is page-aligned + */ +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length) +{ + sector_t start, end; + + dprintk("%s(offset=%llu,len=%llu) enter\n", + __func__, (u64)offset, (u64)length); + + start = normalize(offset, marks->im_block_size); + end = normalize_up(offset + length, marks->im_block_size); + if (_preload_range(marks, start, end - start)) + goto outerr; + + spin_lock_bh(&marks->im_lock); + if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) + goto out_unlock; + spin_unlock_bh(&marks->im_lock); + + return 0; + +out_unlock: + spin_unlock_bh(&marks->im_lock); +outerr: + return -ENOMEM; +} + +/* Marks sectors in [offest, offset+length) as having been written to disk. + * All lengths should be block aligned. + */ +static int mark_written_sectors(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length) +{ + int status; + + dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, + (u64)offset, (u64)length); + spin_lock_bh(&marks->im_lock); + status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); + spin_unlock_bh(&marks->im_lock); + return status; +} + +static void print_short_extent(struct pnfs_block_short_extent *be) +{ + dprintk("PRINT SHORT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); + dprintk(" be_length %llu\n", (u64)be->bse_length); + } +} + +static void print_clist(struct list_head *list, unsigned int count) +{ + struct pnfs_block_short_extent *be; + unsigned int i = 0; + + ifdebug(FACILITY) { + printk(KERN_DEBUG "****************\n"); + printk(KERN_DEBUG "Extent list looks like:\n"); + list_for_each_entry(be, list, bse_node) { + i++; + print_short_extent(be); + } + if (i != count) + printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); + printk(KERN_DEBUG "****************\n"); + } +} + +/* Note: In theory, we should do more checking that devid's match between + * old and new, but if 
they don't, the lists are too corrupt to salvage anyway. + */ +/* Note this is very similar to bl_add_merge_extent */ +static void add_to_commitlist(struct pnfs_block_layout *bl, + struct pnfs_block_short_extent *new) +{ + struct list_head *clist = &bl->bl_commit; + struct pnfs_block_short_extent *old, *save; + sector_t end = new->bse_f_offset + new->bse_length; + + dprintk("%s enter\n", __func__); + print_short_extent(new); + print_clist(clist, bl->bl_count); + bl->bl_count++; + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe(old, save, clist, bse_node) { + if (new->bse_f_offset < old->bse_f_offset) + break; + if (end <= old->bse_f_offset + old->bse_length) { + /* Range is already in list */ + bl->bl_count--; + kfree(new); + return; + } else if (new->bse_f_offset <= + old->bse_f_offset + old->bse_length) { + /* new overlaps or abuts existing be */ + if (new->bse_mdev == old->bse_mdev) { + /* extend new to fully replace old */ + new->bse_length += new->bse_f_offset - + old->bse_f_offset; + new->bse_f_offset = old->bse_f_offset; + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + } + /* Note that if we never hit the above break, old will not point to a + * valid extent. However, in that case &old->bse_node==list. + */ + list_add_tail(&new->bse_node, &old->bse_node); + /* Scan forward for overlaps. If we find any, extend new and + * remove the overlapped extent. 
+ */ + old = list_prepare_entry(new, clist, bse_node); + list_for_each_entry_safe_continue(old, save, clist, bse_node) { + if (end < old->bse_f_offset) + break; + /* new overlaps or abuts old */ + if (new->bse_mdev == old->bse_mdev) { + if (end < old->bse_f_offset + old->bse_length) { + /* extend new to fully cover old */ + end = old->bse_f_offset + old->bse_length; + new->bse_length = end - new->bse_f_offset; + } + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + dprintk("%s: after merging\n", __func__); + print_clist(clist, bl->bl_count); +} + +/* Note the range described by offset, length is guaranteed to be contained + * within be. + * new will be freed, either by this function or add_to_commitlist if they + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. + */ +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length, + struct pnfs_block_short_extent *new) +{ + sector_t new_end, end = offset + length; + struct pnfs_block_layout *bl = container_of(be->be_inval, + struct pnfs_block_layout, + bl_inval); + + mark_written_sectors(be->be_inval, offset, length); + /* We want to add the range to commit list, but it must be + * block-normalized, and verified that the normalized range has + * been entirely written to disk. 
+ */ + new->bse_f_offset = offset; + offset = normalize(offset, bl->bl_blocksize); + if (offset < new->bse_f_offset) { + if (is_range_written(be->be_inval, offset, new->bse_f_offset)) + new->bse_f_offset = offset; + else + new->bse_f_offset = offset + bl->bl_blocksize; + } + new_end = normalize_up(end, bl->bl_blocksize); + if (end < new_end) { + if (is_range_written(be->be_inval, end, new_end)) + end = new_end; + else + end = new_end - bl->bl_blocksize; + } + if (end <= new->bse_f_offset) { + kfree(new); + return 0; + } + new->bse_length = end - new->bse_f_offset; + new->bse_devid = be->be_devid; + new->bse_mdev = be->be_mdev; + + spin_lock(&bl->bl_ext_lock); + add_to_commitlist(bl, new); + spin_unlock(&bl->bl_ext_lock); + return 0; +} + +static void print_bl_extent(struct pnfs_block_extent *be) +{ + dprintk("PRINT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); + dprintk(" be_length %llu\n", (u64)be->be_length); + dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); + dprintk(" be_state %d\n", be->be_state); + } +} + +static void +destroy_extent(struct kref *kref) +{ + struct pnfs_block_extent *be; + + be = container_of(kref, struct pnfs_block_extent, be_refcnt); + dprintk("%s be=%p\n", __func__, be); + kfree(be); +} + +void +bl_put_extent(struct pnfs_block_extent *be) +{ + if (be) { + dprintk("%s enter %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_put(&be->be_refcnt, destroy_extent); + } +} + +struct pnfs_block_extent *bl_alloc_extent(void) +{ + struct pnfs_block_extent *be; + + be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); + if (!be) + return NULL; + INIT_LIST_HEAD(&be->be_node); + kref_init(&be->be_refcnt); + be->be_inval = NULL; + return be; +} + +static void print_elist(struct list_head *list) +{ + struct pnfs_block_extent *be; + dprintk("****************\n"); + dprintk("Extent list looks like:\n"); + list_for_each_entry(be, list, be_node) { + print_bl_extent(be); + } + 
dprintk("****************\n"); +} + +static inline int +extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) +{ + /* Note this assumes new->be_f_offset >= old->be_f_offset */ + return (new->be_state == old->be_state) && + ((new->be_state == PNFS_BLOCK_NONE_DATA) || + ((new->be_v_offset - old->be_v_offset == + new->be_f_offset - old->be_f_offset) && + new->be_mdev == old->be_mdev)); +} + +/* Adds new to appropriate list in bl, modifying new and removing existing + * extents as appropriate to deal with overlaps. + * + * See bl_find_get_extent for list constraints. + * + * Refcount on new is already set. If end up not using it, or error out, + * need to put the reference. + * + * bl->bl_ext_lock is held by caller. + */ +int +bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be, *tmp; + sector_t end = new->be_f_offset + new->be_length; + struct list_head *list; + + dprintk("%s enter with be=%p\n", __func__, new); + print_bl_extent(new); + list = &bl->bl_extents[bl_choose_list(new->be_state)]; + print_elist(list); + + /* Scan for proper place to insert, extending new to the left + * as much as possible. 
+ */ + list_for_each_entry_safe_reverse(be, tmp, list, be_node) { + if (new->be_f_offset >= be->be_f_offset + be->be_length) + break; + if (new->be_f_offset >= be->be_f_offset) { + if (end <= be->be_f_offset + be->be_length) { + /* new is a subset of existing be*/ + if (extents_consistent(be, new)) { + dprintk("%s: new is subset, ignoring\n", + __func__); + bl_put_extent(new); + return 0; + } else { + goto out_err; + } + } else { + /* |<-- be -->| + * |<-- new -->| */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + new->be_length += new->be_f_offset - + be->be_f_offset; + new->be_f_offset = be->be_f_offset; + new->be_v_offset = be->be_v_offset; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } else if (end >= be->be_f_offset + be->be_length) { + /* new extent overlap existing be */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } else if (end > be->be_f_offset) { + /* |<-- be -->| + *|<-- new -->| */ + if (extents_consistent(new, be)) { + /* extend new to fully replace be */ + new->be_length += be->be_f_offset + be->be_length - + new->be_f_offset - new->be_length; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } + /* Note that if we never hit the above break, be will not point to a + * valid extent. However, in that case &be->be_node==list. + */ + list_add(&new->be_node, &be->be_node); + dprintk("%s: inserting new\n", __func__); + print_elist(list); + /* FIXME - The per-list consistency checks have all been done, + * should now check cross-list consistency. + */ + return 0; + + out_err: + bl_put_extent(new); + return -EIO; +} + +/* Returns extent, or NULL. 
If a second READ extent exists, it is returned + * in cow_read, if given. + * + * The extents are kept in two seperate ordered lists, one for READ and NONE, + * one for READWRITE and INVALID. Within each list, we assume: + * 1. Extents are ordered by file offset. + * 2. For any given isect, there is at most one extents that matches. + */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read) +{ + struct pnfs_block_extent *be, *cow, *ret; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + cow = ret = NULL; + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + if (!ret) + ret = be; + else if (be->be_state != PNFS_BLOCK_READ_DATA) + bl_put_extent(be); + else + cow = be; + break; + } + } + if (ret && + (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) + break; + } + spin_unlock(&bl->bl_ext_lock); + if (cow_read) + *cow_read = cow; + print_bl_extent(ret); + return ret; +} + +/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ +static struct pnfs_block_extent * +bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) +{ + struct pnfs_block_extent *be, *ret = NULL; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + for (i = 0; i < EXTENT_LISTS; i++) { + if (ret) + break; + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + ret = be; + 
break; + } + } + } + print_bl_extent(ret); + return ret; +} + +int +encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_short_extent *lce, *save; + unsigned int count = 0; + __be32 *p, *xdr_start; + + dprintk("%s enter\n", __func__); + /* BUG - creation of bl_commit is buggy - need to wait for + * entire block to be marked WRITTEN before it can be added. + */ + spin_lock(&bl->bl_ext_lock); + /* Want to adjust for possible truncate */ + /* We now want to adjust argument range */ + + /* XDR encode the ranges found */ + xdr_start = xdr_reserve_space(xdr, 8); + if (!xdr_start) + goto out; + list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { + p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); + if (!p) + break; + p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + list_move_tail(&lce->bse_node, &bl->bl_committing); + bl->bl_count--; + count++; + } + xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); + xdr_start[1] = cpu_to_be32(count); +out: + spin_unlock(&bl->bl_ext_lock); + dprintk("%s found %i ranges\n", __func__, count); + return 0; +} + +/* Helper function to set_to_rw that initialize a new extent */ +static void +_prep_new_extent(struct pnfs_block_extent *new, + struct pnfs_block_extent *orig, + sector_t offset, sector_t length, int state) +{ + kref_init(&new->be_refcnt); + /* don't need to INIT_LIST_HEAD(&new->be_node) */ + memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); + new->be_mdev = orig->be_mdev; + new->be_f_offset = offset; + new->be_length = length; + new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; + new->be_state = state; + new->be_inval = orig->be_inval; 
+} + +/* Tries to merge be with extent in front of it in list. + * Frees storage if not used. + */ +static struct pnfs_block_extent * +_front_merge(struct pnfs_block_extent *be, struct list_head *head, + struct pnfs_block_extent *storage) +{ + struct pnfs_block_extent *prev; + + if (!storage) + goto no_merge; + if (&be->be_node == head || be->be_node.prev == head) + goto no_merge; + prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); + if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || + !extents_consistent(prev, be)) + goto no_merge; + _prep_new_extent(storage, prev, prev->be_f_offset, + prev->be_length + be->be_length, prev->be_state); + list_replace(&prev->be_node, &storage->be_node); + bl_put_extent(prev); + list_del(&be->be_node); + bl_put_extent(be); + return storage; + + no_merge: + kfree(storage); + return be; +} + +static u64 +set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) +{ + u64 rv = offset + length; + struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; + struct pnfs_block_extent *children[3]; + struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; + int i = 0, j; + + dprintk("%s(%llu, %llu)\n", __func__, offset, length); + /* Create storage for up to three new extents e1, e2, e3 */ + e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); + e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); + e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); + /* BUG - we are ignoring any failure */ + if (!e1 || !e2 || !e3) + goto out_nosplit; + + spin_lock(&bl->bl_ext_lock); + be = bl_find_get_extent_locked(bl, offset); + rv = be->be_f_offset + be->be_length; + if (be->be_state != PNFS_BLOCK_INVALID_DATA) { + spin_unlock(&bl->bl_ext_lock); + goto out_nosplit; + } + /* Add e* to children, bumping e*'s krefs */ + if (be->be_f_offset != offset) { + _prep_new_extent(e1, be, be->be_f_offset, + offset - be->be_f_offset, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e1; + print_bl_extent(e1); + } else + merge1 = e1; + _prep_new_extent(e2, be, offset, + 
min(length, be->be_f_offset + be->be_length - offset), + PNFS_BLOCK_READWRITE_DATA); + children[i++] = e2; + print_bl_extent(e2); + if (offset + length < be->be_f_offset + be->be_length) { + _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, + be->be_f_offset + be->be_length - + offset - length, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e3; + print_bl_extent(e3); + } else + merge2 = e3; + + /* Remove be from list, and insert the e* */ + /* We don't get refs on e*, since this list is the base reference + * set when init'ed. + */ + if (i < 3) + children[i] = NULL; + new = children[0]; + list_replace(&be->be_node, &new->be_node); + bl_put_extent(be); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); + for (j = 1; j < i; j++) { + old = new; + new = children[j]; + list_add(&new->be_node, &old->be_node); + } + if (merge2) { + /* This is a HACK, should just create a _back_merge function */ + new = list_entry(new->be_node.next, + struct pnfs_block_extent, be_node); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); + } + spin_unlock(&bl->bl_ext_lock); + + /* Since we removed the base reference above, be is now scheduled for + * destruction. 
+ */ + bl_put_extent(be); + dprintk("%s returns %llu after split\n", __func__, rv); + return rv; + + out_nosplit: + kfree(e1); + kfree(e2); + kfree(e3); + dprintk("%s returns %llu without splitting\n", __func__, rv); + return rv; +} + +void +clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status) +{ + struct pnfs_block_short_extent *lce, *save; + + dprintk("%s status %d\n", __func__, status); + list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { + if (likely(!status)) { + u64 offset = lce->bse_f_offset; + u64 end = offset + lce->bse_length; + + do { + offset = set_to_rw(bl, offset, end - offset); + } while (offset < end); + list_del(&lce->bse_node); + + kfree(lce); + } else { + list_del(&lce->bse_node); + spin_lock(&bl->bl_ext_lock); + add_to_commitlist(bl, lce); + spin_unlock(&bl->bl_ext_lock); + } + } +} + +int bl_push_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *new; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (unlikely(!new)) + return -ENOMEM; + + spin_lock_bh(&marks->im_lock); + list_add(&new->bse_node, &marks->im_extents); + spin_unlock_bh(&marks->im_lock); + + return 0; +} + +struct pnfs_block_short_extent * +bl_pop_one_short_extent(struct pnfs_inval_markings *marks) +{ + struct pnfs_block_short_extent *rv = NULL; + + spin_lock_bh(&marks->im_lock); + if (!list_empty(&marks->im_extents)) { + rv = list_entry((&marks->im_extents)->next, + struct pnfs_block_short_extent, bse_node); + list_del_init(&rv->bse_node); + } + spin_unlock_bh(&marks->im_lock); + + return rv; +} + +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) +{ + struct pnfs_block_short_extent *se = NULL, *tmp; + + if (num_to_free <= 0) + return; + + spin_lock(&marks->im_lock); + list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { + list_del(&se->bse_node); + kfree(se); + if (--num_to_free == 0) + break; + } + 
spin_unlock(&marks->im_lock); + + BUG_ON(num_to_free > 0); +} diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 00000000000..5f7b053720e --- /dev/null +++ b/fs/nfs/cache_lib.c @@ -0,0 +1,158 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/net_namespace.h> + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. 
+ */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd) +{ + int ret; + struct dentry *dir; + + dir = rpc_d_lookup_sb(sb, "cache"); + ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); + dput(dir); + return ret; +} + +int nfs_cache_register_net(struct net *net, struct cache_detail *cd) +{ + struct super_block *pipefs_sb; + int ret = 0; + + sunrpc_init_cache_detail(cd); + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + ret = nfs_cache_register_sb(pipefs_sb, cd); + rpc_put_sb_net(net); + if (ret) + sunrpc_destroy_cache_detail(cd); + } + return ret; +} + +void nfs_cache_unregister_sb(struct super_block *sb, struct 
cache_detail *cd) +{ + if (cd->u.pipefs.dir) + sunrpc_cache_unregister_pipefs(cd); +} + +void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs_cache_unregister_sb(pipefs_sb, cd); + rpc_put_sb_net(net); + } + sunrpc_destroy_cache_detail(cd); +} diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 00000000000..4116d2c3f52 --- /dev/null +++ b/fs/nfs/cache_lib.h @@ -0,0 +1,31 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ + +#include <linux/completion.h> +#include <linux/sunrpc/cache.h> +#include <linux/atomic.h> + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd); +extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd); +extern int nfs_cache_register_sb(struct super_block *sb, + struct cache_detail *cd); +extern void nfs_cache_unregister_sb(struct super_block *sb, + struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 66648dd92d9..073b4cf67ed 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -9,191 +9,455 @@ #include <linux/completion.h> #include <linux/ip.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> +#include <linux/errno.h> #include <linux/mutex.h> #include <linux/freezer.h> +#include 
<linux/kthread.h> +#include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/bc_xprt.h> #include <net/inet_sock.h> #include "nfs4_fs.h" #include "callback.h" #include "internal.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_CALLBACK struct nfs_callback_data { unsigned int users; struct svc_serv *serv; - pid_t pid; - struct completion started; - struct completion stopped; + struct svc_rqst *rqst; + struct task_struct *task; }; -static struct nfs_callback_data nfs_callback_info; +static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; static DEFINE_MUTEX(nfs_callback_mutex); static struct svc_program nfs4_callback_program; -unsigned int nfs_callback_set_tcpport; -unsigned short nfs_callback_tcpport; -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; - -static int param_set_port(const char *val, struct kernel_param *kp) +static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) { - char *endp; - int num = simple_strtol(val, &endp, 0); - if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) - return -EINVAL; - *((int *)kp->arg) = num; + int ret; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + ret = svc_create_xprt(serv, "tcp", net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret <= 0) + goto out_err; + nn->nfs_callback_tcpport = ret; + dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", + nn->nfs_callback_tcpport, PF_INET, net); + + ret = svc_create_xprt(serv, "tcp", net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nn->nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", + nn->nfs_callback_tcpport6, PF_INET6, net); + } else if (ret != -EAFNOSUPPORT) + goto out_err; return 0; -} -module_param_call(callback_tcpport, param_set_port, param_get_int, - &nfs_callback_set_tcpport, 0644); +out_err: + return (ret) ? 
ret : -ENOMEM; +} /* - * This is the callback kernel thread. + * This is the NFSv4 callback kernel thread. */ -static void nfs_callback_svc(struct svc_rqst *rqstp) +static int +nfs4_callback_svc(void *vrqstp) { int err; + struct svc_rqst *rqstp = vrqstp; - __module_get(THIS_MODULE); - lock_kernel(); - - nfs_callback_info.pid = current->pid; - daemonize("nfsv4-svc"); - /* Process request with signals blocked, but allow SIGKILL. */ - allow_signal(SIGKILL); set_freezable(); - complete(&nfs_callback_info.started); - - for(;;) { - if (signalled()) { - if (nfs_callback_info.users == 0) - break; - flush_signals(current); - } + while (!kthread_should_stop()) { /* * Listen for a request on the socket */ err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); if (err == -EAGAIN || err == -EINTR) continue; - if (err < 0) { - printk(KERN_WARNING - "%s: terminating on error %d\n", - __FUNCTION__, -err); - break; - } svc_process(rqstp); } + return 0; +} - flush_signals(current); - svc_exit_thread(rqstp); - nfs_callback_info.pid = 0; - complete(&nfs_callback_info.stopped); - unlock_kernel(); - module_put_and_exit(0); +/* + * Prepare to bring up the NFSv4 callback service + */ +static struct svc_rqst * +nfs4_callback_up(struct svc_serv *serv) +{ + return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); +} + +#if defined(CONFIG_NFS_V4_1) +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) +{ + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. + * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); } /* - * Bring up the server process if it is not already up. 
+ * The callback service for NFSv4.1 callbacks */ -int nfs_callback_up(void) +static int +nfs41_callback_svc(void *vrqstp) { - struct svc_serv *serv = NULL; - int ret = 0; + struct svc_rqst *rqstp = vrqstp; + struct svc_serv *serv = rqstp->rq_server; + struct rpc_rqst *req; + int error; + DEFINE_WAIT(wq); + + set_freezable(); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + spin_lock_bh(&serv->sv_cb_lock); + if (!list_empty(&serv->sv_cb_list)) { + req = list_first_entry(&serv->sv_cb_list, + struct rpc_rqst, rq_bc_list); + list_del(&req->rq_bc_list); + spin_unlock_bh(&serv->sv_cb_lock); + dprintk("Invoking bc_svc_process()\n"); + error = bc_svc_process(serv, req, rqstp); + dprintk("bc_svc_process() returned w/ error code= %d\n", + error); + } else { + spin_unlock_bh(&serv->sv_cb_lock); + schedule(); + } + finish_wait(&serv->sv_cb_waitq, &wq); + } + return 0; +} + +/* + * Bring up the NFSv4.1 callback service + */ +static struct svc_rqst * +nfs41_callback_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + + INIT_LIST_HEAD(&serv->sv_cb_list); + spin_lock_init(&serv->sv_cb_lock); + init_waitqueue_head(&serv->sv_cb_waitq); + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); + if (IS_ERR(rqstp)) { + svc_xprt_put(serv->sv_bc_xprt); + serv->sv_bc_xprt = NULL; + } + dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); + return rqstp; +} + +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + *rqstpp = nfs41_callback_up(serv); + *callback_svc = nfs41_callback_svc; +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct svc_serv *serv) +{ + if (minorversion) + /* + * Save the svc_serv in the transport so that it can + * be referenced when the session backchannel is initialized + */ + xprt->bc_serv = serv; +} +#else +static int 
nfs41_callback_up_net(struct svc_serv *serv, struct net *net) +{ + return 0; +} + +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, + struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) +{ + *rqstpp = ERR_PTR(-ENOTSUPP); + *callback_svc = ERR_PTR(-ENOTSUPP); +} + +static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, + struct svc_serv *serv) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, + struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + int (*callback_svc)(void *vrqstp); + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int ret; + + nfs_callback_bc_serv(minorversion, xprt, serv); + + if (cb_info->task) + return 0; + + switch (minorversion) { + case 0: + /* v4.0 callback setup */ + rqstp = nfs4_callback_up(serv); + callback_svc = nfs4_callback_svc; + break; + default: + nfs_minorversion_callback_svc_setup(serv, + &rqstp, &callback_svc); + } + + if (IS_ERR(rqstp)) + return PTR_ERR(rqstp); + + svc_sock_update_bufs(serv); + + cb_info->serv = serv; + cb_info->rqst = rqstp; + cb_info->task = kthread_run(callback_svc, cb_info->rqst, + "nfsv4.%u-svc", minorversion); + if (IS_ERR(cb_info->task)) { + ret = PTR_ERR(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->rqst = NULL; + cb_info->task = NULL; + return ret; + } + dprintk("nfs_callback_up: service started\n"); + return 0; +} + +static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + if (--nn->cb_users[minorversion]) + return; + + dprintk("NFS: destroy per-net callback data; net=%p\n", net); + svc_shutdown_net(serv, net); +} + +static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + int ret; + + if (nn->cb_users[minorversion]++) + return 0; + + dprintk("NFS: create per-net 
callback data; net=%p\n", net); + + ret = svc_bind(serv, net); + if (ret < 0) { + printk(KERN_WARNING "NFS: bind callback service failed\n"); + goto err_bind; + } + + switch (minorversion) { + case 0: + ret = nfs4_callback_up_net(serv, net); + break; + case 1: + case 2: + ret = nfs41_callback_up_net(serv, net); + break; + default: + printk(KERN_ERR "NFS: unknown callback version: %d\n", + minorversion); + ret = -EINVAL; + break; + } + + if (ret < 0) { + printk(KERN_ERR "NFS: callback service start failed\n"); + goto err_socks; + } + return 0; + +err_socks: + svc_rpcb_cleanup(serv, net); +err_bind: + dprintk("NFS: Couldn't create callback socket: err = %d; " + "net = %p\n", ret, net); + return ret; +} + +static struct svc_serv *nfs_callback_create_svc(int minorversion) +{ + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + struct svc_serv *serv; + + /* + * Check whether we're already up and running. + */ + if (cb_info->task) { + /* + * Note: increase service usage, because later in case of error + * svc_destroy() will be called. + */ + svc_get(cb_info->serv); + return cb_info->serv; + } + + /* + * Sanity check: if there's no task, + * we should be the first user ... 
+ */ + if (cb_info->users) + printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", + cb_info->users); - lock_kernel(); - mutex_lock(&nfs_callback_mutex); - if (nfs_callback_info.users++ || nfs_callback_info.pid != 0) - goto out; - init_completion(&nfs_callback_info.started); - init_completion(&nfs_callback_info.stopped); serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); - ret = -ENOMEM; - if (!serv) - goto out_err; + if (!serv) { + printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); + return ERR_PTR(-ENOMEM); + } + /* As there is only one thread we need to over-ride the + * default maximum of 80 connections + */ + serv->sv_maxconn = 1024; + dprintk("nfs_callback_create_svc: service created\n"); + return serv; +} - ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); - if (ret <= 0) - goto out_err; - nfs_callback_tcpport = ret; - dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); +/* + * Bring up the callback thread if it is not already up. + */ +int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) +{ + struct svc_serv *serv; + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int ret; + struct net *net = xprt->xprt_net; + + mutex_lock(&nfs_callback_mutex); - ret = svc_create_thread(nfs_callback_svc, serv); + serv = nfs_callback_create_svc(minorversion); + if (IS_ERR(serv)) { + ret = PTR_ERR(serv); + goto err_create; + } + + ret = nfs_callback_up_net(minorversion, serv, net); if (ret < 0) - goto out_err; - nfs_callback_info.serv = serv; - wait_for_completion(&nfs_callback_info.started); -out: + goto err_net; + + ret = nfs_callback_start_svc(minorversion, xprt, serv); + if (ret < 0) + goto err_start; + + cb_info->users++; /* * svc_create creates the svc_serv with sv_nrthreads == 1, and then - * svc_create_thread increments that. So we need to call svc_destroy + * svc_prepare_thread increments that. 
So we need to call svc_destroy * on both success and failure so that the refcount is 1 when the * thread exits. */ - if (serv) - svc_destroy(serv); +err_net: + svc_destroy(serv); +err_create: mutex_unlock(&nfs_callback_mutex); - unlock_kernel(); return ret; -out_err: - dprintk("Couldn't create callback socket or server thread; err = %d\n", - ret); - nfs_callback_info.users--; - goto out; + +err_start: + nfs_callback_down_net(minorversion, serv, net); + dprintk("NFS: Couldn't create server thread; err = %d\n", ret); + goto err_net; } /* - * Kill the server process if it is not already up. + * Kill the callback thread if it's no longer being used. */ -void nfs_callback_down(void) +void nfs_callback_down(int minorversion, struct net *net) { - lock_kernel(); + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + mutex_lock(&nfs_callback_mutex); - nfs_callback_info.users--; - do { - if (nfs_callback_info.users != 0 || nfs_callback_info.pid == 0) - break; - if (kill_proc(nfs_callback_info.pid, SIGKILL, 1) < 0) - break; - } while (wait_for_completion_timeout(&nfs_callback_info.stopped, 5*HZ) == 0); + nfs_callback_down_net(minorversion, cb_info->serv, net); + cb_info->users--; + if (cb_info->users == 0 && cb_info->task != NULL) { + kthread_stop(cb_info->task); + dprintk("nfs_callback_down: service stopped\n"); + svc_exit_thread(cb_info->rqst); + dprintk("nfs_callback_down: service destroyed\n"); + cb_info->serv = NULL; + cb_info->rqst = NULL; + cb_info->task = NULL; + } mutex_unlock(&nfs_callback_mutex); - unlock_kernel(); } -static int nfs_callback_authenticate(struct svc_rqst *rqstp) +/* Boolean check of RPC_AUTH_GSS principal */ +int +check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) { - struct nfs_client *clp; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + char *p = rqstp->rq_cred.cr_principal; - /* Don't talk to strangers */ - clp = nfs_find_client(svc_addr(rqstp), 4); - if (clp == NULL) - return SVC_DROP; + if 
(rqstp->rq_authop->flavour != RPC_AUTH_GSS) + return 1; - dprintk("%s: %s NFSv4 callback!\n", __FUNCTION__, - svc_print_addr(rqstp, buf, sizeof(buf))); - nfs_put_client(clp); + /* No RPC_AUTH_GSS on NFSv4.1 back channel yet */ + if (clp->cl_minorversion != 0) + return 0; + /* + * It might just be a normal user principal, in which case + * userspace won't bother to tell us the name at all. + */ + if (p == NULL) + return 0; + + /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ + + if (memcmp(p, "nfs@", 4) != 0) + return 0; + p += 4; + if (strcmp(p, clp->cl_hostname) != 0) + return 0; + return 1; +} +/* + * pg_authenticate method for nfsv4 callback threads. + * + * The authflavor has been negotiated, so an incorrect flavor is a server + * bug. Drop packets with incorrect authflavor. + * + * All other checking done after NFS decoding where the nfs_client can be + * found in nfs4_callback_compound + */ +static int nfs_callback_authenticate(struct svc_rqst *rqstp) +{ switch (rqstp->rq_authop->flavour) { - case RPC_AUTH_NULL: - if (rqstp->rq_proc != CB_NULL) - return SVC_DENIED; - break; - case RPC_AUTH_UNIX: - break; - case RPC_AUTH_GSS: - /* FIXME: RPCSEC_GSS handling? 
*/ - default: - return SVC_DENIED; + case RPC_AUTH_NULL: + if (rqstp->rq_proc != CB_NULL) + return SVC_DROP; + break; + case RPC_AUTH_GSS: + /* No RPC_AUTH_GSS support yet in NFSv4.1 */ + if (svc_is_backchannel(rqstp)) + return SVC_DROP; } return SVC_OK; } @@ -203,6 +467,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) */ static struct svc_version *nfs4_callback_version[] = { [1] = &nfs4_callback_version1, + [4] = &nfs4_callback_version4, }; static struct svc_stat nfs4_callback_stats; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index bb25d2135ff..84326e9fb47 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -7,6 +7,7 @@ */ #ifndef __LINUX_FS_NFS_CALLBACK_H #define __LINUX_FS_NFS_CALLBACK_H +#include <linux/sunrpc/svc.h> #define NFS4_CALLBACK 0x40000000 #define NFS4_CALLBACK_XDRSIZE 2048 @@ -20,13 +21,35 @@ enum nfs4_callback_procnum { enum nfs4_callback_opnum { OP_CB_GETATTR = 3, OP_CB_RECALL = 4, +/* Callback operations new to NFSv4.1 */ + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, +/* Callback operations new to NFSv4.2 */ + OP_CB_OFFLOAD = 15, OP_CB_ILLEGAL = 10044, }; +struct cb_process_state { + __be32 drc_status; + struct nfs_client *clp; + u32 slotid; + u32 minorversion; + struct net *net; +}; + struct cb_compound_hdr_arg { unsigned int taglen; const char *tag; - unsigned int callback_ident; + unsigned int minorversion; + unsigned int cb_ident; /* v4.0 callback identifier */ unsigned nops; }; @@ -59,16 +82,131 @@ struct cb_recallargs { uint32_t truncate; }; -extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); -extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy); +#if defined(CONFIG_NFS_V4_1) + +struct referring_call { + uint32_t rc_sequenceid; + 
uint32_t rc_slotid; +}; + +struct referring_call_list { + struct nfs4_sessionid rcl_sessionid; + uint32_t rcl_nrefcalls; + struct referring_call *rcl_refcalls; +}; + +struct cb_sequenceargs { + struct sockaddr *csa_addr; + struct nfs4_sessionid csa_sessionid; + uint32_t csa_sequenceid; + uint32_t csa_slotid; + uint32_t csa_highestslotid; + uint32_t csa_cachethis; + uint32_t csa_nrclists; + struct referring_call_list *csa_rclists; +}; + +struct cb_sequenceres { + __be32 csr_status; + struct nfs4_sessionid csr_sessionid; + uint32_t csr_sequenceid; + uint32_t csr_slotid; + uint32_t csr_highestslotid; + uint32_t csr_target_highestslotid; +}; + +extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps); + +extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); + +#define RCA4_TYPE_MASK_RDATA_DLG 0 +#define RCA4_TYPE_MASK_WDATA_DLG 1 +#define RCA4_TYPE_MASK_DIR_DLG 2 +#define RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 +#define RCA4_TYPE_MASK_ALL 0xf31f + +struct cb_recallanyargs { + struct sockaddr *craa_addr; + uint32_t craa_objs_to_keep; + uint32_t craa_type_mask; +}; + +extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_recallslotargs { + struct sockaddr *crsa_addr; + uint32_t crsa_target_highest_slotid; +}; +extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy, + struct cb_process_state *cps); + +struct cb_layoutrecallargs { + struct sockaddr *cbl_addr; + uint32_t cbl_recall_type; + uint32_t cbl_layout_type; + uint32_t cbl_layoutchanged; + union { + struct { + struct nfs_fh cbl_fh; + struct pnfs_layout_range cbl_range; + nfs4_stateid 
cbl_stateid; + }; + struct nfs_fsid cbl_fsid; + }; +}; + +extern __be32 nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps); -#ifdef CONFIG_NFS_V4 -extern int nfs_callback_up(void); -extern void nfs_callback_down(void); -#else -#define nfs_callback_up() (0) -#define nfs_callback_down() do {} while(0) -#endif +struct cb_devicenotifyitem { + uint32_t cbd_notify_type; + uint32_t cbd_layout_type; + struct nfs4_deviceid cbd_dev_id; + uint32_t cbd_immediate; +}; + +struct cb_devicenotifyargs { + int ndevs; + struct cb_devicenotifyitem *devs; +}; + +extern __be32 nfs4_callback_devicenotify( + struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps); + +#endif /* CONFIG_NFS_V4_1 */ +extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); +extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps); +extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps); +#if IS_ENABLED(CONFIG_NFS_V4) +extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); +extern void nfs_callback_down(int minorversion, struct net *net); +extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, + const nfs4_stateid *stateid); +extern int nfs4_set_callback_sessionid(struct nfs_client *clp); +#endif /* CONFIG_NFS_V4 */ +/* + * nfs41: Callbacks are expected to not cause substantial latency, + * so we limit their concurrency to 1 by setting up the maximum number + * of slots for the backchannel. 
+ */ +#define NFS41_BC_MIN_CALLBACKS 1 +#define NFS41_BC_MAX_CALLBACKS 1 extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 15f7785048d..41db5258e7a 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -7,37 +7,44 @@ */ #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include <linux/slab.h> +#include <linux/rcupdate.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "internal.h" +#include "pnfs.h" +#include "nfs4session.h" +#include "nfs4trace.h" #ifdef NFS_DEBUG #define NFSDBG_FACILITY NFSDBG_CALLBACK #endif - -__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) + +__be32 nfs4_callback_getattr(struct cb_getattrargs *args, + struct cb_getattrres *res, + struct cb_process_state *cps) { - struct nfs_client *clp; struct nfs_delegation *delegation; struct nfs_inode *nfsi; struct inode *inode; + res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. 
Set in cb_sequence for v4.1 */ + goto out; + res->bitmap[0] = res->bitmap[1] = 0; res->status = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); - if (clp == NULL) - goto out; - dprintk("NFS: GETATTR callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + dprintk_rcu("NFS: GETATTR callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - inode = nfs_delegation_find_inode(clp, &args->fh); + inode = nfs_delegation_find_inode(cps->clp, &args->fh); if (inode == NULL) - goto out_putclient; + goto out; nfsi = NFS_I(inode); - down_read(&nfsi->rwsem); - delegation = nfsi->delegation; + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); @@ -52,52 +59,485 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * args->bitmap[1]; res->status = 0; out_iput: - up_read(&nfsi->rwsem); + rcu_read_unlock(); iput(inode); -out_putclient: - nfs_put_client(clp); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); return res->status; } -__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) +__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, + struct cb_process_state *cps) { - struct nfs_client *clp; struct inode *inode; __be32 res; + res = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* Always set for v4.0. 
Set in cb_sequence for v4.1 */ + goto out; + + dprintk_rcu("NFS: RECALL callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + res = htonl(NFS4ERR_BADHANDLE); - clp = nfs_find_client(args->addr, 4); - if (clp == NULL) + inode = nfs_delegation_find_inode(cps->clp, &args->fh); + if (inode == NULL) goto out; + /* Set up a helper thread to actually return the delegation */ + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; + case -ENOENT: + res = htonl(NFS4ERR_BAD_STATEID); + break; + default: + res = htonl(NFS4ERR_RESOURCE); + } + trace_nfs4_recall_delegation(inode, -ntohl(res)); + iput(inode); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); + return res; +} + +#if defined(CONFIG_NFS_V4_1) - dprintk("NFS: RECALL callback request from %s\n", - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - - do { - struct nfs_client *prev = clp; - - inode = nfs_delegation_find_inode(clp, &args->fh); - if (inode != NULL) { - /* Set up a helper thread to actually return the delegation */ - switch(nfs_async_inode_return_delegation(inode, &args->stateid)) { - case 0: - res = 0; - break; - case -ENOENT: - if (res != 0) - res = htonl(NFS4ERR_BAD_STATEID); - break; - default: - res = htonl(NFS4ERR_RESOURCE); +/* + * Lookup a layout by filehandle. + * + * Note: gets a refcount on the layout hdr and on its respective inode. + * Caller must put the layout hdr and the inode. + * + * TODO: keep track of all layouts (and delegations) in a hash table + * hashed by filehandle. 
+ */ +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) +{ + struct nfs_server *server; + struct inode *ino; + struct pnfs_layout_hdr *lo; + + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + continue; + if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + break; + spin_lock(&ino->i_lock); + /* Is this layout in the process of being freed? */ + if (NFS_I(ino)->layout != lo) { + spin_unlock(&ino->i_lock); + iput(ino); + break; } - iput(inode); + pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); + return lo; + } + } + + return NULL; +} + +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&clp->cl_lock); + rcu_read_lock(); + lo = get_layout_by_fh_locked(clp, fh, stateid); + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + return lo; +} + +static u32 initiate_file_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + struct inode *ino; + struct pnfs_layout_hdr *lo; + u32 rv = NFS4ERR_NOMATCHING_LAYOUT; + LIST_HEAD(free_me_list); + + lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); + if (!lo) + goto out; + + ino = lo->plh_inode; + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, + &args->cbl_range)) + rv = NFS4ERR_DELAY; + else + rv = NFS4ERR_NOMATCHING_LAYOUT; + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); + pnfs_put_layout_hdr(lo); + iput(ino); +out: + return rv; +} + +static u32 initiate_bulk_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + int stat; + + if 
(args->cbl_recall_type == RETURN_FSID) + stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); + else + stat = pnfs_destroy_layouts_byclid(clp, true); + if (stat != 0) + return NFS4ERR_DELAY; + return NFS4ERR_NOMATCHING_LAYOUT; +} + +static u32 do_callback_layoutrecall(struct nfs_client *clp, + struct cb_layoutrecallargs *args) +{ + u32 res; + + dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); + if (args->cbl_recall_type == RETURN_FILE) + res = initiate_file_draining(clp, args); + else + res = initiate_bulk_draining(clp, args); + dprintk("%s returning %i\n", __func__, res); + return res; + +} + +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, + void *dummy, struct cb_process_state *cps) +{ + u32 res; + + dprintk("%s: -->\n", __func__); + + if (cps->clp) + res = do_callback_layoutrecall(cps->clp, args); + else + res = NFS4ERR_OP_NOT_IN_SESSION; + + dprintk("%s: exit with status = %d\n", __func__, res); + return cpu_to_be32(res); +} + +static void pnfs_recall_all_layouts(struct nfs_client *clp) +{ + struct cb_layoutrecallargs args; + + /* Pretend we got a CB_LAYOUTRECALL(ALL) */ + memset(&args, 0, sizeof(args)); + args.cbl_recall_type = RETURN_ALL; + /* FIXME we ignore errors, what should we do? 
*/ + do_callback_layoutrecall(clp, &args); +} + +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, + void *dummy, struct cb_process_state *cps) +{ + int i; + __be32 res = 0; + struct nfs_client *clp = cps->clp; + struct nfs_server *server = NULL; + + dprintk("%s: -->\n", __func__); + + if (!clp) { + res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + goto out; + } + + for (i = 0; i < args->ndevs; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + if (!server || + server->pnfs_curr_ld->id != dev->cbd_layout_type) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (server->pnfs_curr_ld && + server->pnfs_curr_ld->id == dev->cbd_layout_type) { + rcu_read_unlock(); + goto found; + } + rcu_read_unlock(); + dprintk("%s: layout type %u not found\n", + __func__, dev->cbd_layout_type); + continue; } - clp = nfs_find_client_next(prev); - nfs_put_client(prev); - } while (clp != NULL); + + found: + if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) + dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " + "deleting instead\n", __func__); + nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + } + out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res)); + kfree(args->devs); + dprintk("%s: exit with status = %u\n", + __func__, be32_to_cpu(res)); return res; } + +/* + * Validate the sequenceID sent by the server. + * Return success if the sequenceID is one more than what we last saw on + * this slot, accounting for wraparound. Increments the slot's sequence. + * + * We don't yet implement a duplicate request cache, instead we set the + * back channel ca_maxresponsesize_cached to zero. This is OK for now + * since we only currently implement idempotent callbacks anyway. + * + * We have a single slot backchannel at this time, so we don't bother + * checking the used_slots bit array on the table. The lower layer guarantees + * a single outstanding callback request at a time. 
+ */ +static __be32 +validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) +{ + struct nfs4_slot *slot; + + dprintk("%s enter. slotid %u seqid %u\n", + __func__, args->csa_slotid, args->csa_sequenceid); + + if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS) + return htonl(NFS4ERR_BADSLOT); + + slot = tbl->slots + args->csa_slotid; + dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr); + + /* Normal */ + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { + slot->seq_nr++; + goto out_ok; + } + + /* Replay */ + if (args->csa_sequenceid == slot->seq_nr) { + dprintk("%s seqid %u is a replay\n", + __func__, args->csa_sequenceid); + /* Signal process_op to set this error on next op */ + if (args->csa_cachethis == 0) + return htonl(NFS4ERR_RETRY_UNCACHED_REP); + + /* The ca_maxresponsesize_cached is 0 with no DRC */ + else if (args->csa_cachethis == 1) + return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + } + + /* Wraparound */ + if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { + slot->seq_nr = 1; + goto out_ok; + } + + /* Misordered request */ + return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); +} + +/* + * For each referring call triple, check the session's slot table for + * a match. If the slot is in use and the sequence numbers match, the + * client is still waiting for a response to the original request. 
+ */ +static bool referring_call_exists(struct nfs_client *clp, + uint32_t nrclists, + struct referring_call_list *rclists) +{ + bool status = 0; + int i, j; + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct referring_call_list *rclist; + struct referring_call *ref; + + /* + * XXX When client trunking is implemented, this becomes + * a session lookup from within the loop + */ + session = clp->cl_session; + tbl = &session->fc_slot_table; + + for (i = 0; i < nrclists; i++) { + rclist = &rclists[i]; + if (memcmp(session->sess_id.data, + rclist->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + for (j = 0; j < rclist->rcl_nrefcalls; j++) { + ref = &rclist->rcl_refcalls[j]; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " + "slotid %u\n", __func__, + ((u32 *)&rclist->rcl_sessionid.data)[0], + ((u32 *)&rclist->rcl_sessionid.data)[1], + ((u32 *)&rclist->rcl_sessionid.data)[2], + ((u32 *)&rclist->rcl_sessionid.data)[3], + ref->rc_sequenceid, ref->rc_slotid); + + spin_lock(&tbl->slot_tbl_lock); + status = (test_bit(ref->rc_slotid, tbl->used_slots) && + tbl->slots[ref->rc_slotid].seq_nr == + ref->rc_sequenceid); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + } + } + +out: + return status; +} + +__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, + struct cb_sequenceres *res, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *tbl; + struct nfs_client *clp; + int i; + __be32 status = htonl(NFS4ERR_BADSESSION); + + clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, + &args->csa_sessionid, cps->minorversion); + if (clp == NULL) + goto out; + + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* state manager is resetting the session */ + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. 
+ */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); + goto out; + } + + status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + + cps->slotid = args->csa_slotid; + + /* + * Check for pending referring calls. If a match is found, a + * related callback was received before the response to the original + * call. + */ + if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + +out: + cps->clp = clp; /* put in nfs4_callback_compound */ + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + cps->drc_status = status; + status = 0; + } else + res->csr_status = status; + + trace_nfs4_cb_sequence(args, res, status); + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, + ntohl(status), ntohl(res->csr_status)); + return status; +} + +static bool +validate_bitmap_values(unsigned long mask) +{ + return (mask & ~RCA4_TYPE_MASK_ALL) == 0; +} + +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, + struct cb_process_state *cps) +{ + __be32 status; + fmode_t flags = 0; + + status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk_rcu("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + status = cpu_to_be32(NFS4ERR_INVAL); + if (!validate_bitmap_values(args->craa_type_mask)) + goto out; + + status = cpu_to_be32(NFS4_OK); + if 
(test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; + if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) + &args->craa_type_mask)) + pnfs_recall_all_layouts(cps->clp); + if (flags) + nfs_expire_unused_delegation_types(cps->clp, flags); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +/* Reduce the fore channel's max_slots to the target value */ +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, + struct cb_process_state *cps) +{ + struct nfs4_slot_table *fc_tbl; + __be32 status; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + if (!cps->clp) /* set in cb_sequence */ + goto out; + + dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %u\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), + args->crsa_target_highest_slotid); + + fc_tbl = &cps->clp->cl_session->fc_slot_table; + + status = htonl(NFS4_OK); + + nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); + nfs41_server_notify_target_slotid_update(cps->clp); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 13619d24f02..f4ccfe6521e 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -9,8 +9,14 @@ #include <linux/sunrpc/svc.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include <linux/ratelimit.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/sunrpc/bc_xprt.h> #include "nfs4_fs.h" #include "callback.h" +#include "internal.h" +#include "nfs4session.h" #define CB_OP_TAGLEN_MAXSZ (512) #define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) @@ -20,9 +26,22 @@ 2 + 2 + 3 + 3) #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#if 
defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) +#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#endif /* CONFIG_NFS_V4_1 */ + #define NFSDBG_FACILITY NFSDBG_CALLBACK -typedef __be32 (*callback_process_op_t)(void *, void *); +/* Internal error code */ +#define NFS4ERR_RESOURCE_HDR 11050 + +typedef __be32 (*callback_process_op_t)(void *, void *, + struct cb_process_state *); typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); @@ -57,7 +76,7 @@ static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) p = xdr_inline_decode(xdr, nbytes); if (unlikely(p == NULL)) - printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); + printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); return p; } @@ -122,17 +141,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; - p = read_buf(xdr, 16); + p = read_buf(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(stateid->data, p, 16); + memcpy(stateid, p, NFS4_STATEID_SIZE); return 0; } static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) { __be32 *p; - unsigned int minor_version; __be32 status; status = decode_string(xdr, &hdr->taglen, &hdr->tag); @@ -140,22 +158,26 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound return status; /* We do not like overly long tags! 
*/ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { - printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", - __FUNCTION__, hdr->taglen); + printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n", + __func__, hdr->taglen); return htonl(NFS4ERR_RESOURCE); } p = read_buf(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - minor_version = ntohl(*p++); - /* Check minor version is zero. */ - if (minor_version != 0) { - printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n", - __FUNCTION__, minor_version); + hdr->minorversion = ntohl(*p++); + /* Check for minor version support */ + if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) { + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */ + } else { + pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " + "illegal minor version %u!\n", + __func__, hdr->minorversion); return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } - hdr->callback_ident = ntohl(*p++); hdr->nops = ntohl(*p); + dprintk("%s: minorversion %d nops %d\n", __func__, + hdr->minorversion, hdr->nops); return 0; } @@ -164,7 +186,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) __be32 *p; p = read_buf(xdr, 4); if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); + return htonl(NFS4ERR_RESOURCE_HDR); *op = ntohl(*p); return 0; } @@ -179,7 +201,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr args->addr = svc_addr(rqstp); status = decode_bitmap(xdr, args->bitmap); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } @@ -200,10 +222,311 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, args->truncate = ntohl(*p); status = decode_fh(xdr, &args->fh); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; 
+} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_layoutrecallargs *args) +{ + __be32 *p; + __be32 status = 0; + uint32_t iomode; + + args->cbl_addr = svc_addr(rqstp); + p = read_buf(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->cbl_layout_type = ntohl(*p++); + /* Depite the spec's xdr, iomode really belongs in the FILE switch, + * as it is unusable and ignored with the other types. + */ + iomode = ntohl(*p++); + args->cbl_layoutchanged = ntohl(*p++); + args->cbl_recall_type = ntohl(*p++); + + if (args->cbl_recall_type == RETURN_FILE) { + args->cbl_range.iomode = iomode; + status = decode_fh(xdr, &args->cbl_fh); + if (unlikely(status != 0)) + goto out; + + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_range.offset); + p = xdr_decode_hyper(p, &args->cbl_range.length); + status = decode_stateid(xdr, &args->cbl_stateid); + if (unlikely(status != 0)) + goto out; + } else if (args->cbl_recall_type == RETURN_FSID) { + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_fsid.major); + p = xdr_decode_hyper(p, &args->cbl_fsid.minor); + } else if (args->cbl_recall_type != RETURN_ALL) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", + __func__, + args->cbl_layout_type, iomode, + args->cbl_layoutchanged, args->cbl_recall_type); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static +__be32 decode_devicenotify_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_devicenotifyargs *args) +{ + __be32 *p; + __be32 status = 0; + u32 tmp; + int n, i; + args->ndevs = 0; + + /* Num of 
device notifications */ + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + n = ntohl(*p++); + if (n <= 0) + goto out; + if (n > ULONG_MAX / sizeof(*args->devs)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); + if (!args->devs) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + + /* Decode each dev notification */ + for (i = 0; i < n; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + + tmp = ntohl(*p++); /* bitmap size */ + if (tmp != 1) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_notify_type = ntohl(*p++); + if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && + dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + + tmp = ntohl(*p++); /* opaque size */ + if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && + (tmp != NFS4_DEVICEID4_SIZE + 8)) || + ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && + (tmp != NFS4_DEVICEID4_SIZE + 4))) { + status = htonl(NFS4ERR_INVAL); + goto err; + } + dev->cbd_layout_type = ntohl(*p++); + memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto err; + } + dev->cbd_immediate = ntohl(*p++); + } else { + dev->cbd_immediate = 0; + } + + args->ndevs++; + + dprintk("%s: type %d layout 0x%x immediate %d\n", + __func__, dev->cbd_notify_type, dev->cbd_layout_type, + dev->cbd_immediate); + } +out: + dprintk("%s: status %d ndevs %d\n", + __func__, ntohl(status), args->ndevs); + return status; +err: + kfree(args->devs); + goto out; +} + +static __be32 
decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + memcpy(sid->data, p, len); + return 0; +} + +static __be32 decode_rc_list(struct xdr_stream *xdr, + struct referring_call_list *rc_list) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &rc_list->rcl_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + rc_list->rcl_nrefcalls = ntohl(*p++); + if (rc_list->rcl_nrefcalls) { + p = read_buf(xdr, + rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls * + sizeof(*rc_list->rcl_refcalls), + GFP_KERNEL); + if (unlikely(rc_list->rcl_refcalls == NULL)) + goto out; + for (i = 0; i < rc_list->rcl_nrefcalls; i++) { + rc_list->rcl_refcalls[i].rc_sequenceid = ntohl(*p++); + rc_list->rcl_refcalls[i].rc_slotid = ntohl(*p++); + } + } + status = 0; + +out: + return status; +} + +static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_sequenceargs *args) +{ + __be32 *p; + int i; + __be32 status; + + status = decode_sessionid(xdr, &args->csa_sessionid); + if (status) + goto out; + + status = htonl(NFS4ERR_RESOURCE); + p = read_buf(xdr, 5 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + goto out; + + args->csa_addr = svc_addr(rqstp); + args->csa_sequenceid = ntohl(*p++); + args->csa_slotid = ntohl(*p++); + args->csa_highestslotid = ntohl(*p++); + args->csa_cachethis = ntohl(*p++); + args->csa_nrclists = ntohl(*p++); + args->csa_rclists = NULL; + if (args->csa_nrclists) { + args->csa_rclists = kmalloc_array(args->csa_nrclists, + sizeof(*args->csa_rclists), + GFP_KERNEL); + if (unlikely(args->csa_rclists == NULL)) + goto out; + + for (i = 0; i < args->csa_nrclists; 
i++) { + status = decode_rc_list(xdr, &args->csa_rclists[i]); + if (status) + goto out_free; + } + } + status = 0; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u " + "highestslotid %u cachethis %d nrclists %u\n", + __func__, + ((u32 *)&args->csa_sessionid)[0], + ((u32 *)&args->csa_sessionid)[1], + ((u32 *)&args->csa_sessionid)[2], + ((u32 *)&args->csa_sessionid)[3], + args->csa_sequenceid, args->csa_slotid, + args->csa_highestslotid, args->csa_cachethis, + args->csa_nrclists); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; + +out_free: + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + goto out; } +static __be32 decode_recallany_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallanyargs *args) +{ + uint32_t bitmap[2]; + __be32 *p, status; + + args->craa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->craa_objs_to_keep = ntohl(*p++); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; + + return 0; +} + +static __be32 decode_recallslot_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallslotargs *args) +{ + __be32 *p; + + args->crsa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->crsa_target_highest_slotid = ntohl(*p++); + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { __be32 *p; @@ -321,7 +644,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res) p = xdr_reserve_space(xdr, 8); if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); + return htonl(NFS4ERR_RESOURCE_HDR); *p++ = htonl(op); *p = res; return 0; @@ -349,50 +672,213 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, 
struct xdr_stream *xdr, status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); out: - dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status)); + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +#if defined(CONFIG_NFS_V4_1) + +static __be32 encode_sessionid(struct xdr_stream *xdr, + const struct nfs4_sessionid *sid) +{ + __be32 *p; + int len = NFS4_MAX_SESSIONID_LEN; + + p = xdr_reserve_space(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + memcpy(p, sid, len); + return 0; +} + +static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + const struct cb_sequenceres *res) +{ + __be32 *p; + __be32 status = res->csr_status; + + if (unlikely(status != 0)) + goto out; + + encode_sessionid(xdr, &res->csr_sessionid); + + p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); + + *p++ = htonl(res->csr_sequenceid); + *p++ = htonl(res->csr_slotid); + *p++ = htonl(res->csr_highestslotid); + *p++ = htonl(res->csr_target_highestslotid); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } -static __be32 process_op(struct svc_rqst *rqstp, +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + if (op_nr == OP_CB_SEQUENCE) { + if (nop != 0) + return htonl(NFS4ERR_SEQUENCE_POS); + } else { + if (nop == 0) + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + } + + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: + case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: + case OP_CB_WANTS_CANCELLED: + case OP_CB_NOTIFY_LOCK: + return htonl(NFS4ERR_NOTSUPP); + + default: + return 
htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static void nfs4_callback_free_slot(struct nfs4_session *session) +{ + struct nfs4_slot_table *tbl = &session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); + /* + * Let the state manager know callback processing done. + * A single slot, so highest used slotid is either 0 or -1 + */ + tbl->highest_used_slotid = NFS4_NO_SLOT; + nfs4_slot_tbl_drain_complete(tbl); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ + if (cps->slotid != NFS4_NO_SLOT) + nfs4_callback_free_slot(cps->clp->cl_session); +} + +#else /* CONFIG_NFS_V4_1 */ + +static __be32 +preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} + +static void nfs4_cb_free_slot(struct cb_process_state *cps) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +#ifdef CONFIG_NFS_V4_2 +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + __be32 status = preprocess_nfs41_op(nop, op_nr, op); + if (status != htonl(NFS4ERR_OP_ILLEGAL)) + return status; + + if (op_nr == OP_CB_OFFLOAD) + return htonl(NFS4ERR_NOTSUPP); + return htonl(NFS4ERR_OP_ILLEGAL); +} +#else /* CONFIG_NFS_V4_2 */ +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} +#endif /* CONFIG_NFS_V4_2 */ + +static __be32 +preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) +{ + switch (op_nr) { + case OP_CB_GETATTR: + case OP_CB_RECALL: + *op = &callback_ops[op_nr]; + break; + default: + return htonl(NFS4ERR_OP_ILLEGAL); + } + + return htonl(NFS_OK); +} + +static __be32 process_op(int nop, struct svc_rqst *rqstp, struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp) + struct xdr_stream *xdr_out, void *resp, + struct cb_process_state *cps) { struct callback_op *op = &callback_ops[0]; - unsigned int op_nr = OP_CB_ILLEGAL; 
- __be32 status = 0; + unsigned int op_nr; + __be32 status; long maxlen; __be32 res; - dprintk("%s: start\n", __FUNCTION__); + dprintk("%s: start\n", __func__); status = decode_op_hdr(xdr_in, &op_nr); - if (likely(status == 0)) { - switch (op_nr) { - case OP_CB_GETATTR: - case OP_CB_RECALL: - op = &callback_ops[op_nr]; - break; - default: - op_nr = OP_CB_ILLEGAL; - op = &callback_ops[0]; - status = htonl(NFS4ERR_OP_ILLEGAL); - } + if (unlikely(status)) + return status; + + dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", + __func__, cps->minorversion, nop, op_nr); + + switch (cps->minorversion) { + case 0: + status = preprocess_nfs4_op(op_nr, &op); + break; + case 1: + status = preprocess_nfs41_op(nop, op_nr, &op); + break; + case 2: + status = preprocess_nfs42_op(nop, op_nr, &op); + break; + default: + status = htonl(NFS4ERR_MINOR_VERS_MISMATCH); + } + + if (status == htonl(NFS4ERR_OP_ILLEGAL)) + op_nr = OP_CB_ILLEGAL; + if (status) + goto encode_hdr; + + if (cps->drc_status) { + status = cps->drc_status; + goto encode_hdr; } maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { - if (likely(status == 0 && op->decode_args != NULL)) - status = op->decode_args(rqstp, xdr_in, argp); - if (likely(status == 0 && op->process_op != NULL)) - status = op->process_op(argp, resp); + status = op->decode_args(rqstp, xdr_in, argp); + if (likely(status == 0)) + status = op->process_op(argp, resp, cps); } else status = htonl(NFS4ERR_RESOURCE); +encode_hdr: res = encode_op_hdr(xdr_out, op_nr, status); - if (status == 0) - status = res; + if (unlikely(res)) + return res; if (op->encode_res != NULL && status == 0) status = op->encode_res(rqstp, xdr_out, resp); - dprintk("%s: done, status = %d\n", __FUNCTION__, ntohl(status)); + dprintk("%s: done, status = %d\n", __func__, ntohl(status)); return status; } @@ -401,37 +887,59 @@ static __be32 process_op(struct svc_rqst *rqstp, */ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) 
{ - struct cb_compound_hdr_arg hdr_arg; - struct cb_compound_hdr_res hdr_res; + struct cb_compound_hdr_arg hdr_arg = { 0 }; + struct cb_compound_hdr_res hdr_res = { NULL }; struct xdr_stream xdr_in, xdr_out; - __be32 *p; - __be32 status; - unsigned int nops = 1; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, + .clp = NULL, + .slotid = NFS4_NO_SLOT, + .net = SVC_NET(rqstp), + }; + unsigned int nops = 0; - dprintk("%s: start\n", __FUNCTION__); + dprintk("%s: start\n", __func__); xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); xdr_init_encode(&xdr_out, &rqstp->rq_res, p); - decode_compound_hdr_arg(&xdr_in, &hdr_arg); + status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); + if (status == __constant_htonl(NFS4ERR_RESOURCE)) + return rpc_garbage_args; + + if (hdr_arg.minorversion == 0) { + cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); + if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) + return rpc_drop_reply; + } + + cps.minorversion = hdr_arg.minorversion; hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; - hdr_res.nops = NULL; - encode_compound_hdr_res(&xdr_out, &hdr_res); - - for (;;) { - status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp); - if (status != 0) - break; - if (nops == hdr_arg.nops) - break; + if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) + return rpc_system_err; + + while (status == 0 && nops != hdr_arg.nops) { + status = process_op(nops, rqstp, &xdr_in, + argp, &xdr_out, resp, &cps); nops++; } + + /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. 
Return + * resource error in cb_compound status without returning op */ + if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) { + status = htonl(NFS4ERR_RESOURCE); + nops--; + } + *hdr_res.status = status; *hdr_res.nops = htonl(nops); - dprintk("%s: done, status = %u\n", __FUNCTION__, ntohl(status)); + nfs4_cb_free_slot(&cps); + nfs_put_client(cps.clp); + dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; } @@ -452,7 +960,37 @@ static struct callback_op callback_ops[] = { .process_op = (callback_process_op_t)nfs4_callback_recall, .decode_args = (callback_decode_arg_t)decode_recall_args, .res_maxsize = CB_OP_RECALL_RES_MAXSZ, - } + }, +#if defined(CONFIG_NFS_V4_1) + [OP_CB_LAYOUTRECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, + .decode_args = + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, + [OP_CB_NOTIFY_DEVICEID] = { + .process_op = (callback_process_op_t)nfs4_callback_devicenotify, + .decode_args = + (callback_decode_arg_t)decode_devicenotify_args, + .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, + }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, + .encode_res = (callback_encode_res_t)encode_cb_sequence_res, + .res_maxsize = CB_OP_SEQUENCE_RES_MAXSZ, + }, + [OP_CB_RECALL_ANY] = { + .process_op = (callback_process_op_t)nfs4_callback_recallany, + .decode_args = (callback_decode_arg_t)decode_recallany_args, + .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, + }, + [OP_CB_RECALL_SLOT] = { + .process_op = (callback_process_op_t)nfs4_callback_recallslot, + .decode_args = (callback_decode_arg_t)decode_recallslot_args, + .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, + }, +#endif /* CONFIG_NFS_V4_1 */ }; /* @@ -480,5 +1018,14 @@ struct svc_version nfs4_callback_version1 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = 
NULL, + .vs_hidden = 1, }; +struct svc_version nfs4_callback_version4 = { + .vs_vers = 4, + .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), + .vs_proc = nfs4_callback_procedures1, + .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, + .vs_dispatch = NULL, + .vs_hidden = 1, +}; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c5c0175898f..1d09289c8f0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -35,73 +35,112 @@ #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> +#include <linux/slab.h> +#include <linux/idr.h> #include <net/ipv6.h> #include <linux/nfs_xdr.h> +#include <linux/sunrpc/bc_xprt.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> -#include <asm/system.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" +#include "pnfs.h" +#include "nfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_CLIENT -static DEFINE_SPINLOCK(nfs_client_lock); -static LIST_HEAD(nfs_client_list); -static LIST_HEAD(nfs_volume_list); static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); +static DEFINE_SPINLOCK(nfs_version_lock); +static DEFINE_MUTEX(nfs_version_mutex); +static LIST_HEAD(nfs_versions); /* * RPC cruft for NFS */ -static struct rpc_version *nfs_version[5] = { - [2] = &nfs_version2, -#ifdef CONFIG_NFS_V3 - [3] = &nfs_version3, -#endif -#ifdef CONFIG_NFS_V4 - [4] = &nfs_version4, -#endif +static const struct rpc_version *nfs_version[5] = { + [2] = NULL, + [3] = NULL, + [4] = NULL, }; -struct rpc_program nfs_program = { +const struct rpc_program nfs_program = { .name = "nfs", .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, .stats = &nfs_rpcstat, - .pipe_dir_name = "/nfs", + .pipe_dir_name = NFS_PIPE_DIRNAME, }; struct rpc_stat nfs_rpcstat = { .program = &nfs_program }; +static struct nfs_subversion *find_nfs_version(unsigned int version) +{ + struct nfs_subversion *nfs; + spin_lock(&nfs_version_lock); -#ifdef 
CONFIG_NFS_V3_ACL -static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; -static struct rpc_version * nfsacl_version[] = { - [3] = &nfsacl_version3, -}; + list_for_each_entry(nfs, &nfs_versions, list) { + if (nfs->rpc_ops->version == version) { + spin_unlock(&nfs_version_lock); + return nfs; + } + } -struct rpc_program nfsacl_program = { - .name = "nfsacl", - .number = NFS_ACL_PROGRAM, - .nrvers = ARRAY_SIZE(nfsacl_version), - .version = nfsacl_version, - .stats = &nfsacl_rpcstat, -}; -#endif /* CONFIG_NFS_V3_ACL */ - -struct nfs_client_initdata { - const char *hostname; - const struct sockaddr *addr; - size_t addrlen; - const struct nfs_rpc_ops *rpc_ops; - int proto; -}; + spin_unlock(&nfs_version_lock); + return ERR_PTR(-EPROTONOSUPPORT); +} + +struct nfs_subversion *get_nfs_version(unsigned int version) +{ + struct nfs_subversion *nfs = find_nfs_version(version); + + if (IS_ERR(nfs)) { + mutex_lock(&nfs_version_mutex); + request_module("nfsv%d", version); + nfs = find_nfs_version(version); + mutex_unlock(&nfs_version_mutex); + } + + if (!IS_ERR(nfs)) + try_module_get(nfs->owner); + return nfs; +} + +void put_nfs_version(struct nfs_subversion *nfs) +{ + module_put(nfs->owner); +} + +void register_nfs_version(struct nfs_subversion *nfs) +{ + spin_lock(&nfs_version_lock); + + list_add(&nfs->list, &nfs_versions); + nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; + + spin_unlock(&nfs_version_lock); +} +EXPORT_SYMBOL_GPL(register_nfs_version); + +void unregister_nfs_version(struct nfs_subversion *nfs) +{ + spin_lock(&nfs_version_lock); + + nfs_version[nfs->rpc_ops->version] = NULL; + list_del(&nfs->list); + + spin_unlock(&nfs_version_lock); +} +EXPORT_SYMBOL_GPL(unregister_nfs_version); /* * Allocate a shared client record @@ -109,20 +148,19 @@ struct nfs_client_initdata { * Since these are allocated/deallocated very rarely, we don't * bother putting them in a slab cache... 
*/ -static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) +struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; + struct rpc_cred *cred; + int err = -ENOMEM; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; - clp->rpc_ops = cl_init->rpc_ops; + clp->cl_nfs_mod = cl_init->nfs_mod; + try_module_get(clp->cl_nfs_mod->owner); - if (cl_init->rpc_ops->version == 4) { - if (nfs_callback_up() < 0) - goto error_2; - __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); - } + clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; atomic_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; @@ -131,183 +169,228 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_addrlen = cl_init->addrlen; if (cl_init->hostname) { + err = -ENOMEM; clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); if (!clp->cl_hostname) - goto error_3; + goto error_cleanup; } INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_proto = cl_init->proto; + clp->cl_net = get_net(cl_init->net); -#ifdef CONFIG_NFS_V4 - init_rwsem(&clp->cl_sem); - INIT_LIST_HEAD(&clp->cl_delegations); - spin_lock_init(&clp->cl_lock); - INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); - rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); - clp->cl_boot_time = CURRENT_TIME; - clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; -#endif + cred = rpc_lookup_machine_cred("*"); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; + nfs_fscache_get_client_cookie(clp); return clp; -error_3: - if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(); -error_2: +error_cleanup: + put_nfs_version(clp->cl_nfs_mod); kfree(clp); error_0: - return NULL; + return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(nfs_alloc_client); -static void nfs4_shutdown_client(struct nfs_client *clp) +#if IS_ENABLED(CONFIG_NFS_V4) +void nfs_cleanup_cb_ident_idr(struct net *net) +{ + 
struct nfs_net *nn = net_generic(net, nfs_net_id); + + idr_destroy(&nn->cb_ident_idr); +} + +/* nfs_client_lock held */ +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + if (clp->cl_cb_ident) + idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); +} + +static void pnfs_init_server(struct nfs_server *server) +{ + rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); +} + +#else +void nfs_cleanup_cb_ident_idr(struct net *net) +{ +} + +static void nfs_cb_idr_remove_locked(struct nfs_client *clp) +{ +} + +static void pnfs_init_server(struct nfs_server *server) { -#ifdef CONFIG_NFS_V4 - if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) - nfs4_kill_renewd(clp); - BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners)); - if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) - nfs_idmap_delete(clp); -#endif } +#endif /* CONFIG_NFS_V4 */ + /* * Destroy a shared client record */ -static void nfs_free_client(struct nfs_client *clp) +void nfs_free_client(struct nfs_client *clp) { dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); - nfs4_shutdown_client(clp); + nfs_fscache_release_client_cookie(clp); /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); - if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(); + if (clp->cl_machine_cred != NULL) + put_rpccred(clp->cl_machine_cred); + put_net(clp->cl_net); + put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp); dprintk("<-- nfs_free_client()\n"); } +EXPORT_SYMBOL_GPL(nfs_free_client); /* * Release a reference to a shared client record */ void nfs_put_client(struct nfs_client *clp) { + struct nfs_net *nn; + if (!clp) return; dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); + nn = net_generic(clp->cl_net, nfs_net_id); - if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { + if (atomic_dec_and_lock(&clp->cl_count, 
&nn->nfs_client_lock)) { list_del(&clp->cl_share_link); - spin_unlock(&nfs_client_lock); + nfs_cb_idr_remove_locked(clp); + spin_unlock(&nn->nfs_client_lock); - BUG_ON(!list_empty(&clp->cl_superblocks)); + WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); - nfs_free_client(clp); + clp->rpc_ops->free_client(clp); } } +EXPORT_SYMBOL_GPL(nfs_put_client); -static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, - const struct sockaddr_in *sa2) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. 
+ */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) { - return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; -} + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1, - const struct sockaddr_in6 *sa2) -{ - return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr); -} + if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) + return 0; + else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) + return sin1->sin6_scope_id == sin2->sin6_scope_id; -static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) + return 1; +} +#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ +static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, + const struct sockaddr *sa2) { - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, - (const struct sockaddr_in *)sa2); - case AF_INET6: - return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1, - (const struct sockaddr_in6 *)sa2); - } - BUG(); + return 0; } +#endif /* - * Find a client by IP address and protocol version - * - returns NULL if no such client + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. 
*/ -struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) +static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, + const struct sockaddr *sa2) { - struct nfs_client *clp; + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - spin_lock(&nfs_client_lock); - list_for_each_entry(clp, &nfs_client_list, cl_share_link) { - struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; +} - /* Don't match clients that failed to initialise properly */ - if (clp->cl_cons_state != NFS_CS_READY) - continue; +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops->version != nfsversion) - continue; + return nfs_sockaddr_match_ipaddr6(sa1, sa2) && + (sin1->sin6_port == sin2->sin6_port); +} - if (addr->sa_family != clap->sa_family) - continue; - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(addr, clap)) - continue; +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; - } - spin_unlock(&nfs_client_lock); - return NULL; + return nfs_sockaddr_match_ipaddr4(sa1, sa2) && + (sin1->sin_port == sin2->sin_port); } +#if defined(CONFIG_NFS_V4_1) /* - * Find a client by IP address and protocol version - * - returns NULL if no such client + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, excluding the port number. 
*/ -struct nfs_client *nfs_find_client_next(struct nfs_client *clp) +int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) { - struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr; - u32 nfsvers = clp->rpc_ops->version; - - spin_lock(&nfs_client_lock); - list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) { - struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; - - /* Don't match clients that failed to initialise properly */ - if (clp->cl_cons_state != NFS_CS_READY) - continue; + if (sa1->sa_family != sa2->sa_family) + return 0; - /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops->version != nfsvers) - continue; + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_match_ipaddr4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_match_ipaddr6(sa1, sa2); + } + return 0; +} +EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr); +#endif /* CONFIG_NFS_V4_1 */ - if (sap->sa_family != clap->sa_family) - continue; - /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(sap, clap)) - continue; +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. 
+ */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; - atomic_inc(&clp->cl_count); - spin_unlock(&nfs_client_lock); - return clp; + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4(sa1, sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6(sa1, sa2); } - spin_unlock(&nfs_client_lock); - return NULL; + return 0; } /* @@ -317,21 +400,26 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; + const struct sockaddr *sap = data->addr; + struct nfs_net *nn = net_generic(data->net, nfs_net_id); - list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; /* Different NFS versions cannot share the same nfs_client */ - if (clp->rpc_ops != data->rpc_ops) + if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; if (clp->cl_proto != data->proto) continue; - + /* Match nfsv4 minorversion */ + if (clp->cl_minorversion != data->minorversion) + continue; /* Match the full socket address */ - if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) + if (!nfs_sockaddr_cmp(sap, clap)) continue; atomic_inc(&clp->cl_count); @@ -340,54 +428,28 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat return NULL; } +static bool nfs_client_init_is_complete(const struct nfs_client *clp) +{ + return clp->cl_cons_state != NFS_CS_INITING; +} + +int nfs_wait_client_init_complete(const struct nfs_client *clp) +{ + return wait_event_killable(nfs_client_active_wq, + nfs_client_init_is_complete(clp)); +} +EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete); + /* - * Look up a client by IP address and 
protocol version - * - creates a new record if one doesn't yet exist + * Found an existing client. Make sure it's ready before returning. */ -static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) +static struct nfs_client * +nfs_found_client(const struct nfs_client_initdata *cl_init, + struct nfs_client *clp) { - struct nfs_client *clp, *new = NULL; int error; - dprintk("--> nfs_get_client(%s,v%u)\n", - cl_init->hostname ?: "", cl_init->rpc_ops->version); - - /* see if the client already exists */ - do { - spin_lock(&nfs_client_lock); - - clp = nfs_match_client(cl_init); - if (clp) - goto found_client; - if (new) - goto install_client; - - spin_unlock(&nfs_client_lock); - - new = nfs_alloc_client(cl_init); - } while (new); - - return ERR_PTR(-ENOMEM); - - /* install a new client and return with it unready */ -install_client: - clp = new; - list_add(&clp->cl_share_link, &nfs_client_list); - spin_unlock(&nfs_client_lock); - dprintk("--> nfs_get_client() = %p [new]\n", clp); - return clp; - - /* found an existing client - * - make sure it's ready before returning - */ -found_client: - spin_unlock(&nfs_client_lock); - - if (new) - nfs_free_client(new); - - error = wait_event_killable(nfs_client_active_wq, - clp->cl_cons_state != NFS_CS_INITING); + error = nfs_wait_client_init_complete(clp); if (error < 0) { nfs_put_client(clp); return ERR_PTR(-ERESTARTSYS); @@ -399,37 +461,87 @@ found_client: return ERR_PTR(error); } - BUG_ON(clp->cl_cons_state != NFS_CS_READY); + smp_rmb(); - dprintk("--> nfs_get_client() = %p [share]\n", clp); + dprintk("<-- %s found nfs_client %p for %s\n", + __func__, clp, cl_init->hostname ?: ""); return clp; } /* + * Look up a client by IP address and protocol version + * - creates a new record if one doesn't yet exist + */ +struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour) +{ + struct nfs_client 
*clp, *new = NULL; + struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); + const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; + + dprintk("--> nfs_get_client(%s,v%u)\n", + cl_init->hostname ?: "", rpc_ops->version); + + /* see if the client already exists */ + do { + spin_lock(&nn->nfs_client_lock); + + clp = nfs_match_client(cl_init); + if (clp) { + spin_unlock(&nn->nfs_client_lock); + if (new) + new->rpc_ops->free_client(new); + return nfs_found_client(cl_init, clp); + } + if (new) { + list_add_tail(&new->cl_share_link, + &nn->nfs_client_list); + spin_unlock(&nn->nfs_client_lock); + new->cl_flags = cl_init->init_flags; + return rpc_ops->init_client(new, timeparms, ip_addr); + } + + spin_unlock(&nn->nfs_client_lock); + + new = rpc_ops->alloc_client(cl_init); + } while (!IS_ERR(new)); + + dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", + cl_init->hostname ?: "", PTR_ERR(new)); + return new; +} +EXPORT_SYMBOL_GPL(nfs_get_client); + +/* * Mark a server as ready or failed */ -static void nfs_mark_client_ready(struct nfs_client *clp, int state) +void nfs_mark_client_ready(struct nfs_client *clp, int state) { + smp_wmb(); clp->cl_cons_state = state; wake_up_all(&nfs_client_active_wq); } +EXPORT_SYMBOL_GPL(nfs_mark_client_ready); /* * Initialise the timeout values for a connection */ -static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, +void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; - if (!to->to_retries) - to->to_retries = 2; switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_TCP_RETRANS; if (to->to_initval == 0) - to->to_initval = 60 * HZ; + to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; @@ -441,27 +553,31 @@ static void 
nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: - default: + if (to->to_retries == 0) + to->to_retries = NFS_DEF_UDP_RETRANS; if (!to->to_initval) - to->to_initval = 11 * HZ / 10; + to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; + default: + BUG(); } } +EXPORT_SYMBOL_GPL(nfs_init_timeout_values); /* * Create an RPC client handle */ -static int nfs_create_rpc_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - rpc_authflavor_t flavor, - int flags) +int nfs_create_rpc_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + rpc_authflavor_t flavor) { struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { + .net = clp->cl_net, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, @@ -470,29 +586,38 @@ static int nfs_create_rpc_client(struct nfs_client *clp, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, - .flags = flags, }; + if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; + if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; + if (!IS_ERR(clp->cl_rpcclient)) return 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("%s: cannot create RPC client. 
Error = %ld\n", - __FUNCTION__, PTR_ERR(clnt)); + __func__, PTR_ERR(clnt)); return PTR_ERR(clnt); } clp->cl_rpcclient = clnt; return 0; } +EXPORT_SYMBOL_GPL(nfs_create_rpc_client); /* * Version 2 or 3 client destruction */ static void nfs_destroy_server(struct nfs_server *server) { - if (!(server->flags & NFS_MOUNT_NONLM)) + if (server->nlm_host) nlmclnt_done(server->nlm_host); } @@ -507,16 +632,26 @@ static int nfs_start_lockd(struct nfs_server *server) .hostname = clp->cl_hostname, .address = (struct sockaddr *)&clp->cl_addr, .addrlen = clp->cl_addrlen, - .protocol = server->flags & NFS_MOUNT_TCP ? - IPPROTO_TCP : IPPROTO_UDP, .nfs_version = clp->rpc_ops->version, + .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? + 1 : 0, + .net = clp->cl_net, }; if (nlm_init.nfs_version > 3) return 0; - if (server->flags & NFS_MOUNT_NONLM) + if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && + (server->flags & NFS_MOUNT_LOCAL_FCNTL)) return 0; + switch (clp->cl_proto) { + default: + nlm_init.protocol = IPPROTO_TCP; + break; + case XPRT_TRANSPORT_UDP: + nlm_init.protocol = IPPROTO_UDP; + } + host = nlmclnt_init(&nlm_init); if (IS_ERR(host)) return PTR_ERR(host); @@ -527,47 +662,18 @@ static int nfs_start_lockd(struct nfs_server *server) } /* - * Initialise an NFSv3 ACL client connection - */ -#ifdef CONFIG_NFS_V3_ACL -static void nfs_init_server_aclclient(struct nfs_server *server) -{ - if (server->nfs_client->rpc_ops->version != 3) - goto out_noacl; - if (server->flags & NFS_MOUNT_NOACL) - goto out_noacl; - - server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); - if (IS_ERR(server->client_acl)) - goto out_noacl; - - /* No errors! 
Assume that Sun nfsacls are supported */ - server->caps |= NFS_CAP_ACLS; - return; - -out_noacl: - server->caps &= ~NFS_CAP_ACLS; -} -#else -static inline void nfs_init_server_aclclient(struct nfs_server *server) -{ - server->flags &= ~NFS_MOUNT_NOACL; - server->caps &= ~NFS_CAP_ACLS; -} -#endif - -/* * Create a general RPC client */ -static int nfs_init_server_rpcclient(struct nfs_server *server, +int nfs_init_server_rpcclient(struct nfs_server *server, const struct rpc_timeout *timeo, rpc_authflavor_t pseudoflavour) { struct nfs_client *clp = server->nfs_client; - server->client = rpc_clone_client(clp->cl_rpcclient); + server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, + pseudoflavour); if (IS_ERR(server->client)) { - dprintk("%s: couldn't create rpc_client!\n", __FUNCTION__); + dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); } @@ -575,66 +681,67 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, timeo, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; - - if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { - struct rpc_auth *auth; - - auth = rpcauth_create(pseudoflavour, server->client); - if (IS_ERR(auth)) { - dprintk("%s: couldn't create credcache!\n", __FUNCTION__); - return PTR_ERR(auth); - } - } server->client->cl_softrtry = 0; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; return 0; } +EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); -/* - * Initialise an NFS2 or NFS3 client +/** + * nfs_init_client - Initialise an NFS2 or NFS3 client + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: IP presentation address (not used) + * + * Returns pointer to an NFS client, or an ERR_PTR value. 
*/ -static int nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const struct nfs_parsed_mount_data *data) +struct nfs_client *nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr) { int error; if (clp->cl_cons_state == NFS_CS_READY) { /* the client is already initialised */ dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); - return 0; + return clp; } /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); - return 0; + return clp; error: nfs_mark_client_ready(clp, error); + nfs_put_client(clp); dprintk("<-- nfs_init_client() = xerror %d\n", error); - return error; + return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(nfs_init_client); /* * Create a version 2 or 3 client */ static int nfs_init_server(struct nfs_server *server, - const struct nfs_parsed_mount_data *data) + const struct nfs_parsed_mount_data *data, + struct nfs_subversion *nfs_mod) { struct nfs_client_initdata cl_init = { .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, .addrlen = data->nfs_server.addrlen, - .rpc_ops = &nfs_v2_clientops, + .nfs_mod = nfs_mod, .proto = data->nfs_server.protocol, + .net = data->net, }; struct rpc_timeout timeparms; struct nfs_client *clp; @@ -642,28 +749,26 @@ static int nfs_init_server(struct nfs_server *server, dprintk("--> nfs_init_server()\n"); -#ifdef CONFIG_NFS_V3 - if (data->flags & NFS_MOUNT_VER3) - cl_init.rpc_ops = &nfs_v3_clientops; -#endif + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + if (data->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = 
nfs_get_client(&cl_init); + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); } - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, - data->timeo, data->retrans); - error = nfs_init_client(clp, &timeparms, data); - if (error < 0) - goto error; - server->nfs_client = clp; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; + server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -680,13 +785,25 @@ static int nfs_init_server(struct nfs_server *server, if (error < 0) goto error; - error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); + server->port = data->nfs_server.port; + server->auth_info = data->auth_info; + + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); if (error < 0) goto error; + /* Preserve the values of mount_server-related mount options */ + if (data->mount_server.addrlen) { + memcpy(&server->mountd_address, &data->mount_server.address, + data->mount_server.addrlen); + server->mountd_addrlen = data->mount_server.addrlen; + } + server->mountd_version = data->mount_server.version; + server->mountd_port = data->mount_server.port; + server->mountd_protocol = data->mount_server.protocol; + server->namelen = data->namlen; - /* Create a client RPC handle for the NFSv3 ACL management interface */ - nfs_init_server_aclclient(server); dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); return 0; @@ -700,7 +817,9 @@ error: /* * Load up the server record from information gained in an fsinfo record */ -static void nfs_server_set_fsinfo(struct 
nfs_server *server, struct nfs_fsinfo *fsinfo) +static void nfs_server_set_fsinfo(struct nfs_server *server, + struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { unsigned long max_rpc_payload; @@ -722,6 +841,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; if (server->wsize > max_rpc_payload) @@ -729,11 +849,12 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE) - server->dtsize = PAGE_CACHE_SIZE; + if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES) + server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES; if (server->dtsize > server->rsize) server->dtsize = server->rsize; @@ -744,6 +865,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->maxfilesize = fsinfo->maxfilesize; + server->time_delta = fsinfo->time_delta; + /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); } @@ -751,7 +874,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * /* * Probe filesystem information, including the FSID on v2/v3 */ -static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) { struct nfs_fsinfo fsinfo; struct nfs_client *clp = server->nfs_client; @@ -766,16 +889,12 @@ static int 
nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str } fsinfo.fattr = fattr; - nfs_fattr_init(fattr); + fsinfo.layouttype = 0; error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; - nfs_server_set_fsinfo(server, &fsinfo); - error = bdi_init(&server->backing_dev_info); - if (error) - goto out_error; - + nfs_server_set_fsinfo(server, mntfh, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { @@ -795,24 +914,63 @@ out_error: dprintk("nfs_probe_fsinfo: error = %d\n", -error); return error; } +EXPORT_SYMBOL_GPL(nfs_probe_fsinfo); /* * Copy useful information when duplicating a server record */ -static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) +void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; + target->rsize = source->rsize; + target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; target->caps = source->caps; + target->options = source->options; + target->auth_info = source->auth_info; } +EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); + +void nfs_server_insert_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + spin_lock(&nn->nfs_client_lock); + list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); + list_add_tail(&server->master_link, &nn->nfs_volume_list); + clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + spin_unlock(&nn->nfs_client_lock); + +} +EXPORT_SYMBOL_GPL(nfs_server_insert_lists); + +void nfs_server_remove_lists(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs_net *nn; + + if (clp == NULL) + return; + nn = net_generic(clp->cl_net, nfs_net_id); + spin_lock(&nn->nfs_client_lock); + 
list_del_rcu(&server->client_link); + if (list_empty(&clp->cl_superblocks)) + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + list_del(&server->master_link); + spin_unlock(&nn->nfs_client_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nfs_server_remove_lists); /* * Allocate and initialise a server record */ -static struct nfs_server *nfs_alloc_server(void) +struct nfs_server *nfs_alloc_server(void) { struct nfs_server *server; @@ -825,8 +983,10 @@ static struct nfs_server *nfs_alloc_server(void) /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); + INIT_LIST_HEAD(&server->delegations); + INIT_LIST_HEAD(&server->layouts); + INIT_LIST_HEAD(&server->state_owners_lru); - init_waitqueue_head(&server->active_wq); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); @@ -835,8 +995,19 @@ static struct nfs_server *nfs_alloc_server(void) return NULL; } + if (bdi_init(&server->backing_dev_info)) { + nfs_free_iostats(server->io_stats); + kfree(server); + return NULL; + } + + ida_init(&server->openowner_id); + ida_init(&server->lockowner_id); + pnfs_init_server(server); + return server; } +EXPORT_SYMBOL_GPL(nfs_alloc_server); /* * Free up a server record @@ -845,10 +1016,7 @@ void nfs_free_server(struct nfs_server *server) { dprintk("--> nfs_free_server()\n"); - spin_lock(&nfs_client_lock); - list_del(&server->client_link); - list_del(&server->master_link); - spin_unlock(&nfs_client_lock); + nfs_server_remove_lists(server); if (server->destroy != NULL) server->destroy(server); @@ -860,366 +1028,90 @@ void nfs_free_server(struct nfs_server *server) nfs_put_client(server->nfs_client); + ida_destroy(&server->lockowner_id); + ida_destroy(&server->openowner_id); nfs_free_iostats(server->io_stats); bdi_destroy(&server->backing_dev_info); kfree(server); nfs_release_automount_timer(); dprintk("<-- nfs_free_server()\n"); } +EXPORT_SYMBOL_GPL(nfs_free_server); /* * Create a version 2 or 3 volume record * 
- keyed on server and FSID */ -struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, - struct nfs_fh *mntfh) +struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) { struct nfs_server *server; - struct nfs_fattr fattr; + struct nfs_fattr *fattr; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); + error = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto error; + /* Get a client representation */ - error = nfs_init_server(server, data); + error = nfs_init_server(server, mount_info->parsed, nfs_mod); if (error < 0) goto error; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - /* Probe the root fh to retrieve its FSID */ - error = nfs_probe_fsinfo(server, mntfh, &fattr); + error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; - if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) + if (!(mount_info->parsed->flags & NFS_MOUNT_NORDIRPLUS)) server->caps |= NFS_CAP_READDIRPLUS; } else { if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } - if (!(fattr.valid & NFS_ATTR_FATTR)) { - error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); + if (!(fattr->valid & NFS_ATTR_FATTR)) { + error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; } } - memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - 
BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - + nfs_server_insert_lists(server); server->mount_time = jiffies; + nfs_free_fattr(fattr); return server; error: + nfs_free_fattr(fattr); nfs_free_server(server); return ERR_PTR(error); } - -#ifdef CONFIG_NFS_V4 -/* - * Initialise an NFS4 client record - */ -static int nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr, - rpc_authflavor_t authflavour) -{ - int error; - - if (clp->cl_cons_state == NFS_CS_READY) { - /* the client is initialised already */ - dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); - return 0; - } - - /* Check NFS protocol revision and initialize RPC op vector */ - clp->rpc_ops = &nfs_v4_clientops; - - error = nfs_create_rpc_client(clp, timeparms, authflavour, - RPC_CLNT_CREATE_DISCRTRY); - if (error < 0) - goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); - - error = nfs_idmap_new(clp); - if (error < 0) { - dprintk("%s: failed to create idmapper. 
Error = %d\n", - __FUNCTION__, error); - goto error; - } - __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); - - nfs_mark_client_ready(clp, NFS_CS_READY); - return 0; - -error: - nfs_mark_client_ready(clp, error); - dprintk("<-- nfs4_init_client() = xerror %d\n", error); - return error; -} - -/* - * Set up an NFS4 client - */ -static int nfs4_set_client(struct nfs_server *server, - const char *hostname, - const struct sockaddr *addr, - const size_t addrlen, - const char *ip_addr, - rpc_authflavor_t authflavour, - int proto, const struct rpc_timeout *timeparms) -{ - struct nfs_client_initdata cl_init = { - .hostname = hostname, - .addr = addr, - .addrlen = addrlen, - .rpc_ops = &nfs_v4_clientops, - .proto = proto, - }; - struct nfs_client *clp; - int error; - - dprintk("--> nfs4_set_client()\n"); - - /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - error = PTR_ERR(clp); - goto error; - } - error = nfs4_init_client(clp, timeparms, ip_addr, authflavour); - if (error < 0) - goto error_put; - - server->nfs_client = clp; - dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); - return 0; - -error_put: - nfs_put_client(clp); -error: - dprintk("<-- nfs4_set_client() = xerror %d\n", error); - return error; -} - -/* - * Create a version 4 volume record - */ -static int nfs4_init_server(struct nfs_server *server, - const struct nfs_parsed_mount_data *data) -{ - struct rpc_timeout timeparms; - int error; - - dprintk("--> nfs4_init_server()\n"); - - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, - data->timeo, data->retrans); - - /* Get a client record */ - error = nfs4_set_client(server, - data->nfs_server.hostname, - (const struct sockaddr *)&data->nfs_server.address, - data->nfs_server.addrlen, - data->client_address, - data->auth_flavors[0], - data->nfs_server.protocol, - &timeparms); - if (error < 0) - goto error; - - /* Initialise the client representation from the mount data */ - server->flags = 
data->flags & NFS_MOUNT_FLAGMASK; - server->caps |= NFS_CAP_ATOMIC_OPEN; - - if (data->rsize) - server->rsize = nfs_block_size(data->rsize, NULL); - if (data->wsize) - server->wsize = nfs_block_size(data->wsize, NULL); - - server->acregmin = data->acregmin * HZ; - server->acregmax = data->acregmax * HZ; - server->acdirmin = data->acdirmin * HZ; - server->acdirmax = data->acdirmax * HZ; - - error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); - -error: - /* Done */ - dprintk("<-- nfs4_init_server() = %d\n", error); - return error; -} - -/* - * Create a version 4 volume record - * - keyed on server and FSID - */ -struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, - struct nfs_fh *mntfh) -{ - struct nfs_fattr fattr; - struct nfs_server *server; - int error; - - dprintk("--> nfs4_create_server()\n"); - - server = nfs_alloc_server(); - if (!server) - return ERR_PTR(-ENOMEM); - - /* set up the general RPC client */ - error = nfs4_init_server(server, data); - if (error < 0) - goto error; - - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - /* Probe the root fh to retrieve its FSID */ - error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); - if (error < 0) - goto error; - - dprintk("Server FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - dprintk("Mount FH: %d\n", mntfh->size); - - error = nfs_probe_fsinfo(server, mntfh, &fattr); - if (error < 0) - goto error; - - if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) - server->namelen = NFS4_MAXNAMLEN; - - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - 
spin_unlock(&nfs_client_lock); - - server->mount_time = jiffies; - dprintk("<-- nfs4_create_server() = %p\n", server); - return server; - -error: - nfs_free_server(server); - dprintk("<-- nfs4_create_server() = error %d\n", error); - return ERR_PTR(error); -} - -/* - * Create an NFS4 referral server record - */ -struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, - struct nfs_fh *mntfh) -{ - struct nfs_client *parent_client; - struct nfs_server *server, *parent_server; - struct nfs_fattr fattr; - int error; - - dprintk("--> nfs4_create_referral_server()\n"); - - server = nfs_alloc_server(); - if (!server) - return ERR_PTR(-ENOMEM); - - parent_server = NFS_SB(data->sb); - parent_client = parent_server->nfs_client; - - /* Get a client representation. - * Note: NFSv4 always uses TCP, */ - error = nfs4_set_client(server, data->hostname, - data->addr, - data->addrlen, - parent_client->cl_ipaddr, - data->authflavor, - parent_server->client->cl_xprt->prot, - parent_server->client->cl_timeout); - if (error < 0) - goto error; - - /* Initialise the client representation from the parent server */ - nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN; - - error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); - if (error < 0) - goto error; - - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - - /* Probe the root fh to retrieve its FSID and filehandle */ - error = nfs4_path_walk(server, mntfh, data->mnt_path); - if (error < 0) - goto error; - - /* probe the filesystem info for this server filesystem */ - error = nfs_probe_fsinfo(server, mntfh, &fattr); - if (error < 0) - goto error; - - if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) - server->namelen = NFS4_MAXNAMLEN; - - dprintk("Referral FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) 
server->fsid.minor); - - spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - - server->mount_time = jiffies; - - dprintk("<-- nfs_create_referral_server() = %p\n", server); - return server; - -error: - nfs_free_server(server); - dprintk("<-- nfs4_create_referral_server() = error %d\n", error); - return ERR_PTR(error); -} - -#endif /* CONFIG_NFS_V4 */ +EXPORT_SYMBOL_GPL(nfs_create_server); /* * Clone an NFS2, NFS3 or NFS4 server record */ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fh *fh, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, + rpc_authflavor_t flavor) { struct nfs_server *server; - struct nfs_fattr fattr_fsinfo; + struct nfs_fattr *fattr_fsinfo; int error; dprintk("--> nfs_clone_server(,%llx:%llx,)\n", @@ -1230,8 +1122,14 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (!server) return ERR_PTR(-ENOMEM); + error = -ENOMEM; + fattr_fsinfo = nfs_alloc_fattr(); + if (fattr_fsinfo == NULL) + goto out_free_server; + /* Copy data from the source */ server->nfs_client = source->nfs_client; + server->destroy = source->destroy; atomic_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); @@ -1239,14 +1137,12 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, error = nfs_init_server_rpcclient(server, source->client->cl_timeout, - source->client->cl_auth->au_flavor); + flavor); if (error < 0) goto out_free_server; - if (!IS_ERR(source->client_acl)) - nfs_init_server_aclclient(server); /* probe the filesystem info for this server filesystem */ - error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); + error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); if (error < 0) goto out_free_server; @@ -1261,21 +1157,33 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (error < 0) goto out_free_server; - 
spin_lock(&nfs_client_lock); - list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); - list_add_tail(&server->master_link, &nfs_volume_list); - spin_unlock(&nfs_client_lock); - + nfs_server_insert_lists(server); server->mount_time = jiffies; + nfs_free_fattr(fattr_fsinfo); dprintk("<-- nfs_clone_server() = %p\n", server); return server; out_free_server: + nfs_free_fattr(fattr_fsinfo); nfs_free_server(server); dprintk("<-- nfs_clone_server() = error %d\n", error); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(nfs_clone_server); + +void nfs_clients_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + INIT_LIST_HEAD(&nn->nfs_client_list); + INIT_LIST_HEAD(&nn->nfs_volume_list); +#if IS_ENABLED(CONFIG_NFS_V4) + idr_init(&nn->cb_ident_idr); +#endif + spin_lock_init(&nn->nfs_client_lock); + nn->boot_time = CURRENT_TIME; +} #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_fs_nfs; @@ -1286,7 +1194,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); static int nfs_server_list_show(struct seq_file *m, void *v); -static struct seq_operations nfs_server_list_ops = { +static const struct seq_operations nfs_server_list_ops = { .start = nfs_server_list_start, .next = nfs_server_list_next, .stop = nfs_server_list_stop, @@ -1298,6 +1206,7 @@ static const struct file_operations nfs_server_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; static int nfs_volume_list_open(struct inode *inode, struct file *file); @@ -1306,7 +1215,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); static int nfs_volume_list_show(struct seq_file *m, void *v); -static struct seq_operations nfs_volume_list_ops = { +static const struct seq_operations nfs_volume_list_ops = { .start = nfs_volume_list_start, .next = 
nfs_volume_list_next, .stop = nfs_volume_list_stop, @@ -1318,6 +1227,7 @@ static const struct file_operations nfs_volume_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; /* @@ -1328,13 +1238,15 @@ static int nfs_server_list_open(struct inode *inode, struct file *file) { struct seq_file *m; int ret; + struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; + struct net *net = pid_ns->child_reaper->nsproxy->net_ns; ret = seq_open(file, &nfs_server_list_ops); if (ret < 0) return ret; m = file->private_data; - m->private = PDE(inode)->data; + m->private = net; return 0; } @@ -1344,9 +1256,11 @@ static int nfs_server_list_open(struct inode *inode, struct file *file) */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) { + struct nfs_net *nn = net_generic(m->private, nfs_net_id); + /* lock the list against modification */ - spin_lock(&nfs_client_lock); - return seq_list_start_head(&nfs_client_list, *_pos); + spin_lock(&nn->nfs_client_lock); + return seq_list_start_head(&nn->nfs_client_list, *_pos); } /* @@ -1354,7 +1268,9 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { - return seq_list_next(v, &nfs_client_list, pos); + struct nfs_net *nn = net_generic(p->private, nfs_net_id); + + return seq_list_next(v, &nn->nfs_client_list, pos); } /* @@ -1362,7 +1278,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) */ static void nfs_server_list_stop(struct seq_file *p, void *v) { - spin_unlock(&nfs_client_lock); + struct nfs_net *nn = net_generic(p->private, nfs_net_id); + + spin_unlock(&nn->nfs_client_lock); } /* @@ -1371,9 +1289,10 @@ static void nfs_server_list_stop(struct seq_file *p, void *v) static int nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; + struct nfs_net *nn = net_generic(m->private, nfs_net_id); /* display header on line 1 
*/ - if (v == &nfs_client_list) { + if (v == &nn->nfs_client_list) { seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); return 0; } @@ -1381,12 +1300,18 @@ static int nfs_server_list_show(struct seq_file *m, void *v) /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); + /* Check if the client is initialized */ + if (clp->cl_cons_state != NFS_CS_READY) + return 0; + + rcu_read_lock(); seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), atomic_read(&clp->cl_count), clp->cl_hostname); + rcu_read_unlock(); return 0; } @@ -1398,13 +1323,15 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file) { struct seq_file *m; int ret; + struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; + struct net *net = pid_ns->child_reaper->nsproxy->net_ns; ret = seq_open(file, &nfs_volume_list_ops); if (ret < 0) return ret; m = file->private_data; - m->private = PDE(inode)->data; + m->private = net; return 0; } @@ -1414,9 +1341,11 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file) */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) { + struct nfs_net *nn = net_generic(m->private, nfs_net_id); + /* lock the list against modification */ - spin_lock(&nfs_client_lock); - return seq_list_start_head(&nfs_volume_list, *_pos); + spin_lock(&nn->nfs_client_lock); + return seq_list_start_head(&nn->nfs_volume_list, *_pos); } /* @@ -1424,7 +1353,9 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { - return seq_list_next(v, &nfs_volume_list, pos); + struct nfs_net *nn = net_generic(p->private, nfs_net_id); + + return seq_list_next(v, &nn->nfs_volume_list, pos); } /* @@ -1432,7 +1363,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, 
loff_t *pos) */ static void nfs_volume_list_stop(struct seq_file *p, void *v) { - spin_unlock(&nfs_client_lock); + struct nfs_net *nn = net_generic(p->private, nfs_net_id); + + spin_unlock(&nn->nfs_client_lock); } /* @@ -1443,10 +1376,11 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) struct nfs_server *server; struct nfs_client *clp; char dev[8], fsid[17]; + struct nfs_net *nn = net_generic(m->private, nfs_net_id); /* display header on line 1 */ - if (v == &nfs_volume_list) { - seq_puts(m, "NV SERVER PORT DEV FSID\n"); + if (v == &nn->nfs_volume_list) { + seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); return 0; } /* display one transport per line on subsequent lines */ @@ -1460,12 +1394,15 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - seq_printf(m, "v%u %s %s %-7s %-17s\n", + rcu_read_lock(); + seq_printf(m, "v%u %s %s %-7s %-17s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), dev, - fsid); + fsid, + nfs_server_fscache_state(server)); + rcu_read_unlock(); return 0; } @@ -1477,33 +1414,27 @@ int __init nfs_fs_proc_init(void) { struct proc_dir_entry *p; - proc_fs_nfs = proc_mkdir("nfsfs", proc_root_fs); + proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); if (!proc_fs_nfs) goto error_0; - proc_fs_nfs->owner = THIS_MODULE; - /* a file of servers with which we're dealing */ - p = create_proc_entry("servers", S_IFREG|S_IRUGO, proc_fs_nfs); + p = proc_create("servers", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_server_list_fops); if (!p) goto error_1; - p->proc_fops = &nfs_server_list_fops; - p->owner = THIS_MODULE; - /* a file of volumes that we have mounted */ - p = create_proc_entry("volumes", S_IFREG|S_IRUGO, proc_fs_nfs); + p = proc_create("volumes", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_volume_list_fops); if (!p) goto error_2; - - p->proc_fops = 
&nfs_volume_list_fops; - p->owner = THIS_MODULE; return 0; error_2: remove_proc_entry("servers", proc_fs_nfs); error_1: - remove_proc_entry("nfsfs", proc_root_fs); + remove_proc_entry("fs/nfsfs", NULL); error_0: return -ENOMEM; } @@ -1515,7 +1446,7 @@ void nfs_fs_proc_exit(void) { remove_proc_entry("volumes", proc_fs_nfs); remove_proc_entry("servers", proc_fs_nfs); - remove_proc_entry("nfsfs", proc_root_fs); + remove_proc_entry("fs/nfsfs", NULL); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 00a5e4405e1..5d8ccecf5f5 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -10,6 +10,7 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/nfs4.h> @@ -19,65 +20,85 @@ #include "nfs4_fs.h" #include "delegation.h" #include "internal.h" +#include "nfs4trace.h" -static void nfs_do_free_delegation(struct nfs_delegation *delegation) +static void nfs_free_delegation(struct nfs_delegation *delegation) { - kfree(delegation); + if (delegation->cred) { + put_rpccred(delegation->cred); + delegation->cred = NULL; + } + kfree_rcu(delegation, rcu); } -static void nfs_free_delegation_callback(struct rcu_head *head) +/** + * nfs_mark_delegation_referenced - set delegation's REFERENCED flag + * @delegation: delegation to process + * + */ +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) { - struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu); - - nfs_do_free_delegation(delegation); + set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } -static void nfs_free_delegation(struct nfs_delegation *delegation) +/** + * nfs_have_delegation - check if inode has a delegation + * @inode: inode to check + * @flags: delegation types to check for + * + * Returns one if inode has the indicated delegation, otherwise zero. 
+ */ +int nfs4_have_delegation(struct inode *inode, fmode_t flags) { - struct rpc_cred *cred; + struct nfs_delegation *delegation; + int ret = 0; - cred = rcu_dereference(delegation->cred); - rcu_assign_pointer(delegation->cred, NULL); - call_rcu(&delegation->rcu, nfs_free_delegation_callback); - if (cred) - put_rpccred(cred); + flags &= FMODE_READ|FMODE_WRITE; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + nfs_mark_delegation_referenced(delegation); + ret = 1; + } + rcu_read_unlock(); + return ret; } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; struct file_lock *fl; - int status; + int status = 0; + + if (inode->i_flock == NULL) + goto out; + /* Protect inode->i_flock using the i_lock */ + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; if (nfs_file_open_context(fl->fl_file) != ctx) continue; - status = nfs4_lock_delegation_recall(state, fl); - if (status >= 0) - continue; - switch (status) { - default: - printk(KERN_ERR "%s: unhandled error %d.\n", - __FUNCTION__, status); - case -NFS4ERR_EXPIRED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ - case -NFS4ERR_STALE_CLIENTID: - nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs_client); - goto out_err; - } + spin_unlock(&inode->i_lock); + status = nfs4_lock_delegation_recall(fl, state, stateid); + if (status < 0) + goto out; + spin_lock(&inode->i_lock); } - return 0; -out_err: + spin_unlock(&inode->i_lock); +out: return status; } -static void nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +static int nfs_delegation_claim_opens(struct 
inode *inode, const nfs4_stateid *stateid) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; struct nfs4_state *state; + unsigned int seq; int err; again: @@ -88,41 +109,68 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) continue; - if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + if (!nfs4_stateid_match(&state->stateid, stateid)) continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); + sp = state->owner; + /* Block nfs4_proc_unlck */ + mutex_lock(&sp->so_delegreturn_mutex); + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); - if (err >= 0) - err = nfs_delegation_claim_locks(ctx, state); + if (!err) + err = nfs_delegation_claim_locks(ctx, state, stateid); + if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + err = -EAGAIN; + mutex_unlock(&sp->so_delegreturn_mutex); put_nfs_open_context(ctx); if (err != 0) - return; + return err; goto again; } spin_unlock(&inode->i_lock); + return 0; } -/* - * Set up a delegation on an inode +/** + * nfs_inode_reclaim_delegation - process a delegation reclaim request + * @inode: inode to process + * @cred: credential to use for request + * @res: new delegation state from server + * */ -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + struct nfs_openres *res) { - struct nfs_delegation *delegation = NFS_I(inode)->delegation; - struct rpc_cred *oldcred; + struct nfs_delegation *delegation; + struct rpc_cred *oldcred = NULL; - if (delegation == NULL) - return; - memcpy(delegation->stateid.data, res->delegation.data, - sizeof(delegation->stateid.data)); - delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; - oldcred = delegation->cred; - delegation->cred = get_rpccred(cred); - 
delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; - NFS_I(inode)->delegation_state = delegation->type; - smp_wmb(); - put_rpccred(oldcred); + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { + nfs4_stateid_copy(&delegation->stateid, &res->delegation); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; + delegation->cred = get_rpccred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags); + NFS_I(inode)->delegation_state = delegation->type; + spin_unlock(&delegation->lock); + put_rpccred(oldcred); + rcu_read_unlock(); + trace_nfs4_reclaim_delegation(inode, res->delegation_type); + } else { + /* We appear to have raced with a delegation return. */ + spin_unlock(&delegation->lock); + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); + } + } else { + rcu_read_unlock(); + } } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -134,67 +182,162 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * return res; } -static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) +static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation) +{ + struct inode *inode = NULL; + + spin_lock(&delegation->lock); + if (delegation->inode != NULL) + inode = igrab(delegation->inode); + spin_unlock(&delegation->lock); + return inode; +} + +static struct nfs_delegation * +nfs_start_delegation_return_locked(struct nfs_inode *nfsi) { + struct nfs_delegation *ret = NULL; struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL) - goto nomatch; - if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, - sizeof(delegation->stateid.data)) != 0) - goto nomatch; + goto out; + 
spin_lock(&delegation->lock); + if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + ret = delegation; + spin_unlock(&delegation->lock); +out: + return ret; +} + +static struct nfs_delegation * +nfs_start_delegation_return(struct nfs_inode *nfsi) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs_start_delegation_return_locked(nfsi); + rcu_read_unlock(); + return delegation; +} + +static void +nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp) +{ + + spin_lock(&delegation->lock); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + +static struct nfs_delegation * +nfs_detach_delegation_locked(struct nfs_inode *nfsi, + struct nfs_delegation *delegation, + struct nfs_client *clp) +{ + struct nfs_delegation *deleg_cur = + rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); + + if (deleg_cur == NULL || delegation != deleg_cur) + return NULL; + + spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); list_del_rcu(&delegation->super_list); + delegation->inode = NULL; nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); + spin_unlock(&delegation->lock); return delegation; -nomatch: - return NULL; } -/* - * Set up a delegation on an inode +static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, + struct nfs_delegation *delegation, + struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(nfsi, delegation, clp); + spin_unlock(&clp->cl_lock); + return delegation; +} + +static struct nfs_delegation * +nfs_inode_detach_delegation(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *server = NFS_SERVER(inode); + struct 
nfs_delegation *delegation; + + delegation = nfs_start_delegation_return(nfsi); + if (delegation == NULL) + return NULL; + return nfs_detach_delegation(nfsi, delegation, server); +} + +/** + * nfs_inode_set_delegation - set up a delegation on an inode + * @inode: inode to which delegation applies + * @cred: cred to use for subsequent delegation processing + * @res: new delegation state from server + * + * Returns zero on success, or a negative errno value. */ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_delegation *delegation; + struct nfs_delegation *delegation, *old_delegation; struct nfs_delegation *freeme = NULL; int status = 0; - delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); if (delegation == NULL) return -ENOMEM; - memcpy(delegation->stateid.data, res->delegation.data, - sizeof(delegation->stateid.data)); + nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; - delegation->change_attr = nfsi->change_attr; + delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; + delegation->flags = 1<<NFS_DELEGATION_REFERENCED; + spin_lock_init(&delegation->lock); spin_lock(&clp->cl_lock); - if (rcu_dereference(nfsi->delegation) != NULL) { - if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, - sizeof(delegation->stateid)) == 0 && - delegation->type == nfsi->delegation->type) { + old_delegation = rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); + if (old_delegation != NULL) { + if (nfs4_stateid_match(&delegation->stateid, + &old_delegation->stateid) && + delegation->type == 
old_delegation->type) { goto out; } /* * Deal with broken servers that hand out two * delegations for the same file. + * Allow for upgrades to a WRITE delegation, but + * nothing else. */ dfprintk(FILE, "%s: server %s handed out " "a duplicate delegation!\n", - __FUNCTION__, clp->cl_hostname); - if (delegation->type <= nfsi->delegation->type) { + __func__, clp->cl_hostname); + if (delegation->type == old_delegation->type || + !(delegation->type & FMODE_WRITE)) { freeme = delegation; delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, NULL); + freeme = nfs_detach_delegation_locked(nfsi, + old_delegation, clp); + if (freeme == NULL) + goto out; } - list_add_rcu(&delegation->super_list, &clp->cl_delegations); + list_add_rcu(&delegation->super_list, &server->delegations); nfsi->delegation_state = delegation->type; rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -203,6 +346,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct spin_lock(&inode->i_lock); nfsi->cache_validity |= NFS_INO_REVAL_FORCED; spin_unlock(&inode->i_lock); + trace_nfs4_set_delegation(inode, res->delegation_type); out: spin_unlock(&clp->cl_lock); @@ -213,306 +357,470 @@ out: return status; } -/* Sync all data to disk upon delegation return */ -static void nfs_msync_inode(struct inode *inode) -{ - filemap_fdatawrite(inode->i_mapping); - nfs_wb_all(inode); - filemap_fdatawait(inode->i_mapping); -} - /* * Basic procedure for returning a delegation to the server */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) +static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); + int err; - nfs_msync_inode(inode); - down_read(&clp->cl_sem); - /* Guard against new delegated open calls */ - down_write(&nfsi->rwsem); - 
nfs_delegation_claim_opens(inode, &delegation->stateid); - up_write(&nfsi->rwsem); - up_read(&clp->cl_sem); - nfs_msync_inode(inode); + if (delegation == NULL) + return 0; + do { + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + if (!issync || err != -EAGAIN) + break; + /* + * Guard against state recovery + */ + err = nfs4_wait_clnt_recover(clp); + } while (err == 0); + + if (err) { + nfs_abort_delegation_return(delegation, clp); + goto out; + } + if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) + goto out; - return nfs_do_return_delegation(inode, delegation, 1); + err = nfs_do_return_delegation(inode, delegation, issync); +out: + return err; } -/* - * This function returns the delegation without reclaiming opens - * or protecting against delegation reclaims. - * It is therefore really only safe to be called from - * nfs4_clear_inode() +static bool nfs_delegation_need_return(struct nfs_delegation *delegation) +{ + bool ret = false; + + if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) + ret = true; + if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { + struct inode *inode; + + spin_lock(&delegation->lock); + inode = delegation->inode; + if (inode && list_empty(&NFS_I(inode)->open_files)) + ret = true; + spin_unlock(&delegation->lock); + } + return ret; +} + +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Note that this function is designed to be called by the state + * manager thread. For this reason, it cannot flush the dirty data, + * since that could deadlock in case of a state recovery error. + * + * Returns zero on success, or a negative errno value. 
*/ -void nfs_inode_return_delegation_noreclaim(struct inode *inode) +int nfs_client_return_marked_delegations(struct nfs_client *clp) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + int err = 0; - if (rcu_dereference(nfsi->delegation) != NULL) { - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, NULL); - spin_unlock(&clp->cl_lock); - if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (!nfs_delegation_need_return(delegation)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); + + err = nfs_end_delegation_return(inode, delegation, 0); + iput(inode); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + return err; + } } + rcu_read_unlock(); + return 0; } -int nfs_inode_return_delegation(struct inode *inode) +/** + * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * @inode: inode to process + * + * Does not protect against delegation reclaims, therefore really only safe + * to be called from nfs4_clear_inode(). + */ +void nfs_inode_return_delegation_noreclaim(struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = nfs_inode_detach_delegation(inode); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 0); +} + +/** + * nfs_inode_return_delegation - synchronously return a delegation + * @inode: inode to process + * + * This routine will always flush any dirty data to disk on the + * assumption that if we need to return the delegation, then + * we should stop caching. 
+ * + * Returns zero on success, or a negative errno value. + */ +int nfs4_inode_return_delegation(struct inode *inode) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int err = 0; - if (rcu_dereference(nfsi->delegation) != NULL) { - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, NULL); - spin_unlock(&clp->cl_lock); - if (delegation != NULL) - err = __nfs_inode_return_delegation(inode, delegation); - } + nfs_wb_all(inode); + delegation = nfs_start_delegation_return(nfsi); + if (delegation != NULL) + err = nfs_end_delegation_return(inode, delegation, 1); return err; } -/* - * Return all delegations associated to a super block - */ -void nfs_return_all_delegations(struct super_block *sb) +static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + +static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) { - struct nfs_client *clp = NFS_SB(sb)->nfs_client; struct nfs_delegation *delegation; - struct inode *inode; + bool ret = false; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + nfs_mark_return_delegation(server, delegation); + ret = true; + } + return ret; +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_server_mark_return_all_delegations(server); + rcu_read_unlock(); +} + +static void nfs_delegation_run_state_manager(struct 
nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegations(clp); + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_super_return_all_delegations - return delegations for one superblock + * @sb: sb to process + * + */ +void nfs_server_return_all_delegations(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + bool need_wait; if (clp == NULL) return; -restart: + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (delegation->inode->i_sb != sb) - continue; - inode = igrab(delegation->inode); - if (inode == NULL) - continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation != NULL) - __nfs_inode_return_delegation(inode, delegation); - iput(inode); - goto restart; - } + need_wait = nfs_server_mark_return_all_delegations(server); rcu_read_unlock(); + + if (need_wait) { + nfs4_schedule_state_manager(clp); + nfs4_wait_clnt_recover(clp); + } } -static int nfs_do_expire_all_delegations(void *ptr) +static void nfs_mark_return_unused_delegation_types(struct nfs_server *server, + fmode_t flags) { - struct nfs_client *clp = ptr; struct nfs_delegation *delegation; - struct inode *inode; - allow_signal(SIGKILL); -restart: - if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0) - goto out; - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) - goto out; - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - inode = igrab(delegation->inode); - if (inode == NULL) + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) 
continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation) - __nfs_inode_return_delegation(inode, delegation); - iput(inode); - goto restart; + if (delegation->type & flags) + nfs_mark_return_if_closed_delegation(server, delegation); } +} + +static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp, + fmode_t flags) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unused_delegation_types(server, flags); rcu_read_unlock(); -out: - nfs_put_client(clp); - module_put_and_exit(0); } -void nfs_expire_all_delegations(struct nfs_client *clp) +void nfs_remove_bad_delegation(struct inode *inode) { - struct task_struct *task; + struct nfs_delegation *delegation; - __module_get(THIS_MODULE); - atomic_inc(&clp->cl_count); - task = kthread_run(nfs_do_expire_all_delegations, clp, - "%s-delegreturn", - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); - if (!IS_ERR(task)) - return; - nfs_put_client(clp); - module_put(THIS_MODULE); + delegation = nfs_inode_detach_delegation(inode); + if (delegation) { + nfs_inode_find_state_and_recover(inode, &delegation->stateid); + nfs_free_delegation(delegation); + } } +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); -/* - * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 
+/** + * nfs_expire_unused_delegation_types + * @clp: client to process + * @flags: delegation types to expire + * */ -void nfs_handle_cb_pathdown(struct nfs_client *clp) +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags) +{ + nfs_client_mark_return_unused_delegation_types(clp, flags); + nfs_delegation_run_state_manager(clp); +} + +static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; - struct inode *inode; - if (clp == NULL) - return; -restart: - rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - inode = igrab(delegation->inode); - if (inode == NULL) + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation != NULL) - __nfs_inode_return_delegation(inode, delegation); - iput(inode); - goto restart; + nfs_mark_return_if_closed_delegation(server, delegation); } - rcu_read_unlock(); } -struct recall_threadargs { - struct inode *inode; - struct nfs_client *clp; - const nfs4_stateid *stateid; +/** + * nfs_expire_unreferenced_delegations - Eliminate unused delegations + * @clp: nfs_client to process + * + */ +void nfs_expire_unreferenced_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; - struct completion started; - int result; -}; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_mark_return_unreferenced_delegations(server); + rcu_read_unlock(); -static int recall_thread(void *data) + nfs_delegation_run_state_manager(clp); +} + +/** + * nfs_async_inode_return_delegation - asynchronously return a delegation + * @inode: inode to process + * @stateid: state ID information + * + * Returns zero on success, or a negative errno 
value. + */ +int nfs_async_inode_return_delegation(struct inode *inode, + const nfs4_stateid *stateid) { - struct recall_threadargs *args = (struct recall_threadargs *)data; - struct inode *inode = igrab(args->inode); - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; - daemonize("nfsv4-delegreturn"); + filemap_flush(inode->i_mapping); - nfs_msync_inode(inode); - down_read(&clp->cl_sem); - down_write(&nfsi->rwsem); - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, args->stateid); - if (delegation != NULL) - args->result = 0; - else - args->result = -ENOENT; - spin_unlock(&clp->cl_lock); - complete(&args->started); - nfs_delegation_claim_opens(inode, args->stateid); - up_write(&nfsi->rwsem); - up_read(&clp->cl_sem); - nfs_msync_inode(inode); + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL) + goto out_enoent; - if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 1); - iput(inode); - module_put_and_exit(0); + if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) + goto out_enoent; + nfs_mark_return_delegation(server, delegation); + rcu_read_unlock(); + + nfs_delegation_run_state_manager(clp); + return 0; +out_enoent: + rcu_read_unlock(); + return -ENOENT; } -/* - * Asynchronous delegation recall! 
- */ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) -{ - struct recall_threadargs data = { - .inode = inode, - .stateid = stateid, - }; - int status; - - init_completion(&data.started); - __module_get(THIS_MODULE); - status = kernel_thread(recall_thread, &data, CLONE_KERNEL); - if (status < 0) - goto out_module_put; - wait_for_completion(&data.started); - return data.result; -out_module_put: - module_put(THIS_MODULE); - return status; +static struct inode * +nfs_delegation_find_inode_server(struct nfs_server *server, + const struct nfs_fh *fhandle) +{ + struct nfs_delegation *delegation; + struct inode *res = NULL; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL && + nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { + res = igrab(delegation->inode); + } + spin_unlock(&delegation->lock); + if (res != NULL) + break; + } + return res; } -/* - * Retrieve the inode associated with a delegation +/** + * nfs_delegation_find_inode - retrieve the inode associated with a delegation + * @clp: client state handle + * @fhandle: filehandle from a delegation recall + * + * Returns pointer to inode matching "fhandle," or NULL if a matching inode + * cannot be found. 
*/ -struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle) +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, + const struct nfs_fh *fhandle) { - struct nfs_delegation *delegation; + struct nfs_server *server; struct inode *res = NULL; + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { - res = igrab(delegation->inode); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + res = nfs_delegation_find_inode_server(server, fhandle); + if (res != NULL) break; - } } rcu_read_unlock(); return res; } -/* - * Mark all delegations as needing to be reclaimed +static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); +} + +/** + * nfs_delegation_mark_reclaim - mark all delegations as needing to be reclaimed + * @clp: nfs_client to process + * */ void nfs_delegation_mark_reclaim(struct nfs_client *clp) { - struct nfs_delegation *delegation; + struct nfs_server *server; + rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) - delegation->flags |= NFS_DELEGATION_NEED_RECLAIM; + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_reclaim_server(server); rcu_read_unlock(); } -/* - * Reap all unclaimed delegations after reboot recovery is done +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * */ void nfs_delegation_reap_unclaimed(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + restart: rcu_read_lock(); - list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if 
((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) - continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation != NULL) - nfs_free_delegation(delegation); - goto restart; + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_detach_delegation(NFS_I(inode), + delegation, server); + rcu_read_unlock(); + + if (delegation != NULL) + nfs_free_delegation(delegation); + iput(inode); + goto restart; + } } rcu_read_unlock(); } -int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +/** + * nfs_delegations_present - check for existence of delegations + * @clp: client state handle + * + * Returns one if there are any nfs_delegation structures attached + * to this nfs_client. + */ +int nfs_delegations_present(struct nfs_client *clp) +{ + struct nfs_server *server; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + if (!list_empty(&server->delegations)) { + ret = 1; + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * nfs4_copy_delegation_stateid - Copy inode's state ID information + * @dst: stateid data structure to fill in + * @inode: inode to check + * @flags: delegation type requirement + * + * Returns "true" and fills in "dst->data" * if inode had a delegation, + * otherwise "false" is returned. 
+ */ +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, + fmode_t flags) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - int ret = 0; + bool ret; + flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (delegation != NULL) { - memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); - ret = 1; + ret = (delegation != NULL && (delegation->type & flags) == flags); + if (ret) { + nfs4_stateid_copy(dst, &delegation->stateid); + nfs_mark_delegation_referenced(delegation); } rcu_read_unlock(); return ret; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index f1c5e2a5d88..9a79c7a99d6 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -8,7 +8,7 @@ #ifndef FS_NFS_DELEGATION_H #define FS_NFS_DELEGATION_H -#if defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V4) /* * NFSv4 delegation */ @@ -17,24 +17,36 @@ struct nfs_delegation { struct rpc_cred *cred; struct inode *inode; nfs4_stateid stateid; - int type; -#define NFS_DELEGATION_NEED_RECLAIM 1 - long flags; + fmode_t type; loff_t maxsize; __u64 change_attr; + unsigned long flags; + spinlock_t lock; struct rcu_head rcu; }; +enum { + NFS_DELEGATION_NEED_RECLAIM = 0, + NFS_DELEGATION_RETURN, + NFS_DELEGATION_RETURN_IF_CLOSED, + NFS_DELEGATION_REFERENCED, + NFS_DELEGATION_RETURNING, +}; + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -int nfs_inode_return_delegation(struct inode *inode); +int nfs4_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); -void nfs_return_all_delegations(struct super_block *sb); +void 
nfs_server_return_all_delegations(struct nfs_server *); void nfs_expire_all_delegations(struct nfs_client *clp); -void nfs_handle_cb_pathdown(struct nfs_client *clp); +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); +void nfs_expire_unreferenced_delegations(struct nfs_client *clp); +int nfs_client_return_marked_delegations(struct nfs_client *clp); +int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); @@ -42,33 +54,18 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); -int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); -static inline int nfs_have_delegation(struct inode *inode, int flags) -{ - struct nfs_delegation *delegation; - int ret = 0; - - flags &= FMODE_READ|FMODE_WRITE; - rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags) - ret = 1; - rcu_read_unlock(); - return ret; -} +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); +int nfs4_have_delegation(struct inode *inode, fmode_t flags); -#else -static inline int nfs_have_delegation(struct inode *inode, int flags) -{ - return 0; -} +#endif -static inline int nfs_inode_return_delegation(struct inode *inode) +static inline int 
nfs_have_delegated_attributes(struct inode *inode) { - return 0; + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) && + !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); } -#endif #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6cea7479c5b..4a3d4ef7612 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -17,6 +17,7 @@ * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM */ +#include <linux/module.h> #include <linux/time.h> #include <linux/errno.h> #include <linux/stat.h> @@ -29,101 +30,70 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/pagevec.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/swap.h> #include <linux/sched.h> +#include <linux/kmemleak.h> +#include <linux/xattr.h> -#include "nfs4_fs.h" #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" + +#include "nfstrace.h" /* #define NFS_DEBUG_VERBOSE 1 */ static int nfs_opendir(struct inode *, struct file *); -static int nfs_readdir(struct file *, void *, filldir_t); -static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); -static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); -static int nfs_mkdir(struct inode *, struct dentry *, int); -static int nfs_rmdir(struct inode *, struct dentry *); -static int nfs_unlink(struct inode *, struct dentry *); -static int nfs_symlink(struct inode *, struct dentry *, const char *); -static int nfs_link(struct dentry *, struct inode *, struct dentry *); -static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); -static int nfs_rename(struct inode *, struct dentry *, - struct inode *, struct dentry *); -static int nfs_fsync_dir(struct file *, struct dentry *, int); +static int nfs_closedir(struct inode *, struct file *); +static int nfs_readdir(struct file *, struct dir_context *); +static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); 
static loff_t nfs_llseek_dir(struct file *, loff_t, int); +static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .readdir = nfs_readdir, + .iterate = nfs_readdir, .open = nfs_opendir, - .release = nfs_release, + .release = nfs_closedir, .fsync = nfs_fsync_dir, }; -const struct inode_operations nfs_dir_inode_operations = { - .create = nfs_create, - .lookup = nfs_lookup, - .link = nfs_link, - .unlink = nfs_unlink, - .symlink = nfs_symlink, - .mkdir = nfs_mkdir, - .rmdir = nfs_rmdir, - .mknod = nfs_mknod, - .rename = nfs_rename, - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, +const struct address_space_operations nfs_dir_aops = { + .freepage = nfs_readdir_clear_array, }; -#ifdef CONFIG_NFS_V3 -const struct inode_operations nfs3_dir_inode_operations = { - .create = nfs_create, - .lookup = nfs_lookup, - .link = nfs_link, - .unlink = nfs_unlink, - .symlink = nfs_symlink, - .mkdir = nfs_mkdir, - .rmdir = nfs_rmdir, - .mknod = nfs_mknod, - .rename = nfs_rename, - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, -}; -#endif /* CONFIG_NFS_V3 */ - -#ifdef CONFIG_NFS_V4 - -static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); -const struct inode_operations nfs4_dir_inode_operations = { - .create = nfs_create, - .lookup = nfs_atomic_lookup, - .link = nfs_link, - .unlink = nfs_unlink, - .symlink = nfs_symlink, - .mkdir = nfs_mkdir, - .rmdir = nfs_rmdir, - .mknod = nfs_mknod, - .rename = nfs_rename, - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, - .getxattr = nfs4_getxattr, - .setxattr = nfs4_setxattr, - .listxattr = nfs4_listxattr, -}; +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode 
*dir, struct rpc_cred *cred) +{ + struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + ctx->dir_cookie = 0; + ctx->dup_cookie = 0; + ctx->cred = get_rpccred(cred); + spin_lock(&dir->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&dir->i_lock); + return ctx; + } + return ERR_PTR(-ENOMEM); +} -#endif /* CONFIG_NFS_V4 */ +static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) +{ + spin_lock(&dir->i_lock); + list_del(&ctx->list); + spin_unlock(&dir->i_lock); + put_rpccred(ctx->cred); + kfree(ctx); +} /* * Open file @@ -131,61 +101,283 @@ const struct inode_operations nfs4_dir_inode_operations = { static int nfs_opendir(struct inode *inode, struct file *filp) { - int res; + int res = 0; + struct nfs_open_dir_context *ctx; + struct rpc_cred *cred; + + dfprintk(FILE, "NFS: open dir(%pD2)\n", filp); - dfprintk(VFS, "NFS: opendir(%s/%ld)\n", - inode->i_sb->s_id, inode->i_ino); + nfs_inc_stats(inode, NFSIOS_VFSOPEN); - lock_kernel(); - /* Call generic open code in order to cache credentials */ - res = nfs_open(inode, filp); - unlock_kernel(); + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_dir_context(inode, cred); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + filp->private_data = ctx; + if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { + /* This is a mountpoint, so d_revalidate will never + * have been called, so we need to refresh the + * inode (for close-open consistency) ourselves. 
+ */ + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + } +out: + put_rpccred(cred); return res; } -typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int); +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ + put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data); + return 0; +} + +struct nfs_cache_array_entry { + u64 cookie; + u64 ino; + struct qstr string; + unsigned char d_type; +}; + +struct nfs_cache_array { + int size; + int eof_index; + u64 last_cookie; + struct nfs_cache_array_entry array[0]; +}; + +typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); typedef struct { struct file *file; struct page *page; + struct dir_context *ctx; unsigned long page_index; - __be32 *ptr; u64 *dir_cookie; + u64 last_cookie; loff_t current_index; - struct nfs_entry *entry; decode_dirent_t decode; - int plus; + unsigned long timestamp; - int timestamp_valid; + unsigned long gencount; + unsigned int cache_entry_index; + unsigned int plus:1; + unsigned int eof:1; } nfs_readdir_descriptor_t; -/* Now we cache directories properly, by stuffing the dirent - * data directly in the page cache. - * - * Inode invalidation due to refresh etc. takes care of - * _everything_, no sloppy entry flushing logic, no extraneous - * copying, network direct to page cache, the way it was meant - * to be. - * - * NOTE: Dirent information verification is done always by the - * page-in of the RPC reply, nowhere else, this simplies - * things substantially. 
+/* + * The caller is responsible for calling nfs_readdir_release_array(page) */ static -int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) +struct nfs_cache_array *nfs_readdir_get_array(struct page *page) { - struct file *file = desc->file; - struct inode *inode = file->f_path.dentry->d_inode; - struct rpc_cred *cred = nfs_file_cred(file); - unsigned long timestamp; - int error; + void *ptr; + if (page == NULL) + return ERR_PTR(-EIO); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; +} - dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", - __FUNCTION__, (long long)desc->entry->cookie, - page->index); +static +void nfs_readdir_release_array(struct page *page) +{ + kunmap(page); +} + +/* + * we are freeing strings created by nfs_add_to_readdir_array() + */ +static +void nfs_readdir_clear_array(struct page *page) +{ + struct nfs_cache_array *array; + int i; + + array = kmap_atomic(page); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); + kunmap_atomic(array); +} + +/* + * the caller is responsible for freeing qstr.name + * when called by nfs_readdir_add_to_array, the strings will be freed in + * nfs_clear_readdir_array() + */ +static +int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +{ + string->len = len; + string->name = kmemdup(name, len, GFP_KERNEL); + if (string->name == NULL) + return -ENOMEM; + /* + * Avoid a kmemleak false positive. The pointer to the name is stored + * in a page cache page which kmemleak does not scan. 
+ */ + kmemleak_not_leak(string->name); + string->hash = full_name_hash(name, len); + return 0; +} + +static +int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +{ + struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array_entry *cache_entry; + int ret; + + if (IS_ERR(array)) + return PTR_ERR(array); + + cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ + ret = -ENOSPC; + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + goto out; + + cache_entry->cookie = entry->prev_cookie; + cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; + ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); + if (ret) + goto out; + array->last_cookie = entry->cookie; + array->size++; + if (entry->eof != 0) + array->eof_index = array->size; +out: + nfs_readdir_release_array(page); + return ret; +} + +static +int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + loff_t diff = desc->ctx->pos - desc->current_index; + unsigned int index; + + if (diff < 0) + goto out_eof; + if (diff >= array->size) { + if (array->eof_index >= 0) + goto out_eof; + return -EAGAIN; + } + + index = (unsigned int)diff; + *desc->dir_cookie = array->array[index].cookie; + desc->cache_entry_index = index; + return 0; +out_eof: + desc->eof = 1; + return -EBADCOOKIE; +} + +static bool +nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) +{ + if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + return false; + smp_rmb(); + return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); +} + +static +int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +{ + int i; + loff_t new_pos; + int status = -EAGAIN; + + for (i = 0; i < array->size; i++) { + if (array->array[i].cookie == *desc->dir_cookie) { + struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); + 
struct nfs_open_dir_context *ctx = desc->file->private_data; + + new_pos = desc->current_index + i; + if (ctx->attr_gencount != nfsi->attr_gencount || + !nfs_readdir_inode_mapping_valid(nfsi)) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + } else if (new_pos < desc->ctx->pos) { + if (ctx->duped > 0 + && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %pD2 contains a readdir loop." + "Please contact your server vendor. " + "The file: %.*s has duplicate cookie %llu\n", + desc->file, array->array[i].string.len, + array->array[i].string.name, *desc->dir_cookie); + } + status = -ELOOP; + goto out; + } + ctx->dup_cookie = *desc->dir_cookie; + ctx->duped = -1; + } + desc->ctx->pos = new_pos; + desc->cache_entry_index = i; + return 0; + } + } + if (array->eof_index >= 0) { + status = -EBADCOOKIE; + if (*desc->dir_cookie == array->last_cookie) + desc->eof = 1; + } +out: + return status; +} + +static +int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int status; + + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } + + if (*desc->dir_cookie == 0) + status = nfs_readdir_search_for_pos(array, desc); + else + status = nfs_readdir_search_for_cookie(array, desc); + + if (status == -EAGAIN) { + desc->last_cookie = array->last_cookie; + desc->current_index += array->size; + desc->page_index++; + } + nfs_readdir_release_array(desc->page); +out: + return status; +} + +/* Fill a page with xdr information before transferring to the cache page */ +static +int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct file *file, struct inode *inode) +{ + struct nfs_open_dir_context *ctx = file->private_data; + struct rpc_cred *cred = ctx->cred; + unsigned long timestamp, gencount; + int error; again: timestamp = jiffies; - error = 
NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, + gencount = nfs_inc_attr_generation_counter(); + error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ @@ -198,252 +390,399 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) goto error; } desc->timestamp = timestamp; - desc->timestamp_valid = 1; - SetPageUptodate(page); - /* Ensure consistent page alignment of the data. - * Note: assumes we have exclusive access to this mapping either - * through inode->i_mutex or some other mechanism. - */ - if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) { - /* Should never happen */ - nfs_zap_mapping(inode, inode->i_mapping); - } - unlock_page(page); + desc->gencount = gencount; +error: + return error; +} + +static int xdr_decode(nfs_readdir_descriptor_t *desc, + struct nfs_entry *entry, struct xdr_stream *xdr) +{ + int error; + + error = desc->decode(xdr, entry, desc->plus); + if (error) + return error; + entry->fattr->time_start = desc->timestamp; + entry->fattr->gencount = desc->gencount; return 0; - error: - unlock_page(page); - return -EIO; } -static inline -int dir_decode(nfs_readdir_descriptor_t *desc) -{ - __be32 *p = desc->ptr; - p = desc->decode(p, desc->entry, desc->plus); - if (IS_ERR(p)) - return PTR_ERR(p); - desc->ptr = p; - if (desc->timestamp_valid) - desc->entry->fattr->time_start = desc->timestamp; - else - desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; +static +int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) +{ + if (dentry->d_inode == NULL) + goto different; + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) + goto different; + return 1; +different: return 0; } -static inline -void dir_page_release(nfs_readdir_descriptor_t *desc) +static +bool nfs_use_readdirplus(struct inode 
*dir, struct dir_context *ctx) { - kunmap(desc->page); - page_cache_release(desc->page); - desc->page = NULL; - desc->ptr = NULL; + if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) + return false; + if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) + return true; + if (ctx->pos == 0) + return true; + return false; } /* - * Given a pointer to a buffer that has already been filled by a call - * to readdir, find the next entry with cookie '*desc->dir_cookie'. + * This function is called by the lookup code to request the use of + * readdirplus to accelerate any future lookups in the same + * directory. + */ +static +void nfs_advise_use_readdirplus(struct inode *dir) +{ + set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); +} + +/* + * This function is mainly for use by nfs_getattr(). * - * If the end of the buffer has been reached, return -EAGAIN, if not, - * return the offset within the buffer of the next entry to be - * read. + * If this is an 'ls -l', we want to force use of readdirplus. + * Do this by checking if there is an active file descriptor + * and calling nfs_advise_use_readdirplus, then forcing a + * cache flush. 
*/ -static inline -int find_dirent(nfs_readdir_descriptor_t *desc) +void nfs_force_use_readdirplus(struct inode *dir) +{ + if (!list_empty(&NFS_I(dir)->open_files)) { + nfs_advise_use_readdirplus(dir); + nfs_zap_mapping(dir, dir->i_mapping); + } +} + +static +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { - struct nfs_entry *entry = desc->entry; - int loop_count = 0, - status; + struct qstr filename = QSTR_INIT(entry->name, entry->len); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct inode *inode; + int status; - while((status = dir_decode(desc)) == 0) { - dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n", - __FUNCTION__, (unsigned long long)entry->cookie); - if (entry->prev_cookie == *desc->dir_cookie) - break; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); + if (filename.name[0] == '.') { + if (filename.len == 1) + return; + if (filename.len == 2 && filename.name[1] == '.') + return; + } + filename.hash = full_name_hash(filename.name, filename.len); + + dentry = d_lookup(parent, &filename); + if (dentry != NULL) { + if (nfs_same_file(dentry, entry)) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + status = nfs_refresh_inode(dentry->d_inode, entry->fattr); + if (!status) + nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label); + goto out; + } else { + if (d_invalidate(dentry) != 0) + goto out; + dput(dentry); } } - return status; + + dentry = d_alloc(parent, &filename); + if (dentry == NULL) + return; + + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); + if (IS_ERR(inode)) + goto out; + + alias = d_materialise_unique(dentry, inode); + if (IS_ERR(alias)) + goto out; + else if (alias) { + nfs_set_verifier(alias, nfs_save_change_attribute(dir)); + dput(alias); + } else + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + +out: + dput(dentry); } -/* - * Given a pointer to a buffer that has already been filled by a call - * to 
readdir, find the entry at offset 'desc->file->f_pos'. - * - * If the end of the buffer has been reached, return -EAGAIN, if not, - * return the offset within the buffer of the next entry to be - * read. - */ -static inline -int find_dirent_index(nfs_readdir_descriptor_t *desc) +/* Perform conversion from xdr to cache array */ +static +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, + struct page **xdr_pages, struct page *page, unsigned int buflen) { - struct nfs_entry *entry = desc->entry; - int loop_count = 0, - status; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct nfs_cache_array *array; + unsigned int count = 0; + int status; - for(;;) { - status = dir_decode(desc); - if (status) - break; + scratch = alloc_page(GFP_KERNEL); + if (scratch == NULL) + return -ENOMEM; - dfprintk(DIRCACHE, "NFS: found cookie %Lu at index %Ld\n", - (unsigned long long)entry->cookie, desc->current_index); + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); - if (desc->file->f_pos == desc->current_index) { - *desc->dir_cookie = entry->cookie; + do { + status = xdr_decode(desc, entry, &stream); + if (status != 0) { + if (status == -EAGAIN) + status = 0; break; } - desc->current_index++; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } + + count++; + + if (desc->plus != 0) + nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; + } while (!entry->eof); + + if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { + array = nfs_readdir_get_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } else + status = PTR_ERR(array); } + + put_page(scratch); return status; } +static +void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +{ + unsigned int i; + for (i 
= 0; i < npages; i++) + put_page(pages[i]); +} + +static +void nfs_readdir_free_large_page(void *ptr, struct page **pages, + unsigned int npages) +{ + nfs_readdir_free_pagearray(pages, npages); +} + /* - * Find the given page, and call find_dirent() or find_dirent_index in - * order to try to return the next entry. + * nfs_readdir_large_page will allocate pages that must be freed with a call + * to nfs_readdir_free_large_page */ -static inline -int find_dirent_page(nfs_readdir_descriptor_t *desc) +static +int nfs_readdir_large_page(struct page **pages, unsigned int npages) { - struct inode *inode = desc->file->f_path.dentry->d_inode; - struct page *page; - int status; + unsigned int i; - dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n", - __FUNCTION__, desc->page_index, - (long long) *desc->dir_cookie); + for (i = 0; i < npages; i++) { + struct page *page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out_freepages; + pages[i] = page; + } + return 0; - /* If we find the page in the page_cache, we cannot be sure - * how fresh the data is, so we will ignore readdir_plus attributes. 
- */ - desc->timestamp_valid = 0; - page = read_cache_page(inode->i_mapping, desc->page_index, - (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page)) { - status = PTR_ERR(page); +out_freepages: + nfs_readdir_free_pagearray(pages, i); + return -ENOMEM; +} + +static +int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +{ + struct page *pages[NFS_MAX_READDIR_PAGES]; + void *pages_ptr = NULL; + struct nfs_entry entry; + struct file *file = desc->file; + struct nfs_cache_array *array; + int status = -ENOMEM; + unsigned int array_size = ARRAY_SIZE(pages); + + entry.prev_cookie = 0; + entry.cookie = desc->last_cookie; + entry.eof = 0; + entry.fh = nfs_alloc_fhandle(); + entry.fattr = nfs_alloc_fattr(); + entry.server = NFS_SERVER(inode); + if (entry.fh == NULL || entry.fattr == NULL) + goto out; + + entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(entry.label)) { + status = PTR_ERR(entry.label); goto out; } - /* NOTE: Someone else may have changed the READDIRPLUS flag */ - desc->page = page; - desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ - if (*desc->dir_cookie != 0) - status = find_dirent(desc); - else - status = find_dirent_index(desc); + array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out_label_free; + } + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + + status = nfs_readdir_large_page(pages, array_size); if (status < 0) - dir_page_release(desc); - out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status); + goto out_release_array; + do { + unsigned int pglen; + status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); + + if (status < 0) + break; + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); + + 
nfs_readdir_free_large_page(pages_ptr, pages, array_size); +out_release_array: + nfs_readdir_release_array(page); +out_label_free: + nfs4_label_free(entry.label); +out: + nfs_free_fattr(entry.fattr); + nfs_free_fhandle(entry.fh); return status; } /* - * Recurse through the page cache pages, and return a - * filled nfs_entry structure of the next directory entry if possible. - * - * The target for the search is '*desc->dir_cookie' if non-0, - * 'desc->file->f_pos' otherwise + * Now we cache directories properly, by converting xdr information + * to an array that can be used for lookups later. This results in + * fewer cache pages, since we can store more information on each page. + * We only need to convert from xdr once so future lookups are much simpler */ -static inline -int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +static +int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) { - int loop_count = 0; - int res; + struct inode *inode = file_inode(desc->file); + int ret; - /* Always search-by-index from the beginning of the cache */ - if (*desc->dir_cookie == 0) { - dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for offset %Ld\n", - (long long)desc->file->f_pos); - desc->page_index = 0; - desc->entry->cookie = desc->entry->prev_cookie = 0; - desc->entry->eof = 0; - desc->current_index = 0; - } else - dfprintk(DIRCACHE, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", - (unsigned long long)*desc->dir_cookie); + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; + SetPageUptodate(page); - for (;;) { - res = find_dirent_page(desc); - if (res != -EAGAIN) - break; - /* Align to beginning of next page */ - desc->page_index ++; - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } + if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { + /* Should never happen */ + nfs_zap_mapping(inode, inode->i_mapping); } + unlock_page(page); + return 0; + 
error: + unlock_page(page); + return ret; +} - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, res); - return res; +static +void cache_page_release(nfs_readdir_descriptor_t *desc) +{ + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); + page_cache_release(desc->page); + desc->page = NULL; } -static inline unsigned int dt_type(struct inode *inode) +static +struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - return (inode->i_mode >> 12) & 15; + return read_cache_page(file_inode(desc->file)->i_mapping, + desc->page_index, (filler_t *)nfs_readdir_filler, desc); } -static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc); +/* + * Returns 0 if desc->dir_cookie was found on page desc->page_index + */ +static +int find_cache_page(nfs_readdir_descriptor_t *desc) +{ + int res; + + desc->page = get_cache_page(desc); + if (IS_ERR(desc->page)) + return PTR_ERR(desc->page); + + res = nfs_readdir_search_array(desc); + if (res != 0) + cache_page_release(desc); + return res; +} + +/* Search for desc->dir_cookie from the beginning of the page cache */ +static inline +int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +{ + int res; + + if (desc->page_index == 0) { + desc->current_index = 0; + desc->last_cookie = 0; + } + do { + res = find_cache_page(desc); + } while (res == -EAGAIN); + return res; +} /* * Once we've found the start of the dirent within a page: fill 'er up... 
*/ static -int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, - filldir_t filldir) +int nfs_do_filldir(nfs_readdir_descriptor_t *desc) { struct file *file = desc->file; - struct nfs_entry *entry = desc->entry; - struct dentry *dentry = NULL; - u64 fileid; - int loop_count = 0, - res; - - dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", - (unsigned long long)entry->cookie); - - for(;;) { - unsigned d_type = DT_UNKNOWN; - /* Note: entry->prev_cookie contains the cookie for - * retrieving the current dirent on the server */ - fileid = entry->ino; - - /* Get a dentry if we have one */ - if (dentry != NULL) - dput(dentry); - dentry = nfs_readdir_lookup(desc); + int i = 0; + int res = 0; + struct nfs_cache_array *array = NULL; + struct nfs_open_dir_context *ctx = file->private_data; - /* Use readdirplus info */ - if (dentry != NULL && dentry->d_inode != NULL) { - d_type = dt_type(dentry->d_inode); - fileid = NFS_FILEID(dentry->d_inode); - } + array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } - res = filldir(dirent, entry->name, entry->len, - file->f_pos, nfs_compat_user_ino64(fileid), - d_type); - if (res < 0) - break; - file->f_pos++; - *desc->dir_cookie = entry->cookie; - if (dir_decode(desc) != 0) { - desc->page_index ++; + for (i = desc->cache_entry_index; i < array->size; i++) { + struct nfs_cache_array_entry *ent; + + ent = &array->array[i]; + if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + nfs_compat_user_ino64(ent->ino), ent->d_type)) { + desc->eof = 1; break; } - if (loop_count++ > 200) { - loop_count = 0; - schedule(); - } + desc->ctx->pos++; + if (i < (array->size-1)) + *desc->dir_cookie = array->array[i+1].cookie; + else + *desc->dir_cookie = array->last_cookie; + if (ctx->duped != 0) + ctx->duped = 1; } - dir_page_release(desc); - if (dentry != NULL) - dput(dentry); + if (array->eof_index >= 0) + desc->eof = 1; + + 
nfs_readdir_release_array(desc->page); +out: + cache_page_release(desc); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); return res; @@ -462,15 +801,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, * directory in the page cache by the time we get here. */ static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, - filldir_t filldir) +int uncached_readdir(nfs_readdir_descriptor_t *desc) { - struct file *file = desc->file; - struct inode *inode = file->f_path.dentry->d_inode; - struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; - unsigned long timestamp; + struct inode *inode = file_inode(desc->file); + struct nfs_open_dir_context *ctx = desc->file->private_data; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -480,131 +816,122 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, status = -ENOMEM; goto out; } - timestamp = jiffies; - status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, - *desc->dir_cookie, page, - NFS_SERVER(inode)->dtsize, - desc->plus); + + desc->page_index = 0; + desc->last_cookie = *desc->dir_cookie; desc->page = page; - desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ - if (status >= 0) { - desc->timestamp = timestamp; - desc->timestamp_valid = 1; - if ((status = dir_decode(desc)) == 0) - desc->entry->prev_cookie = *desc->dir_cookie; - } else - status = -EIO; + ctx->duped = 0; + + status = nfs_readdir_xdr_to_array(desc, page, inode); if (status < 0) goto out_release; - status = nfs_do_filldir(desc, dirent, filldir); + status = nfs_do_filldir(desc); - /* Reset read descriptor so it searches the page cache from - * the start upon the next call to readdir_search_pagecache() */ - desc->page_index = 0; - desc->entry->cookie = desc->entry->prev_cookie = 0; - desc->entry->eof = 0; 
out: dfprintk(DIRCACHE, "NFS: %s: returns %d\n", - __FUNCTION__, status); + __func__, status); return status; out_release: - dir_page_release(desc); + cache_page_release(desc); goto out; } +static bool nfs_dir_mapping_need_revalidate(struct inode *dir) +{ + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_attribute_cache_expired(dir)) + return true; + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return true; + return false; +} + /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. */ -static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int nfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; - struct nfs_entry my_entry; - struct nfs_fh fh; - struct nfs_fattr fattr; - long res; - - dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - (long long)filp->f_pos); - nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); + struct nfs_open_dir_context *dir_ctx = file->private_data; + int res = 0; - lock_kernel(); + dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", + file, (long long)ctx->pos); + nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); /* - * filp->f_pos points to the dirent entry number. + * ctx->pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have * to either find the entry with the appropriate number or * revalidate the cookie. 
*/ memset(desc, 0, sizeof(*desc)); - desc->file = filp; - desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; + desc->file = file; + desc->ctx = ctx; + desc->dir_cookie = &dir_ctx->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; - desc->plus = NFS_USE_READDIRPLUS(inode); - - my_entry.cookie = my_entry.prev_cookie = 0; - my_entry.eof = 0; - my_entry.fh = &fh; - my_entry.fattr = &fattr; - nfs_fattr_init(&fattr); - desc->entry = &my_entry; + desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; nfs_block_sillyrename(dentry); - res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); + if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) + res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; - while(!desc->entry->eof) { + do { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { + res = 0; /* This means either end of directory */ - if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { + if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ - res = uncached_readdir(desc, dirent, filldir); - if (res >= 0) + res = uncached_readdir(desc); + if (res == 0) continue; } - res = 0; break; } if (res == -ETOOSMALL && desc->plus) { clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); nfs_zap_caches(inode); + desc->page_index = 0; desc->plus = 0; - desc->entry->eof = 0; + desc->eof = 0; continue; } if (res < 0) break; - res = nfs_do_filldir(desc, dirent, filldir); - if (res < 0) { - res = 0; + res = nfs_do_filldir(desc); + if (res < 0) break; - } - } + } while (!desc->eof); out: nfs_unblock_sillyrename(dentry); - unlock_kernel(); if (res > 0) res = 0; - dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - res); + dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; } -static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) +static loff_t nfs_llseek_dir(struct file *filp, 
loff_t offset, int whence) { - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); - switch (origin) { + struct inode *inode = file_inode(filp); + struct nfs_open_dir_context *dir_ctx = filp->private_data; + + dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", + filp, offset, whence); + + mutex_lock(&inode->i_mutex); + switch (whence) { case 1: offset += filp->f_pos; case 0: @@ -616,10 +943,11 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) } if (offset != filp->f_pos) { filp->f_pos = offset; - nfs_file_open_context(filp)->dir_cookie = 0; + dir_ctx->dir_cookie = 0; + dir_ctx->duped = 0; } out: - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); return offset; } @@ -627,12 +955,16 @@ out: * All directory operations under NFS are synchronous, so fsync() * is a dummy operation. */ -static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) +static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, + int datasync) { - dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - datasync); + struct inode *inode = file_inode(filp); + + dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); + mutex_lock(&inode->i_mutex); + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); + mutex_unlock(&inode->i_mutex); return 0; } @@ -648,8 +980,9 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute = jiffies; + NFS_I(dir)->cache_change_attribute++; } +EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); /* * A check for whether or not the parent directory has changed. 
@@ -660,6 +993,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ @@ -671,30 +1006,14 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) } /* - * Return the intent data that applies to this particular path component - * - * Note that the current set of intents only apply to the very last - * component of the path. - * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. - */ -static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) -{ - if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) - return 0; - return nd->flags & mask; -} - -/* * Use intent information to check whether or not we're going to do * an O_EXCL create using this path component. */ -static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) +static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) { if (NFS_PROTO(dir)->version == 2) return 0; - if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) - return 0; - return (nd->intent.open.flags & O_EXCL) != 0; + return flags & LOOKUP_EXCL; } /* @@ -705,28 +1024,28 @@ static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) * particular file and the "nocto" mount flag is not set. 
* */ -static inline -int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd) +static +int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) { struct nfs_server *server = NFS_SERVER(inode); + int ret; - if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags)) - return 0; - if (nd != NULL) { - /* VFS wants an on-the-wire revalidation */ - if (nd->flags & LOOKUP_REVAL) - goto out_force; - /* This is an open(2) */ - if (nfs_lookup_check_intent(nd, LOOKUP_OPEN) != 0 && - !(server->flags & NFS_MOUNT_NOCTO) && - (S_ISREG(inode->i_mode) || - S_ISDIR(inode->i_mode))) - goto out_force; + if (IS_AUTOMOUNT(inode)) return 0; - } - return nfs_revalidate_inode(server, inode); + /* VFS wants an on-the-wire revalidation */ + if (flags & LOOKUP_REVAL) + goto out_force; + /* This is an open(2) */ + if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + goto out_force; +out: + return (inode->i_nlink == 0) ? -ENOENT : 0; out_force: - return __nfs_revalidate_inode(server, inode); + ret = __nfs_revalidate_inode(server, inode); + if (ret != 0) + return ret; + goto out; } /* @@ -738,11 +1057,13 @@ out_force: */ static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { /* Don't revalidate a negative dentry if we're creating a new file */ - if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) + if (flags & LOOKUP_CREATE) return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; return !nfs_check_verifier(dir, dentry); } @@ -757,37 +1078,42 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. 
*/ -static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) +static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *dir; struct inode *inode; struct dentry *parent; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; int error; - struct nfs_fh fhandle; - struct nfs_fattr fattr; + + if (flags & LOOKUP_RCU) + return -ECHILD; parent = dget_parent(dentry); - lock_kernel(); dir = parent->d_inode; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = dentry->d_inode; if (!inode) { - if (nfs_neg_need_reval(dir, dentry, nd)) + if (nfs_neg_need_reval(dir, dentry, flags)) goto out_bad; - goto out_valid; + goto out_valid_noent; } if (is_bad_inode(inode)) { - dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n", - __FUNCTION__, dentry->d_parent->d_name.name, - dentry->d_name.name); + dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", + __func__, dentry); goto out_bad; } + if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + goto out_set_verifier; + /* Force a full look up iff the parent directory has changed */ - if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { - if (nfs_lookup_verify_inode(inode, nd)) + if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) { + if (nfs_lookup_verify_inode(inode, flags)) goto out_zap_parent; goto out_valid; } @@ -795,51 +1121,123 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) if (NFS_STALE(inode)) goto out_bad; - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); + error = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out_error; + + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(label)) + goto out_error; + + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); 
+ trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); if (error) goto out_bad; - if (nfs_compare_fh(NFS_FH(inode), &fhandle)) + if (nfs_compare_fh(NFS_FH(inode), fhandle)) goto out_bad; - if ((error = nfs_refresh_inode(inode, &fattr)) != 0) + if ((error = nfs_refresh_inode(inode, fattr)) != 0) goto out_bad; + nfs_setsecurity(inode, fattr, label); + + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); + +out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: - unlock_kernel(); + /* Success: notify readdir to use READDIRPLUS */ + nfs_advise_use_readdirplus(dir); + out_valid_noent: dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", - __FUNCTION__, dentry->d_parent->d_name.name, - dentry->d_name.name); + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", + __func__, dentry); return 1; out_zap_parent: nfs_zap_caches(dir); out_bad: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); nfs_mark_for_revalidate(dir); if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ nfs_zap_caches(inode); - /* If we have submounts, don't unhash ! */ - if (have_submounts(dentry)) + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (IS_ROOT(dentry)) goto out_valid; - shrink_dcache_parent(dentry); } - d_drop(dentry); - unlock_kernel(); + /* If we have submounts, don't unhash ! 
*/ + if (check_submounts_and_drop(dentry) != 0) + goto out_valid; + dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", - __FUNCTION__, dentry->d_parent->d_name.name, - dentry->d_name.name); + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", + __func__, dentry); return 0; +out_error: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", + __func__, dentry, error); + return error; +} + +/* + * A weaker form of d_revalidate for revalidating just the dentry->d_inode + * when we don't really care about the dentry name. This is called when a + * pathwalk ends on a dentry that was not found via a normal lookup in the + * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). + * + * In this situation, we just want to verify that the inode itself is OK + * since the dentry might have changed on the server. + */ +static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + int error; + struct inode *inode = dentry->d_inode; + + /* + * I believe we can only get a negative dentry here in the case of a + * procfs-style symlink. Just assume it's correct for now, but we may + * eventually need to do something more here. + */ + if (!inode) { + dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n", + __func__, dentry); + return 1; + } + + if (is_bad_inode(inode)) { + dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", + __func__, dentry); + return 0; + } + + error = nfs_revalidate_inode(NFS_SERVER(inode), inode); + dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", + __func__, inode->i_ino, error ? "invalid" : "valid"); + return !error; } /* * This is called from dput() when d_count is going to 0. 
*/ -static int nfs_dentry_delete(struct dentry *dentry) +static int nfs_dentry_delete(const struct dentry *dentry) { - dfprintk(VFS, "NFS: dentry_delete(%s/%s, %x)\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - dentry->d_flags); + dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n", + dentry, dentry->d_flags); /* Unhash any dentry with a stale inode */ if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) @@ -858,6 +1256,17 @@ static int nfs_dentry_delete(struct dentry *dentry) } +/* Ensure that we revalidate inode->i_nlink */ +static void nfs_drop_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + /* drop the inode if we're reasonably sure this is the last link */ + if (inode->i_nlink == 1) + clear_nlink(inode); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); +} + /* * Called when the dentry loses inode. * We use it to clean up silly-renamed files. @@ -869,67 +1278,89 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { - lock_kernel(); - drop_nlink(inode); nfs_complete_unlink(dentry, inode); - unlock_kernel(); + nfs_drop_nlink(inode); } iput(inode); } -struct dentry_operations nfs_dentry_operations = { +static void nfs_d_release(struct dentry *dentry) +{ + /* free cached devname value, if it survived that far */ + if (unlikely(dentry->d_fsdata)) { + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + WARN_ON(1); + else + kfree(dentry->d_fsdata); + } +} + +const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, + .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, }; +EXPORT_SYMBOL_GPL(nfs_dentry_operations); -static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *nfs_lookup(struct 
inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; struct dentry *parent; struct inode *inode = NULL; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; int error; - struct nfs_fh fhandle; - struct nfs_fattr fattr; - dfprintk(VFS, "NFS: lookup(%s/%s)\n", - dentry->d_parent->d_name.name, dentry->d_name.name); + dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); res = ERR_PTR(-ENAMETOOLONG); if (dentry->d_name.len > NFS_SERVER(dir)->namelen) goto out; - res = ERR_PTR(-ENOMEM); - dentry->d_op = NFS_PROTO(dir)->dentry_ops; - - lock_kernel(); - /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. */ - if (nfs_is_exclusive_create(dir, nd)) { + if (nfs_is_exclusive_create(dir, flags)) { d_instantiate(dentry, NULL); res = NULL; - goto out_unlock; + goto out; } + res = ERR_PTR(-ENOMEM); + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out; + + label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT); + if (IS_ERR(label)) + goto out; + parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ + trace_nfs_lookup_enter(dir, dentry, flags); nfs_block_sillyrename(parent); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error == -ENOENT) goto no_entry; if (error < 0) { res = ERR_PTR(error); goto out_unblock_sillyrename; } - inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); - res = (struct dentry *)inode; + inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); + res = ERR_CAST(inode); if (IS_ERR(res)) goto out_unblock_sillyrename; + /* Success: notify readdir to use READDIRPLUS */ + nfs_advise_use_readdirplus(dir); + no_entry: res = d_materialise_unique(dentry, inode); if (res != NULL) { @@ -940,224 +1371,220 @@ no_entry: nfs_set_verifier(dentry, 
nfs_save_change_attribute(dir)); out_unblock_sillyrename: nfs_unblock_sillyrename(parent); -out_unlock: - unlock_kernel(); + trace_nfs_lookup_exit(dir, dentry, flags, error); + nfs4_label_free(label); out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); return res; } +EXPORT_SYMBOL_GPL(nfs_lookup); -#ifdef CONFIG_NFS_V4 -static int nfs_open_revalidate(struct dentry *, struct nameidata *); +#if IS_ENABLED(CONFIG_NFS_V4) +static int nfs4_lookup_revalidate(struct dentry *, unsigned int); -struct dentry_operations nfs4_dentry_operations = { - .d_revalidate = nfs_open_revalidate, +const struct dentry_operations nfs4_dentry_operations = { + .d_revalidate = nfs4_lookup_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, + .d_automount = nfs_d_automount, + .d_release = nfs_d_release, }; +EXPORT_SYMBOL_GPL(nfs4_dentry_operations); -/* - * Use intent information to determine whether we need to substitute - * the NFSv4-style stateful OPEN for the LOOKUP call - */ -static int is_atomic_open(struct inode *dir, struct nameidata *nd) +static fmode_t flags_to_mode(int flags) { - if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) - return 0; - /* NFS does not (yet) have a stateful open for directories */ - if (nd->flags & LOOKUP_DIRECTORY) - return 0; - /* Are we trying to write to a read only partition? 
*/ - if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) - return 0; - return 1; + fmode_t res = (__force fmode_t)flags & FMODE_EXEC; + if ((flags & O_ACCMODE) != O_WRONLY) + res |= FMODE_READ; + if ((flags & O_ACCMODE) != O_RDONLY) + res |= FMODE_WRITE; + return res; } -static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) { - struct dentry *res = NULL; - int error; + return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); +} - dfprintk(VFS, "NFS: atomic_lookup(%s/%ld), %s\n", - dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); +static int do_open(struct inode *inode, struct file *filp) +{ + nfs_fscache_open_file(inode, filp); + return 0; +} - /* Check that we are indeed trying to open this file */ - if (!is_atomic_open(dir, nd)) - goto no_open; +static int nfs_finish_open(struct nfs_open_context *ctx, + struct dentry *dentry, + struct file *file, unsigned open_flags, + int *opened) +{ + int err; + + if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + *opened |= FILE_CREATED; - if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { - res = ERR_PTR(-ENAMETOOLONG); + err = finish_open(file, dentry, do_open, opened); + if (err) goto out; + nfs_file_set_open_context(file, ctx); + +out: + return err; +} + +int nfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode, int *opened) +{ + struct nfs_open_context *ctx; + struct dentry *res; + struct iattr attr = { .ia_valid = ATTR_OPEN }; + struct inode *inode; + unsigned int lookup_flags = 0; + int err; + + /* Expect a negative dentry */ + BUG_ON(dentry->d_inode); + + dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); + + err = nfs_check_flags(open_flags); + if (err) + return err; + + /* NFS only supports OPEN on regular files */ + if 
((open_flags & O_DIRECTORY)) { + if (!d_unhashed(dentry)) { + /* + * Hashed negative dentry with O_DIRECTORY: dentry was + * revalidated and is fine, no need to perform lookup + * again + */ + return -ENOENT; + } + lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY; + goto no_open; } - dentry->d_op = NFS_PROTO(dir)->dentry_ops; - /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash - * the dentry. */ - if (nd->intent.open.flags & O_EXCL) { - d_instantiate(dentry, NULL); - goto out; + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + return -ENAMETOOLONG; + + if (open_flags & O_CREAT) { + attr.ia_valid |= ATTR_MODE; + attr.ia_mode = mode & ~current_umask(); + } + if (open_flags & O_TRUNC) { + attr.ia_valid |= ATTR_SIZE; + attr.ia_size = 0; } - /* Open the file on the server */ - lock_kernel(); - res = nfs4_atomic_open(dir, dentry, nd); - unlock_kernel(); - if (IS_ERR(res)) { - error = PTR_ERR(res); - switch (error) { - /* Make a negative dentry */ - case -ENOENT: - res = NULL; - goto out; - /* This turned out not to be a regular file */ - case -EISDIR: - case -ENOTDIR: + ctx = create_nfs_open_context(dentry, open_flags); + err = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + + trace_nfs_atomic_open_enter(dir, ctx, open_flags); + nfs_block_sillyrename(dentry->d_parent); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); + nfs_unblock_sillyrename(dentry->d_parent); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); + put_nfs_open_context(ctx); + switch (err) { + case -ENOENT: + d_drop(dentry); + d_add(dentry, NULL); + break; + case -EISDIR: + case -ENOTDIR: + goto no_open; + case -ELOOP: + if (!(open_flags & O_NOFOLLOW)) goto no_open; - case -ELOOP: - if (!(nd->intent.open.flags & O_NOFOLLOW)) - goto no_open; + break; /* case -EINVAL: */ - default: - goto out; + default: + break; } - } else if (res != NULL) - dentry = res; + goto out; + } + + err = nfs_finish_open(ctx, ctx->dentry, 
file, open_flags, opened); + trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); + put_nfs_open_context(ctx); out: - return res; + return err; + no_open: - return nfs_lookup(dir, dentry, nd); + res = nfs_lookup(dir, dentry, lookup_flags); + err = PTR_ERR(res); + if (IS_ERR(res)) + goto out; + + return finish_no_open(file, res); } +EXPORT_SYMBOL_GPL(nfs_atomic_open); -static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) +static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *parent = NULL; - struct inode *inode = dentry->d_inode; + struct inode *inode; struct inode *dir; - int openflags, ret = 0; + int ret = 0; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) + goto no_open; + if (d_mountpoint(dentry)) + goto no_open; + if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) + goto no_open; + inode = dentry->d_inode; parent = dget_parent(dentry); dir = parent->d_inode; - if (!is_atomic_open(dir, nd)) - goto no_open; + /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ if (inode == NULL) { - if (!nfs_neg_need_reval(dir, dentry, nd)) + if (!nfs_neg_need_reval(dir, dentry, flags)) ret = 1; goto out; } /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) - goto no_open; - openflags = nd->intent.open.flags; + goto no_open_dput; /* We cannot do exclusive creation on a positive dentry */ - if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) - goto no_open; - /* We can't create new files, or truncate existing ones here */ - openflags &= ~(O_CREAT|O_TRUNC); + if (flags & LOOKUP_EXCL) + goto no_open_dput; + + /* Let f_op->open() actually open (and revalidate) the file */ + ret = 1; - /* - * Note: we're not holding inode->i_mutex and so may be racing with - * operations that change the directory. 
We therefore save the - * change attribute *before* we do the RPC call. - */ - lock_kernel(); - ret = nfs4_open_revalidate(dir, dentry, openflags, nd); - unlock_kernel(); out: dput(parent); - if (!ret) - d_drop(dentry); return ret; -no_open: + +no_open_dput: dput(parent); - if (inode != NULL && nfs_have_delegation(inode, FMODE_READ)) - return 1; - return nfs_lookup_revalidate(dentry, nd); +no_open: + return nfs_lookup_revalidate(dentry, flags); } -#endif /* CONFIG_NFSV4 */ -static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) -{ - struct dentry *parent = desc->file->f_path.dentry; - struct inode *dir = parent->d_inode; - struct nfs_entry *entry = desc->entry; - struct dentry *dentry, *alias; - struct qstr name = { - .name = entry->name, - .len = entry->len, - }; - struct inode *inode; - unsigned long verf = nfs_save_change_attribute(dir); - - switch (name.len) { - case 2: - if (name.name[0] == '.' && name.name[1] == '.') - return dget_parent(parent); - break; - case 1: - if (name.name[0] == '.') - return dget(parent); - } - - spin_lock(&dir->i_lock); - if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { - spin_unlock(&dir->i_lock); - return NULL; - } - spin_unlock(&dir->i_lock); - - name.hash = full_name_hash(name.name, name.len); - dentry = d_lookup(parent, &name); - if (dentry != NULL) { - /* Is this a positive dentry that matches the readdir info? */ - if (dentry->d_inode != NULL && - (NFS_FILEID(dentry->d_inode) == entry->ino || - d_mountpoint(dentry))) { - if (!desc->plus || entry->fh->size == 0) - return dentry; - if (nfs_compare_fh(NFS_FH(dentry->d_inode), - entry->fh) == 0) - goto out_renew; - } - /* No, so d_drop to allow one to be created */ - d_drop(dentry); - dput(dentry); - } - if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) - return NULL; - if (name.len > NFS_SERVER(dir)->namelen) - return NULL; - /* Note: caller is already holding the dir->i_mutex! 
*/ - dentry = d_alloc(parent, &name); - if (dentry == NULL) - return NULL; - dentry->d_op = NFS_PROTO(dir)->dentry_ops; - inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); - if (IS_ERR(inode)) { - dput(dentry); - return NULL; - } - - alias = d_materialise_unique(dentry, inode); - if (alias != NULL) { - dput(dentry); - if (IS_ERR(alias)) - return NULL; - dentry = alias; - } - -out_renew: - nfs_set_verifier(dentry, verf); - return dentry; -} +#endif /* CONFIG_NFSV4 */ /* * Code common to create, mkdir, and mknod. */ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, + struct nfs4_label *label) { struct dentry *parent = dget_parent(dentry); struct inode *dir = parent->d_inode; @@ -1170,18 +1597,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, if (dentry->d_inode) goto out; if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) goto out_error; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (!(fattr->valid & NFS_ATTR_FATTR)) { struct nfs_server *server = NFS_SB(dentry->d_sb); - error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL); if (error < 0) goto out_error; } - inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); error = PTR_ERR(inode); if (IS_ERR(inode)) goto out_error; @@ -1194,6 +1621,7 @@ out_error: dput(parent); return error; } +EXPORT_SYMBOL_GPL(nfs_instantiate); /* * Following a failed create operation, we drop the dentry rather @@ -1201,45 +1629,42 @@ out_error: * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. 
*/ -static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +int nfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) { struct iattr attr; + int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; - int open_flags = 0; - dfprintk(VFS, "NFS: create(%s/%ld), %s\n", - dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - if ((nd->flags & LOOKUP_CREATE) != 0) - open_flags = nd->intent.open.flags; - - lock_kernel(); - error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); + trace_nfs_create_enter(dir, dentry, open_flags); + error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); + trace_nfs_create_exit(dir, dentry, open_flags, error); if (error != 0) goto out_err; - unlock_kernel(); return 0; out_err: - unlock_kernel(); d_drop(dentry); return error; } +EXPORT_SYMBOL_GPL(nfs_create); /* * See comments for nfs_proc_create regarding failed operations. 
*/ -static int -nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +int +nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; int status; - dfprintk(VFS, "NFS: mknod(%s/%ld), %s\n", - dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); if (!new_valid_dev(rdev)) return -EINVAL; @@ -1247,43 +1672,43 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - lock_kernel(); + trace_nfs_mknod_enter(dir, dentry); status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); + trace_nfs_mknod_exit(dir, dentry, status); if (status != 0) goto out_err; - unlock_kernel(); return 0; out_err: - unlock_kernel(); d_drop(dentry); return status; } +EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ -static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; int error; - dfprintk(VFS, "NFS: mkdir(%s/%ld), %s\n", - dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); + dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; attr.ia_mode = mode | S_IFDIR; - lock_kernel(); + trace_nfs_mkdir_enter(dir, dentry); error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + trace_nfs_mkdir_exit(dir, dentry, error); if (error != 0) goto out_err; - unlock_kernel(); return 0; out_err: d_drop(dentry); - unlock_kernel(); return error; } +EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { @@ -1291,94 +1716,32 @@ static void nfs_dentry_handle_enoent(struct dentry *dentry) d_delete(dentry); } -static int nfs_rmdir(struct inode *dir, struct dentry *dentry) +int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: 
rmdir(%s/%ld), %s\n", - dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - - lock_kernel(); - error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); - /* Ensure the VFS deletes this inode */ - if (error == 0 && dentry->d_inode != NULL) - clear_nlink(dentry->d_inode); - else if (error == -ENOENT) - nfs_dentry_handle_enoent(dentry); - unlock_kernel(); - - return error; -} - -static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) -{ - static unsigned int sillycounter; - const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; - const int countersize = sizeof(sillycounter)*2; - const int slen = sizeof(".nfs")+fileidsize+countersize-1; - char silly[slen+1]; - struct qstr qsilly; - struct dentry *sdentry; - int error = -EIO; - - dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - atomic_read(&dentry->d_count)); - nfs_inc_stats(dir, NFSIOS_SILLYRENAME); + dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", + dir->i_sb->s_id, dir->i_ino, dentry); - /* - * We don't allow a dentry to be silly-renamed twice. - */ - error = -EBUSY; - if (dentry->d_flags & DCACHE_NFSFS_RENAMED) - goto out; - - sprintf(silly, ".nfs%*.*Lx", - fileidsize, fileidsize, - (unsigned long long)NFS_FILEID(dentry->d_inode)); - - /* Return delegation in anticipation of the rename */ - nfs_inode_return_delegation(dentry->d_inode); - - sdentry = NULL; - do { - char *suffix = silly + slen - countersize; - - dput(sdentry); - sillycounter++; - sprintf(suffix, "%*.*x", countersize, countersize, sillycounter); - - dfprintk(VFS, "NFS: trying to rename %s to %s\n", - dentry->d_name.name, silly); - - sdentry = lookup_one_len(silly, dentry->d_parent, slen); - /* - * N.B. Better to return EBUSY here ... it could be - * dangerous to delete the file while it's in use. 
- */ - if (IS_ERR(sdentry)) - goto out; - } while(sdentry->d_inode != NULL); /* need negative lookup */ - - qsilly.name = silly; - qsilly.len = strlen(silly); + trace_nfs_rmdir_enter(dir, dentry); if (dentry->d_inode) { - error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, - dir, &qsilly); - nfs_mark_for_revalidate(dentry->d_inode); + nfs_wait_on_sillyrename(dentry); + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + /* Ensure the VFS deletes this inode */ + switch (error) { + case 0: + clear_nlink(dentry->d_inode); + break; + case -ENOENT: + nfs_dentry_handle_enoent(dentry); + } } else - error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, - dir, &qsilly); - if (!error) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - d_move(dentry, sdentry); - error = nfs_async_unlink(dir, dentry); - /* If we return 0 we don't unlink */ - } - dput(sdentry); -out: + error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + trace_nfs_rmdir_exit(dir, dentry, error); + return error; } +EXPORT_SYMBOL_GPL(nfs_rmdir); /* * Remove a file after making sure there are no pending writes, @@ -1393,8 +1756,7 @@ static int nfs_safe_remove(struct dentry *dentry) struct inode *inode = dentry->d_inode; int error = -EBUSY; - dfprintk(VFS, "NFS: safe_remove(%s/%s)\n", - dentry->d_parent->d_name.name, dentry->d_name.name); + dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry); /* If the dentry was sillyrenamed, we simply call d_delete() */ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { @@ -1402,17 +1764,17 @@ static int nfs_safe_remove(struct dentry *dentry) goto out; } + trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { - nfs_inode_return_delegation(inode); + NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); - /* The VFS may want to delete this inode */ if (error == 0) - drop_nlink(inode); - nfs_mark_for_revalidate(inode); + nfs_drop_nlink(inode); } else error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); if (error == -ENOENT) 
nfs_dentry_handle_enoent(dentry); + trace_nfs_remove_exit(dir, dentry, error); out: return error; } @@ -1422,40 +1784,38 @@ out: * * If sillyrename() returns 0, we do nothing, otherwise we unlink. */ -static int nfs_unlink(struct inode *dir, struct dentry *dentry) +int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; int need_rehash = 0; - dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, - dir->i_ino, dentry->d_name.name); + dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, + dir->i_ino, dentry); - lock_kernel(); - spin_lock(&dcache_lock); + trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) > 1) { + if (d_count(dentry) > 1) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); error = nfs_sillyrename(dir, dentry); - unlock_kernel(); - return error; + goto out; } if (!d_unhashed(dentry)) { __d_drop(dentry); need_rehash = 1; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); error = nfs_safe_remove(dentry); if (!error || error == -ENOENT) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } else if (need_rehash) d_rehash(dentry); - unlock_kernel(); +out: + trace_nfs_unlink_exit(dir, dentry, error); return error; } +EXPORT_SYMBOL_GPL(nfs_unlink); /* * To create a symbolic link, most file systems instantiate a new inode, @@ -1472,17 +1832,16 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. 
*/ -static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct pagevec lru_pvec; struct page *page; char *kaddr; struct iattr attr; unsigned int pathlen = strlen(symname); int error; - dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, - dir->i_ino, dentry->d_name.name, symname); + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) return -ENAMETOOLONG; @@ -1490,28 +1849,25 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - lock_kernel(); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - unlock_kernel(); + if (!page) return -ENOMEM; - } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); + trace_nfs_symlink_enter(dir, dentry); error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { - dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, - dentry->d_name.name, symname, error); + dentry, symname, error); d_drop(dentry); __free_page(page); - unlock_kernel(); return error; } @@ -1519,40 +1875,44 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. 
*/ - pagevec_init(&lru_pvec, 0); - if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { - pagevec_add(&lru_pvec, page); - pagevec_lru_add(&lru_pvec); SetPageUptodate(page); unlock_page(page); + /* + * add_to_page_cache_lru() grabs an extra page refcount. + * Drop it here to avoid leaking this page later. + */ + page_cache_release(page); } else __free_page(page); - unlock_kernel(); return 0; } +EXPORT_SYMBOL_GPL(nfs_symlink); -static int +int nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; int error; - dfprintk(VFS, "NFS: link(%s/%s -> %s/%s)\n", - old_dentry->d_parent->d_name.name, old_dentry->d_name.name, - dentry->d_parent->d_name.name, dentry->d_name.name); + dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", + old_dentry, dentry); + + trace_nfs_link_enter(inode, dir, dentry); + NFS_PROTO(inode)->return_delegation(inode); - lock_kernel(); d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { - atomic_inc(&inode->i_count); + ihold(inode); d_add(dentry, inode); } - unlock_kernel(); + trace_nfs_link_exit(inode, dir, dentry, error); return error; } +EXPORT_SYMBOL_GPL(nfs_link); /* * RENAME @@ -1578,87 +1938,79 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) * If these conditions are met, we can drop the dentries before doing * the rename. 
*/ -static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, +int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct dentry *dentry = NULL, *rehash = NULL; + struct rpc_task *task; int error = -EBUSY; + dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", + old_dentry, new_dentry, + d_count(new_dentry)); + + trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry); /* - * To prevent any new references to the target during the rename, - * we unhash the dentry and free the inode in advance. + * For non-directories, check whether the target is busy and if so, + * make a copy of the dentry and then do a silly-rename. If the + * silly-rename succeeds, the copied dentry is hashed and becomes + * the new target. */ - lock_kernel(); - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - rehash = new_dentry; - } + if (new_inode && !S_ISDIR(new_inode->i_mode)) { + /* + * To prevent any new references to the target during the + * rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } - dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", - old_dentry->d_parent->d_name.name, old_dentry->d_name.name, - new_dentry->d_parent->d_name.name, new_dentry->d_name.name, - atomic_read(&new_dentry->d_count)); + if (d_count(new_dentry) > 2) { + int err; - /* - * First check whether the target is busy ... we can't - * safely do _any_ rename if the target is in use. - * - * For files, make a copy of the dentry and then do a - * silly-rename. If the silly-rename succeeds, the - * copied dentry is hashed and becomes the new target. 
- */ - if (!new_inode) - goto go_ahead; - if (S_ISDIR(new_inode->i_mode)) { - error = -EISDIR; - if (!S_ISDIR(old_inode->i_mode)) - goto out; - } else if (atomic_read(&new_dentry->d_count) > 2) { - int err; - /* copy the target dentry's name */ - dentry = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!dentry) - goto out; + /* copy the target dentry's name */ + dentry = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!dentry) + goto out; - /* silly-rename the existing target ... */ - err = nfs_sillyrename(new_dir, new_dentry); - if (!err) { - new_dentry = rehash = dentry; - new_inode = NULL; - /* instantiate the replacement target */ - d_instantiate(new_dentry, NULL); - } else if (atomic_read(&new_dentry->d_count) > 1) - /* dentry still busy? */ - goto out; - } else - drop_nlink(new_inode); + /* silly-rename the existing target ... */ + err = nfs_sillyrename(new_dir, new_dentry); + if (err) + goto out; -go_ahead: - /* - * ... prune child dentries and writebacks if needed. 
- */ - if (atomic_read(&old_dentry->d_count) > 1) { - if (S_ISREG(old_inode->i_mode)) - nfs_wb_all(old_inode); - shrink_dcache_parent(old_dentry); + new_dentry = dentry; + rehash = NULL; + new_inode = NULL; + } } - nfs_inode_return_delegation(old_inode); - if (new_inode != NULL) { - nfs_inode_return_delegation(new_inode); - d_delete(new_dentry); + NFS_PROTO(old_inode)->return_delegation(old_inode); + if (new_inode != NULL) + NFS_PROTO(new_inode)->return_delegation(new_inode); + + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + if (IS_ERR(task)) { + error = PTR_ERR(task); + goto out; } - error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name); + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); nfs_mark_for_revalidate(old_inode); out: if (rehash) d_rehash(rehash); + trace_nfs_rename_exit(old_dir, old_dentry, + new_dir, new_dentry, error); if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); d_move(old_dentry, new_dentry); nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); @@ -1668,9 +2020,9 @@ out: /* new dentry created? 
*/ if (dentry) dput(dentry); - unlock_kernel(); return error; } +EXPORT_SYMBOL_GPL(nfs_rename); static DEFINE_SPINLOCK(nfs_access_lru_lock); static LIST_HEAD(nfs_access_lru_list); @@ -1680,33 +2032,42 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry) { put_rpccred(entry->cred); kfree(entry); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); +} + +static void nfs_access_free_list(struct list_head *head) +{ + struct nfs_access_entry *cache; + + while (!list_empty(head)) { + cache = list_entry(head->next, struct nfs_access_entry, lru); + list_del(&cache->lru); + nfs_access_free_entry(cache); + } } -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +unsigned long +nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(head); - struct nfs_inode *nfsi; + struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + long freed = 0; + + if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) + return SHRINK_STOP; -restart: spin_lock(&nfs_access_lru_lock); - list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { - struct rw_semaphore *s_umount; + list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { struct inode *inode; if (nr_to_scan-- == 0) break; - s_umount = &nfsi->vfs_inode.i_sb->s_umount; - if (!down_read_trylock(s_umount)) - continue; - inode = igrab(&nfsi->vfs_inode); - if (inode == NULL) { - up_read(s_umount); - continue; - } + inode = &nfsi->vfs_inode; spin_lock(&inode->i_lock); if (list_empty(&nfsi->access_cache_entry_lru)) goto remove_lru_entry; @@ -1714,68 +2075,63 @@ restart: struct nfs_access_entry, lru); list_move(&cache->lru, &head); rb_erase(&cache->rb_node, &nfsi->access_cache); + freed++; if (!list_empty(&nfsi->access_cache_entry_lru)) list_move_tail(&nfsi->access_cache_inode_lru, 
&nfs_access_lru_list); else { remove_lru_entry: list_del_init(&nfsi->access_cache_inode_lru); + smp_mb__before_atomic(); clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); + smp_mb__after_atomic(); } spin_unlock(&inode->i_lock); - spin_unlock(&nfs_access_lru_lock); - iput(inode); - up_read(s_umount); - goto restart; } spin_unlock(&nfs_access_lru_lock); - while (!list_empty(&head)) { - cache = list_entry(head.next, struct nfs_access_entry, lru); - list_del(&cache->lru); - nfs_access_free_entry(cache); - } - return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; + nfs_access_free_list(&head); + return freed; } -static void __nfs_access_zap_cache(struct inode *inode) +unsigned long +nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); +} + +static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) { - struct nfs_inode *nfsi = NFS_I(inode); struct rb_root *root_node = &nfsi->access_cache; - struct rb_node *n, *dispose = NULL; + struct rb_node *n; struct nfs_access_entry *entry; /* Unhook entries from the cache */ while ((n = rb_first(root_node)) != NULL) { entry = rb_entry(n, struct nfs_access_entry, rb_node); rb_erase(n, root_node); - list_del(&entry->lru); - n->rb_left = dispose; - dispose = n; + list_move(&entry->lru, head); } nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; - spin_unlock(&inode->i_lock); - - /* Now kill them all! 
*/ - while (dispose != NULL) { - n = dispose; - dispose = n->rb_left; - nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node)); - } } void nfs_access_zap_cache(struct inode *inode) { + LIST_HEAD(head); + + if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) + return; /* Remove from global LRU init */ - if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { - spin_lock(&nfs_access_lru_lock); + spin_lock(&nfs_access_lru_lock); + if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) list_del_init(&NFS_I(inode)->access_cache_inode_lru); - spin_unlock(&nfs_access_lru_lock); - } spin_lock(&inode->i_lock); - /* This will release the spinlock */ - __nfs_access_zap_cache(inode); + __nfs_access_zap_cache(NFS_I(inode), &head); + spin_unlock(&inode->i_lock); + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); } +EXPORT_SYMBOL_GPL(nfs_access_zap_cache); static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) { @@ -1807,7 +2163,8 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str cache = nfs_access_search_rbtree(inode, cred); if (cache == NULL) goto out; - if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + if (!nfs_have_delegated_attributes(inode) && + !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) goto out_stale; res->jiffies = cache->jiffies; res->cred = cache->cred; @@ -1824,8 +2181,8 @@ out_stale: nfs_access_free_entry(cache); return -ENOENT; out_zap: - /* This will release the spinlock */ - __nfs_access_zap_cache(inode); + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); return -ENOENT; } @@ -1862,7 +2219,7 @@ found: nfs_access_free_entry(entry); } -static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) { struct nfs_access_entry *cache = 
kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) @@ -1875,51 +2232,81 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s nfs_access_add_rbtree(inode, cache); /* Update accounting */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_long_inc(&nfs_access_nr_entries); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* Add inode to global LRU list */ - if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { spin_lock(&nfs_access_lru_lock); - list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); + if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_add_tail(&NFS_I(inode)->access_cache_inode_lru, + &nfs_access_lru_list); spin_unlock(&nfs_access_lru_lock); } } +EXPORT_SYMBOL_GPL(nfs_access_add_cache); + +void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) +{ + entry->mask = 0; + if (access_result & NFS4_ACCESS_READ) + entry->mask |= MAY_READ; + if (access_result & + (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; +} +EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; int status; + trace_nfs_access_enter(inode); + status = nfs_access_get_cached(inode, cred, &cache); if (status == 0) - goto out; + goto out_cached; /* Be clever: ask server to check for all possible rights */ cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; cache.cred = cred; cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); - if (status != 0) - return status; + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } + goto out; + } nfs_access_add_cache(inode, 
&cache); +out_cached: + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) + status = -EACCES; out: - if ((cache.mask & mask) == mask) - return 0; - return -EACCES; + trace_nfs_access_exit(inode, status); + return status; } static int nfs_open_permission_mask(int openflags) { int mask = 0; - if (openflags & FMODE_READ) - mask |= MAY_READ; - if (openflags & FMODE_WRITE) - mask |= MAY_WRITE; - if (openflags & FMODE_EXEC) - mask |= MAY_EXEC; + if (openflags & __FMODE_EXEC) { + /* ONLY check exec rights */ + mask = MAY_EXEC; + } else { + if ((openflags & O_ACCMODE) != O_WRONLY) + mask |= MAY_READ; + if ((openflags & O_ACCMODE) != O_RDONLY) + mask |= MAY_WRITE; + } + return mask; } @@ -1927,29 +2314,28 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) { return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } +EXPORT_SYMBOL_GPL(nfs_may_open); -int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) +int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; int res = 0; + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + nfs_inc_stats(inode, NFSIOS_VFSACCESS); - if (mask == 0) + if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) goto out; /* Is this sys_access() ? */ - if (nd != NULL && (nd->flags & LOOKUP_ACCESS)) + if (mask & (MAY_ACCESS | MAY_CHDIR)) goto force_lookup; switch (inode->i_mode & S_IFMT) { case S_IFLNK: goto out; case S_IFREG: - /* NFSv4 has atomic_open... 
*/ - if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) - && nd != NULL - && (nd->flags & LOOKUP_OPEN)) - goto out; break; case S_IFDIR: /* @@ -1961,29 +2347,29 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) } force_lookup: - lock_kernel(); - if (!NFS_PROTO(inode)->access) goto out_notsup; - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); + cred = rpc_lookup_cred(); if (!IS_ERR(cred)) { res = nfs_do_access(inode, cred, mask); put_rpccred(cred); } else res = PTR_ERR(cred); - unlock_kernel(); out: - dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) + res = -EACCES; + + dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) - res = generic_permission(inode, mask, NULL); - unlock_kernel(); + res = generic_permission(inode, mask); goto out; } +EXPORT_SYMBOL_GPL(nfs_permission); /* * Local variables: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 16844f98f50..f11b9eed0de 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -44,17 +44,20 @@ #include <linux/file.h> #include <linux/pagemap.h> #include <linux/kref.h> +#include <linux/slab.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/module.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/sunrpc/clnt.h> -#include <asm/system.h> #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "internal.h" #include "iostat.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -68,6 +71,7 @@ struct nfs_direct_req { /* I/O parameters */ struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ @@ -75,20 +79,24 @@ struct nfs_direct_req { atomic_t io_count; 
/* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ ssize_t count, /* bytes actually processed */ + bytes_left, /* bytes left to be sent */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ /* commit state */ - struct list_head rewrite_list; /* saved nfs_write_data structs */ - struct nfs_write_data * commit_data; /* special write_data for commits */ + struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ + struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ + struct work_struct work; int flags; #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ struct nfs_writeverf verf; /* unstable write verifier */ }; +static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; +static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); -static const struct rpc_call_ops nfs_write_direct_ops; +static void nfs_direct_write_schedule_work(struct work_struct *work); static inline void get_dreq(struct nfs_direct_req *dreq) { @@ -100,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } +/* + * nfs_direct_select_verf - select the right verifier + * @dreq - direct request possibly spanning multiple servers + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs + * @ds_idx - index of data server in data server list, only valid if ds_clp set + * + * returns the correct verifier to use given the role of the server + */ +static struct nfs_writeverf * +nfs_direct_select_verf(struct nfs_direct_req *dreq, + struct nfs_client *ds_clp, + int ds_idx) +{ + struct nfs_writeverf *verfp = &dreq->verf; + +#ifdef CONFIG_NFS_V4_1 + if (ds_clp) { + /* pNFS is in use, use the DS verf */ + if (ds_idx >= 0 && ds_idx < 
dreq->ds_cinfo.nbuckets) + verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; + else + WARN_ON_ONCE(1); + } +#endif + return verfp; +} + + +/* + * nfs_direct_set_hdr_verf - set the write/commit verifier + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verfs + * + * Set the server's (MDS or DS) "seen" verifier + */ +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + WARN_ON_ONCE(verfp->committed >= 0); + memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + WARN_ON_ONCE(verfp->committed < 0); +} + +/* + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verf + * + * set the server's "seen" verf if not initialized. + * returns result of comparison between @hdr->verf and the "seen" + * verf of the server used by @hdr (DS or MDS) + */ +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + if (verfp->committed < 0) { + nfs_direct_set_hdr_verf(dreq, hdr); + return 0; + } + return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +} + +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data + * @dreq - direct request possibly spanning multiple servers + * @data - commit data to validate against previously seen verf + * + * returns result of comparison between @data->verf and the verf of + * the server used by @data (DS or MDS) + */ +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, + struct nfs_commit_data *data) +{ + struct 
nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, data->ds_clp, + data->ds_commit_index); + WARN_ON_ONCE(verfp->committed < 0); + return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); +} +#endif + /** * nfs_direct_IO - NFS address space operation for direct I/O * @rw: direction (read or write) @@ -109,33 +208,26 @@ static inline int put_dreq(struct nfs_direct_req *dreq) * @nr_segs: size of iovec array * * The presence of this routine in the address space ops vector means - * the NFS client supports direct I/O. However, we shunt off direct - * read and write requests before the VFS gets them, so this method - * should never be called. + * the NFS client supports direct I/O. However, for most direct IO, we + * shunt off direct read and write requests before the VFS gets them, + * so this method is only ever called for swap. */ -ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { - dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp->f_path.dentry->d_name.name, - (long long) pos, nr_segs); +#ifndef CONFIG_NFS_SWAP + dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", + iocb->ki_filp, (long long) pos, iter->nr_segs); return -EINVAL; -} - -static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count) -{ - unsigned int npages; - unsigned int i; - - if (count == 0) - return; - pages += (pgbase >> PAGE_SHIFT); - npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; - for (i = 0; i < npages; i++) { - struct page *page = pages[i]; - if (!PageCompound(page)) - set_page_dirty(page); - } +#else + VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); + + if (rw == READ || rw == KERNEL_READ) + return nfs_file_direct_read(iocb, iter, pos, + rw == READ ? true : false); + return nfs_file_direct_write(iocb, iter, pos, + rw == WRITE ? 
true : false); +#endif /* CONFIG_NFS_SWAP */ } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -145,25 +237,31 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) page_cache_release(pages[i]); } +void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, + struct nfs_direct_req *dreq) +{ + cinfo->lock = &dreq->lock; + cinfo->mds = &dreq->mds_cinfo; + cinfo->ds = &dreq->ds_cinfo; + cinfo->dreq = dreq; + cinfo->completion_ops = &nfs_direct_commit_completion_ops; +} + static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { struct nfs_direct_req *dreq; - dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL); + dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL); if (!dreq) return NULL; kref_init(&dreq->kref); kref_get(&dreq->kref); init_completion(&dreq->completion); - INIT_LIST_HEAD(&dreq->rewrite_list); - dreq->iocb = NULL; - dreq->ctx = NULL; + INIT_LIST_HEAD(&dreq->mds_cinfo.list); + dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ + INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); - atomic_set(&dreq->io_count, 0); - dreq->count = 0; - dreq->error = 0; - dreq->flags = 0; return dreq; } @@ -172,6 +270,8 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) put_nfs_open_context(dreq->ctx); kmem_cache_free(nfs_direct_cachep, dreq); @@ -182,6 +282,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +{ + return dreq->bytes_left; +} +EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); + /* * Collects and returns the final error value/byte-count. */ @@ -208,52 +314,97 @@ out: * Synchronous I/O uses a stack-allocated iocb. 
Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq) +static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { + struct inode *inode = dreq->inode; + + if (dreq->iocb && write) { + loff_t pos = dreq->iocb->ki_pos + dreq->count; + + spin_lock(&inode->i_lock); + if (i_size_read(inode) < pos) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + if (write) + nfs_zap_mapping(inode, inode->i_mapping); + + inode_dio_done(inode); + if (dreq->iocb) { long res = (long) dreq->error; if (!res) res = (long) dreq->count; aio_complete(dreq->iocb, res, 0); } + complete_all(&dreq->completion); nfs_direct_req_release(dreq); } -/* - * We must hold a reference to all the pages in this direct read request - * until the RPCs complete. This could be long *after* we are woken up in - * nfs_direct_wait (for instance, if someone hits ^C on a slow server). - */ -static void nfs_direct_read_result(struct rpc_task *task, void *calldata) +static void nfs_direct_readpage_release(struct nfs_page *req) { - struct nfs_read_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", + req->wb_context->dentry->d_inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + nfs_release_request(req); +} - if (nfs_readpage_result(task, data) != 0) - return; +static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) +{ + unsigned long bytes = 0; + struct nfs_direct_req *dreq = hdr->dreq; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out_put; spin_lock(&dreq->lock); - if (unlikely(task->tk_status < 0)) { - dreq->error = task->tk_status; - spin_unlock(&dreq->lock); - } else { - dreq->count += data->res.count; - spin_unlock(&dreq->lock); - nfs_direct_dirty_pages(data->pagevec, - data->args.pgbase, - 
data->res.count); - } - nfs_direct_release_pages(data->pagevec, data->npages); + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) + dreq->error = hdr->error; + else + dreq->count += hdr->good_bytes; + spin_unlock(&dreq->lock); + + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct page *page = req->wb_page; + if (!PageCompound(page) && bytes < hdr->good_bytes) + set_page_dirty(page); + bytes += req->wb_bytes; + nfs_list_remove_request(req); + nfs_direct_readpage_release(req); + } +out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); + hdr->release(hdr); } -static const struct rpc_call_ops nfs_read_direct_ops = { - .rpc_call_done = nfs_direct_read_result, - .rpc_release = nfs_readdata_release, +static void nfs_read_sync_pgio_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_release_request(req); + } +} + +static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) +{ + get_dreq(hdr->dreq); +} + +static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { + .error_cleanup = nfs_read_sync_pgio_error, + .init_hdr = nfs_direct_pgio_init, + .completion = nfs_direct_read_completion, }; /* @@ -263,307 +414,272 @@ static const struct rpc_call_ops nfs_read_direct_ops = { * handled automatically by nfs_direct_read_result(). Otherwise, if * no requests have been sent, just return an error. 
*/ -static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, - const struct iovec *iov, - loff_t pos) -{ - struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->path.dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - size_t rsize = NFS_SERVER(inode)->rsize; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = ctx->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), - .rpc_message = &msg, - .callback_ops = &nfs_read_direct_ops, - .flags = RPC_TASK_ASYNC, - }; - unsigned int pgbase; - int result; - ssize_t started = 0; - - do { - struct nfs_read_data *data; - size_t bytes; - - pgbase = user_addr & ~PAGE_MASK; - bytes = min(rsize,count); - - result = -ENOMEM; - data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes)); - if (unlikely(!data)) - break; - - down_read(&current->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - data->npages, 1, 0, data->pagevec, NULL); - up_read(&current->mm->mmap_sem); - if (result < 0) { - nfs_readdata_release(data); - break; - } - if ((unsigned)result < data->npages) { - bytes = result * PAGE_SIZE; - if (bytes <= pgbase) { - nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_release(data); - break; - } - bytes -= pgbase; - data->npages = result; - } - - get_dreq(dreq); - - data->req = (struct nfs_page *) dreq; - data->inode = inode; - data->cred = msg.rpc_cred; - data->args.fh = NFS_FH(inode); - data->args.context = ctx; - data->args.offset = pos; - data->args.pgbase = pgbase; - data->args.pages = data->pagevec; - data->args.count = bytes; - data->res.fattr = &data->fattr; - data->res.eof = 0; - data->res.count = bytes; - msg.rpc_argp = &data->args; - msg.rpc_resp = &data->res; - - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - NFS_PROTO(inode)->read_setup(data, &msg); - - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) 
- rpc_put_task(task); - - dprintk("NFS: %5u initiated direct read call " - "(req %s/%Ld, %zu bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - bytes, - (unsigned long long)data->args.offset); - - started += bytes; - user_addr += bytes; - pos += bytes; - /* FIXME: Remove this unnecessary math from final patch */ - pgbase += bytes; - pgbase &= ~PAGE_MASK; - BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); - - count -= bytes; - } while (count != 0); - - if (started) - return started; - return result < 0 ? (ssize_t) result : -EFAULT; -} static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, - const struct iovec *iov, - unsigned long nr_segs, + struct iov_iter *iter, loff_t pos) { + struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; ssize_t result = -EINVAL; size_t requested_bytes = 0; - unsigned long seg; + size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); + nfs_pageio_init_read(&desc, dreq->inode, false, + &nfs_direct_read_completion_ops); get_dreq(dreq); + desc.pg_dreq = dreq; + atomic_inc(&inode->i_dio_count); + + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *vec = &iov[seg]; - result = nfs_direct_read_schedule_segment(dreq, vec, pos); + result = iov_iter_get_pages_alloc(iter, &pagevec, + rsize, &pgbase); if (result < 0) break; - requested_bytes += result; - if ((size_t)result < vec->iov_len) + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + /* XXX do we need to do the eof zeroing found in async_filler? 
*/ + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; + nfs_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + dreq->bytes_left -= req_len; + } + nfs_direct_release_pages(pagevec, npages); + kvfree(pagevec); + if (result < 0) break; - pos += vec->iov_len; } - if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_pageio_complete(&desc); - if (requested_bytes != 0) - return 0; + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + inode_dio_done(inode); + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } - if (result < 0) - return result; - return -EIO; + if (put_dreq(dreq)) + nfs_direct_complete(dreq, false); + return 0; } -static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iter: vector of user buffers into which to read data + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. 
+ * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, bool uio) { - ssize_t result = 0; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; + ssize_t result = -EINVAL; + size_t count = iov_iter_count(iter); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", + file, count, (long long) pos); + result = 0; + if (!count) + goto out; + + mutex_lock(&inode->i_mutex); + result = nfs_sync_mapping(mapping); + if (result) + goto out_unlock; + + task_io_account_read(count); + + result = -ENOMEM; dreq = nfs_direct_req_alloc(); - if (!dreq) - return -ENOMEM; + if (dreq == NULL) + goto out_unlock; dreq->inode = inode; + dreq->bytes_left = count; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); + goto out_release; + } + dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); - if (!result) + NFS_I(inode)->read_io += count; + result = nfs_direct_read_schedule_iovec(dreq, iter, pos); + + mutex_unlock(&inode->i_mutex); + + if (!result) { result = nfs_direct_wait(dreq); - nfs_direct_req_release(dreq); + if (result > 0) + iocb->ki_pos = pos + result; + } + nfs_direct_req_release(dreq); return result; -} -static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) -{ - while (!list_empty(&dreq->rewrite_list)) { - struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, 
struct nfs_write_data, pages); - list_del(&data->pages); - nfs_direct_release_pages(data->pagevec, data->npages); - nfs_writedata_release(data); - } +out_release: + nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); +out: + return result; } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { - struct inode *inode = dreq->inode; - struct list_head *p; - struct nfs_write_data *data; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = dreq->ctx->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), - .callback_ops = &nfs_write_direct_ops, - .flags = RPC_TASK_ASYNC, - }; + struct nfs_pageio_descriptor desc; + struct nfs_page *req, *tmp; + LIST_HEAD(reqs); + struct nfs_commit_info cinfo; + LIST_HEAD(failed); + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); + spin_lock(cinfo.lock); + nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0); + spin_unlock(cinfo.lock); dreq->count = 0; get_dreq(dreq); - list_for_each(p, &dreq->rewrite_list) { - data = list_entry(p, struct nfs_write_data, pages); - - get_dreq(dreq); - - /* Use stable writes */ - data->args.stable = NFS_FILE_SYNC; - - /* - * Reset data->res. - */ - nfs_fattr_init(&data->fattr); - data->res.count = data->args.count; - memset(&data->verf, 0, sizeof(data->verf)); - - /* - * Reuse data->task; data->args should not have changed - * since the original request was sent. - */ - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - msg.rpc_argp = &data->args; - msg.rpc_resp = &data->res; - NFS_PROTO(inode)->write_setup(data, &msg); - - /* - * We're called via an RPC callback, so BKL is already held. 
- */ - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); - - dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); + nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, + &nfs_direct_write_completion_ops); + desc.pg_dreq = dreq; + + list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + if (!nfs_pageio_add_request(&desc, req)) { + nfs_list_remove_request(req); + nfs_list_add_request(req, &failed); + spin_lock(cinfo.lock); + dreq->flags = 0; + dreq->error = -EIO; + spin_unlock(cinfo.lock); + } + nfs_release_request(req); + } + nfs_pageio_complete(&desc); + + while (!list_empty(&failed)) { + req = nfs_list_entry(failed.next); + nfs_list_remove_request(req); + nfs_unlock_and_release_request(req); } if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, inode); + nfs_direct_write_complete(dreq, dreq->inode); } -static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) +static void nfs_direct_commit_complete(struct nfs_commit_data *data) { - struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + struct nfs_direct_req *dreq = data->dreq; + struct nfs_commit_info cinfo; + struct nfs_page *req; + int status = data->task.tk_status; - /* Call the NFS version-specific code */ - if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) - return; - if (unlikely(task->tk_status < 0)) { + nfs_init_cinfo_from_dreq(&cinfo, dreq); + if (status < 0) { dprintk("NFS: %5u commit failed with error %d.\n", - task->tk_pid, task->tk_status); + data->task.tk_pid, status); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { - dprintk("NFS: %5u commit verify failed\n", task->tk_pid); + } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { + 
dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } - dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status); - nfs_direct_write_complete(dreq, data->inode); + dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { + /* Note the rewrite will go through mds */ + nfs_mark_request_commit(req, NULL, &cinfo); + } else + nfs_release_request(req); + nfs_unlock_and_release_request(req); + } + + if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) + nfs_direct_write_complete(dreq, data->inode); +} + +static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) +{ + /* There is no lock to clear */ } -static const struct rpc_call_ops nfs_commit_direct_ops = { - .rpc_call_done = nfs_direct_commit_result, - .rpc_release = nfs_commit_release, +static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { + .completion = nfs_direct_commit_complete, + .error_cleanup = nfs_direct_error_cleanup, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { - struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = dreq->ctx->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, - .rpc_client = NFS_CLIENT(dreq->inode), - .rpc_message = &msg, - .callback_ops = &nfs_commit_direct_ops, - .callback_data = data, - .flags = RPC_TASK_ASYNC, - }; - - data->inode = dreq->inode; - data->cred = msg.rpc_cred; - - data->args.fh = NFS_FH(data->inode); - data->args.offset = 0; - data->args.count = 0; - data->res.count = 0; - data->res.fattr = &data->fattr; - data->res.verf = &data->verf; - - NFS_PROTO(data->inode)->commit_setup(data, &msg); - - /* Note: task.tk_ops->rpc_release will free 
dreq->commit_data */ - dreq->commit_data = NULL; - - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); - - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + int res; + struct nfs_commit_info cinfo; + LIST_HEAD(mds_list); + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_scan_commit(dreq->inode, &mds_list, &cinfo); + res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo); + if (res < 0) /* res == -ENOMEM */ + nfs_direct_write_reschedule(dreq); } -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +static void nfs_direct_write_schedule_work(struct work_struct *work) { + struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work); int flags = dreq->flags; dreq->flags = 0; @@ -575,90 +691,111 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode nfs_direct_write_reschedule(dreq); break; default: - if (dreq->commit_data != NULL) - nfs_commit_free(dreq->commit_data); - nfs_direct_free_writedata(dreq); - nfs_zap_mapping(inode, inode->i_mapping); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } } -static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { - dreq->commit_data = nfs_commit_alloc(); - if (dreq->commit_data != NULL) - dreq->commit_data->req = (struct nfs_page *) dreq; + schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ } + #else -static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +static void nfs_direct_write_schedule_work(struct work_struct *work) { - dreq->commit_data = NULL; } static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { - nfs_direct_free_writedata(dreq); - nfs_zap_mapping(inode, inode->i_mapping); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } #endif -static void nfs_direct_write_result(struct rpc_task 
*task, void *calldata) +static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { - struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - int status = task->tk_status; + struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_commit_info cinfo; + int bit = -1; + struct nfs_page *req = nfs_list_entry(hdr->pages.next); - if (nfs_writeback_done(task, data) != 0) - return; + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out_put; + + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - if (unlikely(status < 0)) { - /* An error has occurred, so we should not commit */ + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { dreq->flags = 0; - dreq->error = status; + dreq->error = hdr->error; } - if (unlikely(dreq->error != 0)) - goto out_unlock; - - dreq->count += data->res.count; - - if (data->res.verf->committed != NFS_FILE_SYNC) { - switch (dreq->flags) { - case 0: - memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf)); + if (dreq->error != 0) + bit = NFS_IOHDR_ERROR; + else { + dreq->count += hdr->good_bytes; + if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + bit = NFS_IOHDR_NEED_RESCHED; + } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) + bit = NFS_IOHDR_NEED_RESCHED; + else if (dreq->flags == 0) { + nfs_direct_set_hdr_verf(dreq, hdr); + bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; - break; - case NFS_ODIRECT_DO_COMMIT: - if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { - dprintk("NFS: %5u write verify failed\n", task->tk_pid); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } + } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { + if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { + dreq->flags = + NFS_ODIRECT_RESCHED_WRITES; + bit = NFS_IOHDR_NEED_RESCHED; + } else + bit = NFS_IOHDR_NEED_COMMIT; + } } } -out_unlock: spin_unlock(&dreq->lock); + + while 
(!list_empty(&hdr->pages)) { + + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + switch (bit) { + case NFS_IOHDR_NEED_RESCHED: + case NFS_IOHDR_NEED_COMMIT: + kref_get(&req->wb_kref); + nfs_mark_request_commit(req, hdr->lseg, &cinfo); + } + nfs_unlock_and_release_request(req); + } + +out_put: + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, hdr->inode); + hdr->release(hdr); } -/* - * NB: Return the value of the first error return code. Subsequent - * errors after the first one are ignored. - */ -static void nfs_direct_write_release(void *calldata) +static void nfs_write_sync_pgio_error(struct list_head *head) { - struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + struct nfs_page *req; - if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, data->inode); + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_unlock_and_release_request(req); + } } -static const struct rpc_call_ops nfs_write_direct_ops = { - .rpc_call_done = nfs_direct_write_result, - .rpc_release = nfs_direct_write_release, +static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { + .error_cleanup = nfs_write_sync_pgio_error, + .init_hdr = nfs_direct_pgio_init, + .completion = nfs_direct_write_completion, }; + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ /* * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE * operation. If nfs_writedata_alloc() or get_user_pages() fails, @@ -666,236 +803,87 @@ static const struct rpc_call_ops nfs_write_direct_ops = { * handled automatically by nfs_direct_write_result(). Otherwise, if * no requests have been sent, just return an error. 
*/ -static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, - const struct iovec *iov, - loff_t pos, int sync) -{ - struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->path.dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = ctx->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), - .rpc_message = &msg, - .callback_ops = &nfs_write_direct_ops, - .flags = RPC_TASK_ASYNC, - }; - size_t wsize = NFS_SERVER(inode)->wsize; - unsigned int pgbase; - int result; - ssize_t started = 0; - - do { - struct nfs_write_data *data; - size_t bytes; - - pgbase = user_addr & ~PAGE_MASK; - bytes = min(wsize,count); - - result = -ENOMEM; - data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes)); - if (unlikely(!data)) - break; - - down_read(&current->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - data->npages, 0, 0, data->pagevec, NULL); - up_read(&current->mm->mmap_sem); - if (result < 0) { - nfs_writedata_release(data); - break; - } - if ((unsigned)result < data->npages) { - bytes = result * PAGE_SIZE; - if (bytes <= pgbase) { - nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_release(data); - break; - } - bytes -= pgbase; - data->npages = result; - } - - get_dreq(dreq); - - list_move_tail(&data->pages, &dreq->rewrite_list); - - data->req = (struct nfs_page *) dreq; - data->inode = inode; - data->cred = msg.rpc_cred; - data->args.fh = NFS_FH(inode); - data->args.context = ctx; - data->args.offset = pos; - data->args.pgbase = pgbase; - data->args.pages = data->pagevec; - data->args.count = bytes; - data->args.stable = sync; - data->res.fattr = &data->fattr; - data->res.count = bytes; - data->res.verf = &data->verf; - - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - msg.rpc_argp = &data->args; - msg.rpc_resp = 
&data->res; - NFS_PROTO(inode)->write_setup(data, &msg); - - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); - - dprintk("NFS: %5u initiated direct write call " - "(req %s/%Ld, %zu bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - bytes, - (unsigned long long)data->args.offset); - - started += bytes; - user_addr += bytes; - pos += bytes; - - /* FIXME: Remove this useless math from the final patch */ - pgbase += bytes; - pgbase &= ~PAGE_MASK; - BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); - - count -= bytes; - } while (count != 0); - - if (started) - return started; - return result < 0 ? (ssize_t) result : -EFAULT; -} - static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos, int sync) + struct iov_iter *iter, + loff_t pos) { + struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; ssize_t result = 0; size_t requested_bytes = 0; - unsigned long seg; + size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, + &nfs_direct_write_completion_ops); + desc.pg_dreq = dreq; get_dreq(dreq); + atomic_inc(&inode->i_dio_count); - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *vec = &iov[seg]; - result = nfs_direct_write_schedule_segment(dreq, vec, - pos, sync); + NFS_I(inode)->write_io += iov_iter_count(iter); + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + + result = iov_iter_get_pages_alloc(iter, &pagevec, + wsize, &pgbase); if (result < 0) break; - requested_bytes += result; - if ((size_t)result < vec->iov_len) + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); 
+ + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + nfs_lock_request(req); + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + dreq->bytes_left -= req_len; + } + nfs_direct_release_pages(pagevec, npages); + kvfree(pagevec); + if (result < 0) break; - pos += vec->iov_len; + } + nfs_pageio_complete(&desc); + + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + inode_dio_done(inode); + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; } if (put_dreq(dreq)) nfs_direct_write_complete(dreq, dreq->inode); - - if (requested_bytes != 0) - return 0; - - if (result < 0) - return result; - return -EIO; -} - -static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - size_t count) -{ - ssize_t result = 0; - struct inode *inode = iocb->ki_filp->f_mapping->host; - struct nfs_direct_req *dreq; - size_t wsize = NFS_SERVER(inode)->wsize; - int sync = NFS_UNSTABLE; - - dreq = nfs_direct_req_alloc(); - if (!dreq) - return -ENOMEM; - nfs_alloc_commit_data(dreq); - - if (dreq->commit_data == NULL || count < wsize) - sync = NFS_FILE_SYNC; - - dreq->inode = inode; - dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - if (!is_sync_kiocb(iocb)) - dreq->iocb = iocb; - - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); - if (!result) - result = nfs_direct_wait(dreq); - nfs_direct_req_release(dreq); - - return result; -} - -/** - * nfs_file_direct_read - file direct read operation for NFS files - * @iocb: target I/O control block - * @iov: vector of user buffers into which to 
read data - * @nr_segs: size of iov vector - * @pos: byte offset in file where reading starts - * - * We use this function for direct reads instead of calling - * generic_file_aio_read() in order to avoid gfar's check to see if - * the request starts before the end of the file. For that check - * to work, we must generate a GETATTR before each direct read, and - * even then there is a window between the GETATTR and the subsequent - * READ where the file size could change. Our preference is simply - * to do all reads the application wants, and the server will take - * care of managing the end of file boundary. - * - * This function also eliminates unnecessarily updating the file's - * atime locally, as the NFS server sets the file's atime, and this - * client must read the updated atime from the server back into its - * cache. - */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t retval = -EINVAL; - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - size_t count; - - count = iov_length(iov, nr_segs); - nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - - dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n", - file->f_path.dentry->d_parent->d_name.name, - file->f_path.dentry->d_name.name, - count, (long long) pos); - - retval = 0; - if (!count) - goto out; - - retval = nfs_sync_mapping(mapping); - if (retval) - goto out; - - retval = nfs_direct_read(iocb, iov, nr_segs, pos); - if (retval > 0) - iocb->ki_pos = pos + retval; - -out: - return retval; + return 0; } /** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block - * @iov: vector of user buffers from which to write data - * @nr_segs: size of iov vector + * @iter: vector of user buffers from which to write data * @pos: byte offset in file where writing starts * * We use this function for direct writes instead of calling @@ -905,9 +893,6 @@ out: * back 
into its cache. We let the server do generic write * parameter checking and report problems. * - * We also avoid an unnecessary invocation of generic_osync_inode(), - * as it is fairly meaningless to sync the metadata of an NFS file. - * * We eliminate local atime updates, see direct read above. * * We avoid unnecessary page cache invalidations for normal cached @@ -916,44 +901,97 @@ out: * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, bool uio) { - ssize_t retval = -EINVAL; + ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - size_t count; + struct inode *inode = mapping->host; + struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; + loff_t end; + size_t count = iov_iter_count(iter); + end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - count = iov_length(iov, nr_segs); nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); - dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n", - file->f_path.dentry->d_parent->d_name.name, - file->f_path.dentry->d_name.name, - count, (long long) pos); + dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", + file, count, (long long) pos); - retval = generic_write_checks(file, &pos, &count, 0); - if (retval) + result = generic_write_checks(file, &pos, &count, 0); + if (result) goto out; - retval = -EINVAL; + result = -EINVAL; if ((ssize_t) count < 0) goto out; - retval = 0; + result = 0; if (!count) goto out; - retval = nfs_sync_mapping(mapping); - if (retval) - goto out; + mutex_lock(&inode->i_mutex); + + result = nfs_sync_mapping(mapping); + if (result) + goto out_unlock; + + if (mapping->nrpages) { + result = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, 
end); + if (result) + goto out_unlock; + } + + task_io_account_write(count); + + result = -ENOMEM; + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out_unlock; + + dreq->inode = inode; + dreq->bytes_left = count; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); + goto out_release; + } + dreq->l_ctx = l_ctx; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; - retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); + + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } - if (retval > 0) - iocb->ki_pos = pos + retval; + mutex_unlock(&inode->i_mutex); + if (!result) { + result = nfs_direct_wait(dreq); + if (result > 0) { + struct inode *inode = mapping->host; + + iocb->ki_pos = pos + result; + spin_lock(&inode->i_lock); + if (i_size_read(inode) < iocb->ki_pos) + i_size_write(inode, iocb->ki_pos); + spin_unlock(&inode->i_lock); + } + } + nfs_direct_req_release(dreq); + return result; + +out_release: + nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: - return retval; + return result; } /** diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 00000000000..d25f10fb492 --- /dev/null +++ b/fs/nfs/dns_resolve.c @@ -0,0 +1,470 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + * + * Resolves DNS hostnames into valid ip addresses + */ + +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include <linux/module.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> +#include <linux/dns_resolver.h> +#include "dns_resolve.h" + +ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + char *ip_addr = NULL; + int ip_len; + + ip_len = dns_query(NULL, name, 
namelen, NULL, &ip_addr, NULL); + if (ip_len > 0) + ret = rpc_pton(net, ip_addr, ip_len, sa, salen); + else + ret = -ESRCH; + kfree(ip_addr); + return ret; +} + +#else + +#include <linux/module.h> +#include <linux/hash.h> +#include <linux/string.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/seq_file.h> +#include <linux/inet.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/svcauth.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/nfs_fs.h> + +#include "nfs4_fs.h" +#include "dns_resolve.h" +#include "cache_lib.h" +#include "netns.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_update(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; +} + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + nfs_dns_ent_update(cnew, ckey); + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent 
*item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = item->h.expiry_time - seconds_since_boot(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, "<none> "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + 
nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned int ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(cd->net, buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + if (get_uint(&buf, &ttl) < 0) + goto out; + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + seconds_since_boot(); + + ret = -ENOMEM; + item = nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if 
(!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < seconds_since_boot() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(struct net *net, char *name, + size_t namelen, struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, nn->nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +static struct cache_detail nfs_dns_resolve_template = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_request = nfs_dns_request, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_update, + .alloc = nfs_dns_ent_alloc, +}; + + +int nfs_dns_resolver_cache_init(struct net *net) +{ + int err; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nn->nfs_dns_resolve = 
cache_create_net(&nfs_dns_resolve_template, net); + if (IS_ERR(nn->nfs_dns_resolve)) + return PTR_ERR(nn->nfs_dns_resolve); + + err = nfs_cache_register_net(net, nn->nfs_dns_resolve); + if (err) + goto err_reg; + return 0; + +err_reg: + cache_destroy_net(nn->nfs_dns_resolve, net); + return err; +} + +void nfs_dns_resolver_cache_destroy(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs_cache_unregister_net(net, nn->nfs_dns_resolve); + cache_destroy_net(nn->nfs_dns_resolve, net); +} + +static int nfs4_dns_net_init(struct net *net) +{ + return nfs_dns_resolver_cache_init(net); +} + +static void nfs4_dns_net_exit(struct net *net) +{ + nfs_dns_resolver_cache_destroy(net); +} + +static struct pernet_operations nfs4_dns_resolver_ops = { + .init = nfs4_dns_net_init, + .exit = nfs4_dns_net_exit, +}; + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct cache_detail *cd = nn->nfs_dns_resolve; + int ret = 0; + + if (cd == NULL) + return 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + switch (event) { + case RPC_PIPEFS_MOUNT: + ret = nfs_cache_register_sb(sb, cd); + break; + case RPC_PIPEFS_UMOUNT: + nfs_cache_unregister_sb(sb, cd); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs_dns_resolver_block = { + .notifier_call = rpc_pipefs_event, +}; + +int nfs_dns_resolver_init(void) +{ + int err; + + err = register_pernet_subsys(&nfs4_dns_resolver_ops); + if (err < 0) + goto out; + err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block); + if (err < 0) + goto out1; + return 0; +out1: + unregister_pernet_subsys(&nfs4_dns_resolver_ops); +out: + return err; +} + +void nfs_dns_resolver_destroy(void) +{ + rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); + 
unregister_pernet_subsys(&nfs4_dns_resolver_ops); +} +#endif diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 00000000000..2e4f596d292 --- /dev/null +++ b/fs/nfs/dns_resolve.h @@ -0,0 +1,36 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + + +#ifdef CONFIG_NFS_USE_KERNEL_DNS +static inline int nfs_dns_resolver_init(void) +{ + return 0; +} + +static inline void nfs_dns_resolver_destroy(void) +{} + +static inline int nfs_dns_resolver_cache_init(struct net *net) +{ + return 0; +} + +static inline void nfs_dns_resolver_cache_destroy(struct net *net) +{} +#else +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +extern int nfs_dns_resolver_cache_init(struct net *net); +extern void nfs_dns_resolver_cache_destroy(struct net *net); +#endif + +extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, + size_t namelen, struct sockaddr *sa, size_t salen); + +#endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index ef57a5ae590..4042ff58fe3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -16,6 +16,7 @@ * nfs regular file handling functions */ +#include <linux/module.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -24,88 +25,37 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/mm.h> -#include <linux/slab.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/aio.h> +#include <linux/gfp.h> +#include <linux/swap.h> #include <asm/uaccess.h> -#include <asm/system.h> #include "delegation.h" #include "internal.h" #include "iostat.h" +#include "fscache.h" -#define NFSDBG_FACILITY NFSDBG_FILE - -static int nfs_file_open(struct inode *, struct file *); -static int nfs_file_release(struct inode *, struct file *); -static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); -static int 
nfs_file_mmap(struct file *, struct vm_area_struct *); -static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, unsigned int flags); -static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); -static int nfs_check_flags(int flags); -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); - -static struct vm_operations_struct nfs_file_vm_ops; - -const struct file_operations nfs_file_operations = { - .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, - .mmap = nfs_file_mmap, - .open = nfs_file_open, - .flush = nfs_file_flush, - .release = nfs_file_release, - .fsync = nfs_fsync, - .lock = nfs_lock, - .flock = nfs_flock, - .splice_read = nfs_file_splice_read, - .check_flags = nfs_check_flags, - .setlease = nfs_setlease, -}; +#include "nfstrace.h" -const struct inode_operations nfs_file_inode_operations = { - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, -}; +#define NFSDBG_FACILITY NFSDBG_FILE -#ifdef CONFIG_NFS_V3 -const struct inode_operations nfs3_file_inode_operations = { - .permission = nfs_permission, - .getattr = nfs_getattr, - .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, -}; -#endif /* CONFIG_NFS_v3 */ +static const struct vm_operations_struct nfs_file_vm_ops; /* Hack for future NFS swap support */ #ifndef IS_SWAPFILE # define 
IS_SWAPFILE(inode) (0) #endif -static int nfs_check_flags(int flags) +int nfs_check_flags(int flags) { if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(nfs_check_flags); /* * Open file @@ -115,26 +65,26 @@ nfs_file_open(struct inode *inode, struct file *filp) { int res; + dprintk("NFS: open file(%pD2)\n", filp); + + nfs_inc_stats(inode, NFSIOS_VFSOPEN); res = nfs_check_flags(filp->f_flags); if (res) return res; - nfs_inc_stats(inode, NFSIOS_VFSOPEN); - lock_kernel(); - res = NFS_PROTO(inode)->file_open(inode, filp); - unlock_kernel(); + res = nfs_open(inode, filp); return res; } -static int +int nfs_file_release(struct inode *inode, struct file *filp) { - /* Ensure that dirty pages are flushed out with the right creds */ - if (filp->f_mode & FMODE_WRITE) - nfs_wb_all(filp->f_path.dentry->d_inode); + dprintk("NFS: release(%pD2)\n", filp); + nfs_inc_stats(inode, NFSIOS_VFSRELEASE); - return NFS_PROTO(inode)->file_release(inode, filp); + return nfs_release(inode, filp); } +EXPORT_SYMBOL_GPL(nfs_file_release); /** * nfs_revalidate_size - Revalidate the file size @@ -152,157 +102,238 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); - if (server->flags & NFS_MOUNT_NOAC) - goto force_reval; + if (nfs_have_delegated_attributes(inode)) + goto out_noreval; + if (filp->f_flags & O_DIRECT) goto force_reval; - if (nfsi->npages != 0) - return 0; - if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) - return 0; + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + goto force_reval; + if (nfs_attribute_timeout(inode)) + goto force_reval; +out_noreval: + return 0; force_reval: return __nfs_revalidate_inode(server, inode); } -static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) +loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence) { - 
/* origin == SEEK_END => we must revalidate the cached file length */ - if (origin == SEEK_END) { + dprintk("NFS: llseek file(%pD2, %lld, %d)\n", + filp, offset, whence); + + /* + * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate + * the cached file length + */ + if (whence != SEEK_SET && whence != SEEK_CUR) { struct inode *inode = filp->f_mapping->host; + int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; } - return remote_llseek(filp, offset, origin); -} -/* - * Helper for nfs_file_flush() and nfs_fsync() - * - * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to - * disk, but it retrieves and clears ctx->error after synching, despite - * the two being set at the same time in nfs_context_set_write_error(). - * This is because the former is used to notify the _next_ call to - * nfs_file_write() that a write error occured, and hence cause it to - * fall back to doing a synchronous write. - */ -static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) -{ - int have_error, status; - int ret = 0; - - have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - status = nfs_wb_all(inode); - have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) - ret = xchg(&ctx->error, 0); - if (!ret) - ret = status; - return ret; + return generic_file_llseek(filp, offset, whence); } +EXPORT_SYMBOL_GPL(nfs_file_llseek); /* * Flush all dirty pages, and check for write errors. 
- * */ -static int +int nfs_file_flush(struct file *file, fl_owner_t id) { - struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = file->f_path.dentry->d_inode; - int status; + struct inode *inode = file_inode(file); - dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + dprintk("NFS: flush(%pD2)\n", file); + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); if ((file->f_mode & FMODE_WRITE) == 0) return 0; - nfs_inc_stats(inode, NFSIOS_VFSFLUSH); - /* Ensure that data+attribute caches are up to date after close() */ - status = nfs_do_fsync(ctx, inode); - if (!status) - nfs_revalidate_inode(NFS_SERVER(inode), inode); - return status; + /* + * If we're holding a write delegation, then just start the i/o + * but don't wait for completion (or send a commit). + */ + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + return filemap_fdatawrite(file->f_mapping); + + /* Flush writes to the server and return any errors */ + return vfs_fsync(file, 0); } +EXPORT_SYMBOL_GPL(nfs_file_flush); -static ssize_t -nfs_file_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t +nfs_file_read(struct kiocb *iocb, struct iov_iter *to) { - struct dentry * dentry = iocb->ki_filp->f_path.dentry; - struct inode * inode = dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; - size_t count = iov_length(iov, nr_segs); -#ifdef CONFIG_NFS_DIRECTIO if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_read(iocb, iov, nr_segs, pos); -#endif + return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); - dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + dprintk("NFS: read(%pD2, %zu@%lu)\n", + iocb->ki_filp, + iov_iter_count(to), (unsigned long) iocb->ki_pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); - nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, 
count); - if (!result) - result = generic_file_aio_read(iocb, iov, nr_segs, pos); + if (!result) { + result = generic_file_read_iter(iocb, to); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } return result; } +EXPORT_SYMBOL_GPL(nfs_file_read); -static ssize_t +ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, unsigned int flags) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(filp); ssize_t res; - dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) count, (unsigned long long) *ppos); + dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", + filp, (unsigned long) count, (unsigned long long) *ppos); res = nfs_revalidate_mapping(inode, filp->f_mapping); - if (!res) + if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); + if (res > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); + } return res; } +EXPORT_SYMBOL_GPL(nfs_file_splice_read); -static int +int nfs_file_mmap(struct file * file, struct vm_area_struct * vma) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(file); int status; - dfprintk(VFS, "nfs: mmap(%s/%s)\n", - dentry->d_parent->d_name.name, dentry->d_name.name); + dprintk("NFS: mmap(%pD2)\n", file); - status = nfs_revalidate_mapping(inode, file->f_mapping); + /* Note: generic_file_mmap() returns ENOSYS on nommu systems + * so we call that before revalidating the mapping + */ + status = generic_file_mmap(file, vma); if (!status) { vma->vm_ops = &nfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; - file_accessed(file); + status = nfs_revalidate_mapping(inode, file->f_mapping); } return status; } +EXPORT_SYMBOL_GPL(nfs_file_mmap); /* * Flush any dirty pages for this process, and check for write errors. 
* The return status from this call provides a reliable indication of * whether any write errors occurred for this process. + * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occurred, and hence cause it to + * fall back to doing a synchronous write. */ -static int -nfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int +nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(file); + int have_error, do_resend, status; + int ret = 0; - dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - return nfs_do_fsync(ctx, inode); + do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_commit_inode(inode, FLUSH_SYNC); + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) { + ret = xchg(&ctx->error, 0); + if (ret) + goto out; + } + if (status < 0) { + ret = status; + goto out; + } + do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + if (do_resend) + ret = -EAGAIN; +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); + +static int +nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + int ret; + struct inode *inode = file_inode(file); + + trace_nfs_fsync_enter(inode); + + do { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret != 0) + break; + mutex_lock(&inode->i_mutex); + ret = 
nfs_file_fsync_commit(file, start, end, datasync); + mutex_unlock(&inode->i_mutex); + /* + * If nfs_file_fsync_commit detected a server reboot, then + * resend all dirty pages that might have been covered by + * the NFS_CONTEXT_RESEND_WRITES flag + */ + start = 0; + end = LLONG_MAX; + } while (ret == -EAGAIN); + + trace_nfs_fsync_exit(inode, ret); + return ret; +} + +/* + * Decide whether a read/modify/write cycle may be more efficient + * then a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? 
*/ + return 1; + return 0; } /* @@ -318,11 +349,24 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret; - pgoff_t index; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; - index = pos >> PAGE_CACHE_SHIFT; + int once_thru = 0; + + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", + file, mapping->host->i_ino, len, (long long) pos); - page = __grab_cache_page(mapping, index); +start: + /* + * Prevent starvation issues if someone is doing a consistency + * sync-to-disk + */ + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) + return ret; + + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; @@ -331,6 +375,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, if (ret) { unlock_page(page); page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; } return ret; } @@ -340,39 +391,164 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct nfs_open_context *ctx = nfs_file_open_context(file); int status; - lock_kernel(); + dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", + file, mapping->host->i_ino, len, (long long) pos); + + /* + * Zero any uninitialised parts of the page, and then mark the page + * as up to date if it turns out that we're extending the file. 
+ */ + if (!PageUptodate(page)) { + unsigned pglen = nfs_page_length(page); + unsigned end = offset + len; + + if (pglen == 0) { + zero_user_segments(page, 0, offset, + end, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } else if (end >= pglen) { + zero_user_segment(page, end, PAGE_CACHE_SIZE); + if (offset == 0) + SetPageUptodate(page); + } else + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + status = nfs_updatepage(file, page, offset, copied); - unlock_kernel(); unlock_page(page); page_cache_release(page); if (status < 0) return status; + NFS_I(mapping->host)->write_io += copied; + + if (nfs_ctx_key_to_expire(ctx)) { + status = nfs_wb_all(mapping->host); + if (status < 0) + return status; + } + return copied; } -static void nfs_invalidate_page(struct page *page, unsigned long offset) +/* + * Partially or wholly invalidate a page + * - Release the private state associated with a page if undergoing complete + * page invalidation + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + */ +static void nfs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { - if (offset != 0) + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", + page, offset, length); + + if (offset != 0 || length < PAGE_CACHE_SIZE) return; /* Cancel any unstarted writes on this page */ - nfs_wb_page_cancel(page->mapping->host, page); + nfs_wb_page_cancel(page_file_mapping(page)->host, page); + + nfs_fscache_invalidate_page(page, page->mapping->host); } +/* + * Attempt to release the private state associated with a page + * - Called if either PG_private or PG_fscache is set on the page + * - Caller holds page lock + * - Return true (may release page) or false (may not) + */ static int nfs_release_page(struct page *page, gfp_t gfp) { + struct address_space *mapping = page->mapping; + + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + + /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not + * doing 
this memory reclaim for a fs-related allocation. + */ + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && + !(current->flags & PF_FSTRANS)) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } /* If PagePrivate() is set, then the page is not freeable */ - return 0; + if (PagePrivate(page)) + return 0; + return nfs_fscache_release_page(page, gfp); } +static void nfs_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct nfs_inode *nfsi; + struct address_space *mapping = page_file_mapping(page); + + if (!mapping || PageSwapCache(page)) + return; + + /* + * Check if an unstable page is currently being committed and + * if so, have the VM treat it as if the page is under writeback + * so it will not block due to pages that will shortly be freeable. + */ + nfsi = NFS_I(mapping->host); + if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + *writeback = true; + return; + } + + /* + * If PagePrivate() is set, then the page is not freeable and as the + * inode is not being committed, it's not going to be cleaned in the + * near future so treat it as dirty + */ + if (PagePrivate(page)) + *dirty = true; +} + +/* + * Attempt to clear the private state associated with a page when an error + * occurs that requires the cached contents of an inode to be written back or + * destroyed + * - Called if either PG_private or fscache is set on the page + * - Caller holds page lock + * - Return 0 if successful, -error otherwise + */ static int nfs_launder_page(struct page *page) { - return nfs_wb_page(page->mapping->host, page); + struct inode *inode = page_file_mapping(page)->host; + struct nfs_inode *nfsi = NFS_I(inode); + + dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", + inode->i_ino, (long long)page_offset(page)); + + nfs_fscache_wait_on_page_write(nfsi, page); + return nfs_wb_page(inode, page); +} + +#ifdef CONFIG_NFS_SWAP +static int 
nfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + *span = sis->pages; + return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); } +static void nfs_swap_deactivate(struct file *file) +{ + xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); +} +#endif + const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, @@ -383,74 +559,99 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, -#ifdef CONFIG_NFS_DIRECTIO .direct_IO = nfs_direct_IO, -#endif + .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, + .is_dirty_writeback = nfs_check_dirty_writeback, + .error_remove_page = generic_error_remove_page, +#ifdef CONFIG_NFS_SWAP + .swap_activate = nfs_swap_activate, + .swap_deactivate = nfs_swap_deactivate, +#endif }; -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +/* + * Notification that a PTE pointing to an NFS page is about to be made + * writable, implying that someone is about to modify the page through a + * shared-writable mapping + */ +static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { + struct page *page = vmf->page; struct file *filp = vma->vm_file; + struct inode *inode = file_inode(filp); unsigned pagelen; - int ret = -EINVAL; + int ret = VM_FAULT_NOPAGE; struct address_space *mapping; + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", + filp, filp->f_mapping->host->i_ino, + (long long)page_offset(page)); + + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(inode), page); + lock_page(page); - mapping = page->mapping; - if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) + mapping = page_file_mapping(page); + if (mapping != inode->i_mapping) goto out_unlock; - ret = 0; + 
wait_on_page_writeback(page); + pagelen = nfs_page_length(page); if (pagelen == 0) goto out_unlock; - ret = nfs_flush_incompatible(filp, page); - if (ret != 0) - goto out_unlock; + ret = VM_FAULT_LOCKED; + if (nfs_flush_incompatible(filp, page) == 0 && + nfs_updatepage(filp, page, 0, pagelen) == 0) + goto out; - ret = nfs_updatepage(filp, page, 0, pagelen); - if (ret == 0) - ret = pagelen; + ret = VM_FAULT_SIGBUS; out_unlock: unlock_page(page); +out: return ret; } -static struct vm_operations_struct nfs_file_vm_ops = { +static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) { struct nfs_open_context *ctx; - if (IS_SYNC(inode) || (filp->f_flags & O_SYNC)) + if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) return 1; ctx = nfs_file_open_context(filp); - if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || + nfs_ctx_key_to_expire(ctx)) return 1; return 0; } -static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { - struct dentry * dentry = iocb->ki_filp->f_path.dentry; - struct inode * inode = dentry->d_inode; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + unsigned long written = 0; ssize_t result; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; -#ifdef CONFIG_NFS_DIRECTIO - if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, iov, nr_segs, pos); -#endif + result = nfs_key_timeout_notify(file, inode); + if (result) + return result; + + if (file->f_flags & O_DIRECT) + return nfs_file_direct_write(iocb, from, pos, true); - dfprintk(VFS, "nfs: write(%s/%s(%ld), 
%lu@%Ld)\n", - dentry->d_parent->d_name.name, dentry->d_name.name, - inode->i_ino, (unsigned long) count, (long long) pos); + dprintk("NFS: write(%pD2, %zu@%Ld)\n", + file, count, (long long) pos); result = -EBUSY; if (IS_SWAPFILE(inode)) @@ -458,8 +659,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, /* * O_APPEND implies that we must revalidate the file length. */ - if (iocb->ki_filp->f_flags & O_APPEND) { - result = nfs_revalidate_file_size(inode, iocb->ki_filp); + if (file->f_flags & O_APPEND) { + result = nfs_revalidate_file_size(inode, file); if (result) goto out; } @@ -468,14 +669,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); - result = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* Return error values for O_SYNC and IS_SYNC() */ - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { - int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); + result = generic_file_write_iter(iocb, from); + if (result > 0) + written = result; + + /* Return error values for O_DSYNC and IS_SYNC() */ + if (result >= 0 && nfs_need_sync_write(file, inode)) { + int err = vfs_fsync(file, 0); if (err < 0) result = err; } + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); out: return result; @@ -483,29 +688,31 @@ out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); goto out; } +EXPORT_SYMBOL_GPL(nfs_file_write); -static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) +static int +do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; + unsigned int saved_type = fl->fl_type; - lock_kernel(); /* Try local locking first */ posix_test_lock(filp, fl); if (fl->fl_type != F_UNLCK) { /* found a conflict */ goto out; } + fl->fl_type = saved_type; - if 
(nfs_have_delegation(inode, FMODE_READ)) + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) goto out_noconflict; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) + if (is_local) goto out_noconflict; status = NFS_PROTO(inode)->lock(filp, cmd, fl); out: - unlock_kernel(); return status; out_noconflict: fl->fl_type = F_UNLCK; @@ -525,16 +732,14 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) default: BUG(); } - if (res < 0) - dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager" - " - error %d!\n", - __FUNCTION__, res); return res; } -static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) +static int +do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; + struct nfs_lock_context *l_ctx; int status; /* @@ -543,21 +748,36 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) */ nfs_sync_mapping(filp->f_mapping); + l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); + if (!IS_ERR(l_ctx)) { + status = nfs_iocounter_wait(&l_ctx->io_count); + nfs_put_lock_context(l_ctx); + if (status < 0) + return status; + } + /* NOTE: special case * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. 
*/ - lock_kernel(); - /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else status = do_vfs_lock(filp, fl); - unlock_kernel(); return status; } -static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) +static int +is_time_granular(struct timespec *ts) { + return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); +} + +static int +do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status; @@ -570,29 +790,31 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) if (status != 0) goto out; - lock_kernel(); - /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) { + /* + * Use local locking if mounted with "-onolock" or with appropriate + * "-olocal_lock=" + */ + if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); - /* If we were signalled we still need to ensure that - * we clean up any state on the server. We therefore - * record the lock call as having succeeded in order to - * ensure that locks_remove_posix() cleans it out when - * the process exits. - */ - if (status == -EINTR || status == -ERESTARTSYS) - do_vfs_lock(filp, fl); - } else + else status = do_vfs_lock(filp, fl); - unlock_kernel(); if (status < 0) goto out; + /* - * Make sure we clear the cache whenever we try to get the lock. + * Revalidate the cache if the server has time stamps granular + * enough to detect subsecond changes. Otherwise, clear the + * cache to prevent missing any changes. + * * This makes locking act as a cache coherency point. 
*/ nfs_sync_mapping(filp->f_mapping); - nfs_zap_caches(inode); + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { + if (is_time_granular(&NFS_SERVER(inode)->time_delta)) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + else + nfs_zap_caches(inode); + } out: return status; } @@ -600,63 +822,102 @@ out: /* * Lock a (portion of) a file */ -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) +int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; + struct inode *inode = filp->f_mapping->host; + int ret = -ENOLCK; + int is_local = 0; - dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", - inode->i_sb->s_id, inode->i_ino, - fl->fl_type, fl->fl_flags, + dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n", + filp, fl->fl_type, fl->fl_flags, (long long)fl->fl_start, (long long)fl->fl_end); + nfs_inc_stats(inode, NFSIOS_VFSLOCK); /* No mandatory locks over NFS */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; + goto out_err; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) + is_local = 1; + + if (NFS_PROTO(inode)->lock_check_bounds != NULL) { + ret = NFS_PROTO(inode)->lock_check_bounds(fl); + if (ret < 0) + goto out_err; + } if (IS_GETLK(cmd)) - return do_getlk(filp, cmd, fl); - if (fl->fl_type == F_UNLCK) - return do_unlk(filp, cmd, fl); - return do_setlk(filp, cmd, fl); + ret = do_getlk(filp, cmd, fl, is_local); + else if (fl->fl_type == F_UNLCK) + ret = do_unlk(filp, cmd, fl, is_local); + else + ret = do_setlk(filp, cmd, fl, is_local); +out_err: + return ret; } +EXPORT_SYMBOL_GPL(nfs_lock); /* * Lock a (portion of) a file */ -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) +int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - filp->f_path.dentry->d_inode->i_sb->s_id, - filp->f_path.dentry->d_inode->i_ino, - fl->fl_type, fl->fl_flags); + struct 
inode *inode = filp->f_mapping->host; + int is_local = 0; + + dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n", + filp, fl->fl_type, fl->fl_flags); - /* - * No BSD flocks over NFS allowed. - * Note: we could try to fake a POSIX lock request here by - * using ((u32) filp | 0x80000000) or some such as the pid. - * Not sure whether that would be unique, though, or whether - * that would break in other places. - */ if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - /* We're simulating flock() locks using posix locks on the server */ - fl->fl_owner = (fl_owner_t)filp; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; + /* + * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of + * any standard. In principle we might be able to support LOCK_MAND + * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the + * NFS code is not set up for it. + */ + if (fl->fl_type & LOCK_MAND) + return -EINVAL; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) + is_local = 1; + /* We're simulating flock() locks using posix locks on the server */ if (fl->fl_type == F_UNLCK) - return do_unlk(filp, cmd, fl); - return do_setlk(filp, cmd, fl); + return do_unlk(filp, cmd, fl, is_local); + return do_setlk(filp, cmd, fl, is_local); } +EXPORT_SYMBOL_GPL(nfs_flock); -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) +/* + * There is no protocol support for leases, so we have no way to implement + * them correctly in the face of opens by other clients. + */ +int nfs_setlease(struct file *file, long arg, struct file_lock **fl) { - /* - * There is no protocol support for leases, so we have no way - * to implement them correctly in the face of opens by other - * clients. 
- */ + dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg); return -EINVAL; } +EXPORT_SYMBOL_GPL(nfs_setlease); + +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = nfs_file_read, + .write_iter = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = iter_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; +EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile new file mode 100644 index 00000000000..8516cdffb9e --- /dev/null +++ b/fs/nfs/filelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Files Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c new file mode 100644 index 00000000000..d2eba1c13b7 --- /dev/null +++ b/fs/nfs/filelayout/filelayout.c @@ -0,0 +1,1406 @@ +/* + * Module for the pnfs nfs4 file layout driver. + * Defines all I/O and Policy interface operations, plus code + * to register itself with the pNFS client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. 
If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/module.h> + +#include <linux/sunrpc/metrics.h> + +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "filelayout.h" +#include "../nfs4trace.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ) + +static loff_t +filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, + loff_t offset) +{ + u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; + u64 stripe_no; + u32 rem; + + offset -= flseg->pattern_offset; + stripe_no = div_u64(offset, stripe_width); + div_u64_rem(offset, flseg->stripe_unit, &rem); + + return stripe_no * flseg->stripe_unit + rem; +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = 
FILELAYOUT_LSEG(lseg); + + switch (flseg->stripe_type) { + case STRIPE_SPARSE: + return offset; + + case STRIPE_DENSE: + return filelayout_get_dense_offset(flseg, offset); + } + + BUG(); +} + +static void filelayout_reset_write(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct rpc_task *task = &data->task; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + data->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops, + hdr->dreq); + } +} + +static void filelayout_reset_read(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct rpc_task *task = &data->task; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + data->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops, + hdr->dreq); + } +} + +static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) +{ + if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return; + pnfs_return_layout(inode); +} + +static int filelayout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct inode *inode = lo->plh_inode; + struct nfs_server *mds_server = NFS_SERVER(inode); + struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); + struct nfs_client *mds_client = mds_server->nfs_client; + 
struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; + + if (task->tk_status >= 0) + return 0; + + switch (task->tk_status) { + /* MDS state errors */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state == NULL) + break; + nfs_remove_bad_delegation(state->inode); + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + } + nfs4_schedule_lease_recovery(mds_client); + goto wait_on_recovery; + /* DS session errors */ + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + break; + /* Invalidate Layout errors */ + case -NFS4ERR_PNFS_NO_LAYOUT: + case -ESTALE: /* mapped NFS4ERR_STALE */ + case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ + case -EISDIR: /* mapped NFS4ERR_ISDIR */ + case -NFS4ERR_FHEXPIRED: + case -NFS4ERR_WRONG_TYPE: + dprintk("%s Invalid layout error %d\n", __func__, + task->tk_status); + /* + * Destroy layout so new i/o will get a new layout. + * Layout will not be destroyed until all current lseg + * references are put. Mark layout as invalid to resend failed + * i/o and all i/o waiting on the slot table to the MDS until + * layout is destroyed and a new valid layout is obtained. 
+ */ + pnfs_destroy_layout(NFS_I(inode)); + rpc_wake_up(&tbl->slot_tbl_waitq); + goto reset; + /* RPC connection errors */ + case -ECONNREFUSED: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EIO: + case -ETIMEDOUT: + case -EPIPE: + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + nfs4_mark_deviceid_unavailable(devid); + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); + rpc_wake_up(&tbl->slot_tbl_waitq); + /* fall through */ + default: +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + } +out: + task->tk_status = 0; + return -EAGAIN; +out_bad_stateid: + task->tk_status = -EIO; + return 0; +wait_on_recovery: + rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) + rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); + goto out; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, + struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + int err; + + trace_nfs4_pnfs_read(data, task->tk_status); + err = filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, hdr->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + filelayout_reset_read(data); + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + return 0; +} + +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. 
+ */ +static void +filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) +{ + struct nfs_pgio_header *hdr = wdata->header; + + if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || + wdata->res.verf->committed == NFS_FILE_SYNC) + return; + + pnfs_set_layoutcommit(wdata); + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); +} + +bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node) +{ + return filelayout_test_devid_invalid(node) || + nfs4_test_deviceid_unavailable(node); +} + +static bool +filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg); + + return filelayout_test_devid_unavailable(node); +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void filelayout_read_prepare(struct rpc_task *task, void *data) +{ + struct nfs_pgio_data *rdata = data; + + if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } + if (filelayout_reset_to_mds(rdata->header->lseg)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + filelayout_reset_read(rdata); + rpc_exit(task, 0); + return; + } + rdata->pgio_done_cb = filelayout_read_done_cb; + + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, + &rdata->args.seq_args, + &rdata->res.seq_res, + task)) + return; + if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, + rdata->args.lock_context, FMODE_READ) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_data *rdata = data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && + task->tk_status == 0) { + nfs41_sequence_done(task, 
&rdata->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + rdata->header->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_read_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_data *rdata = data; + + rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); +} + +static void filelayout_read_release(void *data) +{ + struct nfs_pgio_data *rdata = data; + struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; + + filelayout_fenceme(lo->plh_inode, lo); + nfs_put_client(rdata->ds_clp); + rdata->header->mds_ops->rpc_release(data); +} + +static int filelayout_write_done_cb(struct rpc_task *task, + struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + int err; + + trace_nfs4_pnfs_write(data, task->tk_status); + err = filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, hdr->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + filelayout_reset_write(data); + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + filelayout_set_layoutcommit(data); + return 0; +} + +/* Fake up some data that will cause nfs_commit_release to retry the writes. 
*/ +static void prepare_to_resend_writes(struct nfs_commit_data *data) +{ + struct nfs_page *first = nfs_list_entry(data->pages.next); + + data->task.tk_status = 0; + memcpy(&data->verf.verifier, &first->wb_verf, + sizeof(data->verf.verifier)); + data->verf.verifier.data[0]++; /* ensure verifier mismatch */ +} + +static int filelayout_commit_done_cb(struct rpc_task *task, + struct nfs_commit_data *data) +{ + int err; + + trace_nfs4_pnfs_commit_ds(data, task->tk_status); + err = filelayout_async_handle_error(task, NULL, data->ds_clp, + data->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + prepare_to_resend_writes(data); + return -EAGAIN; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + return 0; +} + +static void filelayout_write_prepare(struct rpc_task *task, void *data) +{ + struct nfs_pgio_data *wdata = data; + + if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } + if (filelayout_reset_to_mds(wdata->header->lseg)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + filelayout_reset_write(wdata); + rpc_exit(task, 0); + return; + } + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, + &wdata->res.seq_res, + task)) + return; + if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, + wdata->args.lock_context, FMODE_WRITE) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_data *wdata = data; + + if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && + task->tk_status == 0) { + nfs41_sequence_done(task, &wdata->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + wdata->header->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_write_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_data *wdata = data; + + rpc_count_iostats(task, 
NFS_SERVER(wdata->header->inode)->client->cl_metrics); +} + +static void filelayout_write_release(void *data) +{ + struct nfs_pgio_data *wdata = data; + struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; + + filelayout_fenceme(lo->plh_inode, lo); + nfs_put_client(wdata->ds_clp); + wdata->header->mds_ops->rpc_release(data); +} + +static void filelayout_commit_prepare(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, + &wdata->res.seq_res, + task); +} + +static void filelayout_write_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_commit_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + + rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); +} + +static void filelayout_commit_release(void *calldata) +{ + struct nfs_commit_data *data = calldata; + + data->completion_ops->completion(data); + pnfs_put_lseg(data->lseg); + nfs_put_client(data->ds_clp); + nfs_commitdata_release(data); +} + +static const struct rpc_call_ops filelayout_read_call_ops = { + .rpc_call_prepare = filelayout_read_prepare, + .rpc_call_done = filelayout_read_call_done, + .rpc_count_stats = filelayout_read_count_stats, + .rpc_release = filelayout_read_release, +}; + +static const struct rpc_call_ops filelayout_write_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_count_stats = filelayout_write_count_stats, + .rpc_release = filelayout_write_release, +}; + +static const struct rpc_call_ops filelayout_commit_call_ops = { + .rpc_call_prepare = filelayout_commit_prepare, + .rpc_call_done = filelayout_write_commit_done, + .rpc_count_stats = filelayout_commit_count_stats, + .rpc_release = 
filelayout_commit_release, +}; + +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, hdr->inode->i_ino, + data->args.pgbase, (size_t)data->args.count, offset); + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) + return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + + dprintk("%s USE DS: %s cl_count %d\n", __func__, + ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + + /* No multipath support. Use first DS */ + atomic_inc(&ds->ds_clp->cl_count); + data->ds_clp = ds->ds_clp; + data->ds_idx = idx; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + nfs_initiate_pgio(ds_clnt, data, + &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); + return PNFS_ATTEMPTED; +} + +/* Perform async writes. 
*/ +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) +{ + struct nfs_pgio_header *hdr = data->header; + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) + return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", + __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, + offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + + data->pgio_done_cb = filelayout_write_done_cb; + atomic_inc(&ds->ds_clp->cl_count); + data->ds_clp = ds->ds_clp; + data->ds_idx = idx; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + + /* Perform an asynchronous write */ + nfs_initiate_pgio(ds_clnt, data, + &filelayout_write_call_ops, sync, + RPC_TASK_SOFTCONN); + return PNFS_ATTEMPTED; +} + +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * At this point no generic layer initialization of the lseg has occurred, + * and nothing has been added to the layout_hdr cache. 
+ * + */ +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + + dprintk("--> %s\n", __func__); + + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", + __func__); + goto out; + } + + if (fl->pattern_offset > lgr->range.offset) { + dprintk("%s pattern_offset %lld too large\n", + __func__, fl->pattern_offset); + goto out; + } + + if (!fl->stripe_unit) { + dprintk("%s Invalid stripe unit (%u)\n", + __func__, fl->stripe_unit); + goto out; + } + + /* find and reference the deviceid */ + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, + NFS_SERVER(lo->plh_inode)->nfs_client, id); + if (d == NULL) { + dsaddr = filelayout_get_device_info(lo->plh_inode, id, + lo->plh_lc_cred, gfp_flags); + if (dsaddr == NULL) + goto out; + } else + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + /* Found deviceid is unavailable */ + if (filelayout_test_devid_unavailable(&dsaddr->id_node)) + goto out_put; + + fl->dsaddr = dsaddr; + + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if ((fl->stripe_type == STRIPE_SPARSE && + fl->num_fh > 1 && fl->num_fh != dsaddr->ds_num) || + (fl->stripe_type == STRIPE_DENSE && + fl->num_fh != dsaddr->stripe_count)) { + dprintk("%s num_fh %u not valid for given packing\n", + __func__, fl->num_fh); + goto out_put; + } + + status = 0; +out: + dprintk("--> %s returns %d\n", __func__, status); + return status; +out_put: + nfs4_fl_put_deviceid(dsaddr); + goto out; +} + +static void filelayout_free_fh_array(struct 
nfs4_filelayout_segment *fl) +{ + int i; + + /* + * A failed decode can leave num_fh non-zero with no array allocated + * (or with the array already freed here), so never index a NULL array. + */ + if (!fl->fh_array) + return; + + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); + fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) +{ + filelayout_free_fh_array(fl); + kfree(fl); +} + +/* + * filelayout_decode_layout() + * + * Decode the XDR-encoded file layout held in @lgr->layoutp into @fl and copy + * the device id into @id. Returns 0 on success, -ENOMEM if the scratch page + * cannot be allocated, or -EIO if the layout is truncated or fails a sanity + * check. On the -EIO paths any file handles allocated so far have been freed. + */ +static int +filelayout_decode_layout(struct pnfs_layout_hdr *flo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr, + struct nfs4_deviceid *id, + gfp_t gfp_flags) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + __be32 *p; + uint32_t nfl_util; + int i; + + dprintk("%s: set_layout_map Begin\n", __func__); + + scratch = alloc_page(gfp_flags); + if (!scratch) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* 20 = nfl_util (4), first_stripe_index (4), pattern_offset (8), + * num_fh (4) */ + p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); + if (unlikely(!p)) + goto out_err; + + memcpy(id, p, sizeof(*id)); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + nfs4_print_deviceid(id); + + nfl_util = be32_to_cpup(p++); + if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) + fl->commit_through_mds = 1; + if (nfl_util & NFL4_UFLG_DENSE) + fl->stripe_type = STRIPE_DENSE; + else + fl->stripe_type = STRIPE_SPARSE; + fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + + fl->first_stripe_index = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &fl->pattern_offset); + fl->num_fh = be32_to_cpup(p++); + + dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu\n", + __func__, nfl_util, fl->num_fh, fl->first_stripe_index, + fl->pattern_offset); + + /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
+ * Further checking is done in filelayout_check_layout */ + if (fl->num_fh > + max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) + goto out_err; + + if (fl->num_fh > 0) { + fl->fh_array = kcalloc(fl->num_fh, sizeof(fl->fh_array[0]), + gfp_flags); + if (!fl->fh_array) + goto out_err; + } + + for (i = 0; i < fl->num_fh; i++) { + /* Do we want to use a mempool here? */ + fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); + if (!fl->fh_array[i]) + goto out_err_free; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free; + fl->fh_array[i]->size = be32_to_cpup(p++); + /* + * The handle is copied into nfs_fh.data[] below, so bound its + * length by that buffer (NFS_MAXFHSIZE) rather than by + * sizeof(struct nfs_fh), which also counts the size member. + */ + if (fl->fh_array[i]->size > NFS_MAXFHSIZE) { + printk(KERN_ERR "NFS: Too big fh %d received %d\n", + i, fl->fh_array[i]->size); + goto out_err_free; + } + + p = xdr_inline_decode(&stream, fl->fh_array[i]->size); + if (unlikely(!p)) + goto out_err_free; + memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); + dprintk("DEBUG: %s: fh len %d\n", __func__, + fl->fh_array[i]->size); + } + + __free_page(scratch); + return 0; + +out_err_free: + filelayout_free_fh_array(fl); +out_err: + __free_page(scratch); + return -EIO; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + nfs4_fl_put_deviceid(fl->dsaddr); + /* This assumes a single RW lseg */ + if (lseg->pls_range.iomode == IOMODE_RW) { + struct nfs4_filelayout *flo; + + flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); + flo->commit_info.nbuckets = 0; + kfree(flo->commit_info.buckets); + flo->commit_info.buckets = NULL; + } + _filelayout_free_lseg(fl); +} + +static int +filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + gfp_t gfp_flags) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + struct pnfs_commit_bucket *buckets; + int size, i; + + if (fl->commit_through_mds) + return 0; + + size = (fl->stripe_type == STRIPE_SPARSE) ?
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + if (cinfo->ds->nbuckets >= size) { + /* This assumes there is only one IOMODE_RW lseg. What + * we really want to do is have a layout_hdr level + * dictionary of <multipath_list4, fh> keys, each + * associated with a struct list_head, populated by calls + * to filelayout_write_pagelist(). + * */ + return 0; + } + + buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), + gfp_flags); + if (!buckets) + return -ENOMEM; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; + } + + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets >= size) + goto out; + for (i = 0; i < cinfo->ds->nbuckets; i++) { + list_splice(&cinfo->ds->buckets[i].written, + &buckets[i].written); + list_splice(&cinfo->ds->buckets[i].committing, + &buckets[i].committing); + buckets[i].direct_verf.committed = + cinfo->ds->buckets[i].direct_verf.committed; + buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; + buckets[i].clseg = cinfo->ds->buckets[i].clseg; + } + swap(cinfo->ds->buckets, buckets); + cinfo->ds->nbuckets = size; +out: + spin_unlock(cinfo->lock); + kfree(buckets); + return 0; +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct nfs4_filelayout_segment *fl; + int rc; + struct nfs4_deviceid id; + + dprintk("--> %s\n", __func__); + fl = kzalloc(sizeof(*fl), gfp_flags); + if (!fl) + return NULL; + + rc = filelayout_decode_layout(layoutid, fl, lgr, &id, gfp_flags); + if (rc != 0 || filelayout_check_layout(layoutid, fl, lgr, &id, gfp_flags)) { + _filelayout_free_lseg(fl); + return NULL; + } + return &fl->generic_hdr; +} + +/* + * filelayout_pg_test(). 
Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 p_stripe, r_stripe; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + if (p_stripe != r_stripe) + return 0; + } + + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); +} + +static void +filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_read_mds(pgio); +} + +static void +filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + struct nfs_commit_info cinfo; + int status; + + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + goto out_mds; + nfs_init_cinfo(&cinfo, pgio->pg_inode, 
pgio->pg_dreq); + status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); + if (status < 0) { + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + goto out_mds; + } + return; +out_mds: + nfs_pageio_reset_write_mds(pgio); +} + +static const struct nfs_pageio_ops filelayout_pg_read_ops = { + .pg_init = filelayout_pg_init_read, + .pg_test = filelayout_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops filelayout_pg_write_ops = { + .pg_init = filelayout_pg_init_write, + .pg_test = filelayout_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + +static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) +{ + if (fl->stripe_type == STRIPE_SPARSE) + return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); + else + return j; +} + +/* The generic layer is about to remove the req from the commit list. + * If this will make the bucket empty, it will need to put the lseg reference. + */ +static void +filelayout_clear_request_commit(struct nfs_page *req, + struct nfs_commit_info *cinfo) +{ + struct pnfs_layout_segment *freeme = NULL; + + spin_lock(cinfo->lock); + if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) + goto out; + cinfo->ds->nwritten--; + if (list_is_singular(&req->wb_list)) { + struct pnfs_commit_bucket *bucket; + + bucket = list_first_entry(&req->wb_list, + struct pnfs_commit_bucket, + written); + freeme = bucket->wlseg; + bucket->wlseg = NULL; + } +out: + nfs_request_remove_commit_list(req, cinfo); + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); +} + +static struct list_head * +filelayout_choose_commit_list(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + u32 i, j; + struct list_head *list; + struct pnfs_commit_bucket *buckets; + + if (fl->commit_through_mds) + return &cinfo->mds->list; + + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up 
being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. + */ + j = nfs4_fl_calc_j_index(lseg, req_offset(req)); + i = select_bucket_index(fl, j); + spin_lock(cinfo->lock); + buckets = cinfo->ds->buckets; + list = &buckets[i].written; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg. That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * filelayout_clear_request_commit + */ + buckets[i].wlseg = pnfs_get_lseg(lseg); + } + set_bit(PG_COMMIT_TO_DS, &req->wb_flags); + cinfo->ds->nwritten++; + spin_unlock(cinfo->lock); + return list; +} + +static void +filelayout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ + struct list_head *list; + + list = filelayout_choose_commit_list(req, lseg, cinfo); + nfs_request_add_commit_list(req, list, cinfo); +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) + return i; + else + return nfs4_fl_calc_ds_index(lseg, i); +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + } + return flseg->fh_array[i]; +} + +static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + u32 idx; + struct nfs_fh *fh; + + idx = 
calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) + goto out_err; + + ds_clnt = nfs4_find_or_create_ds_client(ds->ds_clp, data->inode); + if (IS_ERR(ds_clnt)) + goto out_err; + + dprintk("%s ino %lu, how %d cl_count %d\n", __func__, + data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->commit_done_cb = filelayout_commit_done_cb; + atomic_inc(&ds->ds_clp->cl_count); + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, + &filelayout_commit_call_ops, how, + RPC_TASK_SOFTCONN); +out_err: + prepare_to_resend_writes(data); + filelayout_commit_release(data); + return -EAGAIN; +} + +static int +transfer_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max) +{ + struct nfs_page *req, *tmp; + int ret = 0; + + list_for_each_entry_safe(req, tmp, src, wb_list) { + if (!nfs_lock_request(req)) + continue; + kref_get(&req->wb_kref); + if (cond_resched_lock(cinfo->lock)) + list_safe_reset_next(req, tmp, wb_list); + nfs_request_remove_commit_list(req, cinfo); + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); + nfs_list_add_request(req, dst); + ret++; + if ((ret == max) && !cinfo->dreq) + break; + } + return ret; +} + +/* Note called with cinfo->lock held. */ +static int +filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo, + int max) +{ + struct list_head *src = &bucket->written; + struct list_head *dst = &bucket->committing; + int ret; + + ret = transfer_commit_list(src, dst, cinfo, max); + if (ret) { + cinfo->ds->nwritten -= ret; + cinfo->ds->ncommitting += ret; + bucket->clseg = bucket->wlseg; + if (list_empty(src)) + bucket->wlseg = NULL; + else + pnfs_get_lseg(bucket->clseg); + } + return ret; +} + +/* Move reqs from written to committing lists, returning count of number moved. 
+ * Note called with cinfo->lock held. + */ +static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo, + int max) +{ + int i, rv = 0, cnt; + + for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { + cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i], + cinfo, max); + max -= cnt; + rv += cnt; + } + return rv; +} + +/* Pull everything off the committing lists and dump into @dst */ +static void filelayout_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + struct pnfs_commit_bucket *b; + struct pnfs_layout_segment *freeme; + int i; + +restart: + spin_lock(cinfo->lock); + for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + if (transfer_commit_list(&b->written, dst, cinfo, 0)) { + freeme = b->wlseg; + b->wlseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + goto restart; + } + } + cinfo->ds->nwritten = 0; + spin_unlock(cinfo->lock); +} + +static unsigned int +alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) +{ + struct pnfs_ds_commit_info *fl_cinfo; + struct pnfs_commit_bucket *bucket; + struct nfs_commit_data *data; + int i, j; + unsigned int nreq = 0; + struct pnfs_layout_segment *freeme; + + fl_cinfo = cinfo->ds; + bucket = fl_cinfo->buckets; + for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { + if (list_empty(&bucket->committing)) + continue; + data = nfs_commitdata_alloc(); + if (!data) + break; + data->ds_commit_index = i; + spin_lock(cinfo->lock); + data->lseg = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + list_add(&data->pages, list); + nreq++; + } + + /* Clean up on error */ + for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) { + if (list_empty(&bucket->committing)) + continue; + nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); + spin_lock(cinfo->lock); + freeme = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + } + /* Caller will clean up entries put on list */ + 
return nreq; +} + +/* This follows nfs_commit_list pretty closely */ +static int +filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how, struct nfs_commit_info *cinfo) +{ + struct nfs_commit_data *data, *tmp; + LIST_HEAD(list); + unsigned int nreq = 0; + + if (!list_empty(mds_pages)) { + data = nfs_commitdata_alloc(); + if (data != NULL) { + data->lseg = NULL; + list_add(&data->pages, &list); + nreq++; + } else + nfs_retry_commit(mds_pages, NULL, cinfo); + } + + nreq += alloc_ds_commits(cinfo, &list); + + if (nreq == 0) { + cinfo->completion_ops->error_cleanup(NFS_I(inode)); + goto out; + } + + atomic_add(nreq, &cinfo->mds->rpcs_out); + + list_for_each_entry_safe(data, tmp, &list, pages) { + list_del_init(&data->pages); + if (!data->lseg) { + nfs_init_commit(data, mds_pages, NULL, cinfo); + nfs_initiate_commit(NFS_CLIENT(inode), data, + data->mds_ops, how, 0); + } else { + struct pnfs_commit_bucket *buckets; + + buckets = cinfo->ds->buckets; + nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo); + filelayout_initiate_commit(data, how); + } + } +out: + cinfo->ds->ncommitting = 0; + return PNFS_ATTEMPTED; +} + +static void +filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +{ + nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); +} + +static struct pnfs_layout_hdr * +filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_filelayout *flo; + + flo = kzalloc(sizeof(*flo), gfp_flags); + return flo != NULL ? 
&flo->generic_hdr : NULL; +} + +static void +filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + kfree(FILELAYOUT_FROM_HDR(lo)); +} + +static struct pnfs_ds_commit_info * +filelayout_get_ds_info(struct inode *inode) +{ + struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; + + if (layout == NULL) + return NULL; + else + return &FILELAYOUT_FROM_HDR(layout)->commit_info; +} + +static struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .alloc_layout_hdr = filelayout_alloc_layout_hdr, + .free_layout_hdr = filelayout_free_layout_hdr, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, + .pg_read_ops = &filelayout_pg_read_ops, + .pg_write_ops = &filelayout_pg_write_ops, + .get_ds_info = &filelayout_get_ds_info, + .mark_request_commit = filelayout_mark_request_commit, + .clear_request_commit = filelayout_clear_request_commit, + .scan_commit_lists = filelayout_scan_commit_lists, + .recover_commit_reqs = filelayout_recover_commit_reqs, + .commit_pagelist = filelayout_commit_pagelist, + .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, + .free_deviceid_node = filelayout_free_deveiceid_node, +}; + +static int __init nfs4filelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&filelayout_type); +} + +static void __exit nfs4filelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", + __func__); + pnfs_unregister_layoutdriver(&filelayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-1"); + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h new file mode 100644 index 00000000000..ffbddf2219e --- /dev/null +++ b/fs/nfs/filelayout/filelayout.h @@ -0,0 +1,156 @@ +/* + * NFSv4 file layout driver data structures. 
+ * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#ifndef FS_NFS_NFS4FILELAYOUT_H +#define FS_NFS_NFS4FILELAYOUT_H + +#include "../pnfs.h" + +/* + * Default data server connection timeout and retrans values. + * Set by module parameters dataserver_timeo and dataserver_retrans. + */ +#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ +#define NFS4_DEF_DS_RETRANS 5 + +/* + * Field testing shows we need to support up to 4096 stripe indices. + * We store each index as a u8 (u32 on the wire) to keep the memory footprint + * reasonable. This in turn means we support a maximum of 256 + * RFC 5661 multipath_list4 structures. 
+ */ +#define NFS4_PNFS_MAX_STRIPE_CNT 4096 +#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ + +/* error codes for internal use */ +#define NFS4ERR_RESET_TO_MDS 12001 + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds_addr { + struct sockaddr_storage da_addr; + size_t da_addrlen; + struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *da_remotestr; /* human readable addr+port */ +}; + +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *ds_remotestr; /* comma sep list of addrs */ + struct list_head ds_addrs; + struct nfs_client *ds_clp; + atomic_t ds_count; + unsigned long ds_state; +#define NFS4DS_CONNECTING 0 /* ds is establishing connection */ +}; + +struct nfs4_file_layout_dsaddr { + struct nfs4_deviceid_node id_node; + u32 stripe_count; + u8 *stripe_indices; + u32 ds_num; + struct nfs4_pnfs_ds *ds_list[1]; +}; + +struct nfs4_filelayout_segment { + struct pnfs_layout_segment generic_hdr; + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ + unsigned int num_fh; + struct nfs_fh **fh_array; +}; + +struct nfs4_filelayout { + struct pnfs_layout_hdr generic_hdr; + struct pnfs_ds_commit_info commit_info; +}; + +static inline struct nfs4_filelayout * +FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct nfs4_filelayout, generic_hdr); +} + +static inline struct nfs4_filelayout_segment * +FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_filelayout_segment, + generic_hdr); +} + +static inline struct nfs4_deviceid_node * +FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) +{ + return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; +} + +static inline void +filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node) 
+{ + u32 *p = (u32 *)&node->deviceid; + + printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n", + p[0], p[1], p[2], p[3]); + + set_bit(NFS_DEVICEID_INVALID, &node->flags); +} + +static inline bool +filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) +{ + return test_bit(NFS_DEVICEID_INVALID, &node->flags); +} + +extern bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node); + +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); + +extern void print_ds(struct nfs4_pnfs_ds *ds); +u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, + u32 ds_idx); +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); +struct nfs4_file_layout_dsaddr * +filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, gfp_t gfp_flags); + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c new file mode 100644 index 00000000000..44bf0140a4c --- /dev/null +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -0,0 +1,843 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * Garth Goodson <Garth.Goodson@netapp.com> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. 
If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/sunrpc/addr.h> + +#include "../internal.h" +#include "../nfs4session.h" +#include "filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; +static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + * - set to 1 on allocation + * - incremented when a device id maps a data server already in the cache. + * - decremented when deviceid is removed from the cache. + */ +static DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + printk("%s NULL device\n", __func__); + return; + } + printk(" ds %s\n" + " ref count %d\n" + " client %p\n" + " cl_exchange_flags %x\n", + ds->ds_remotestr, + atomic_read(&ds->ds_count), ds->ds_clp, + ds->ds_clp ? 
ds->ds_clp->cl_exchange_flags : 0); +} + +static bool +same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) +{ + struct sockaddr_in *a, *b; + struct sockaddr_in6 *a6, *b6; + + if (addr1->sa_family != addr2->sa_family) + return false; + + switch (addr1->sa_family) { + case AF_INET: + a = (struct sockaddr_in *)addr1; + b = (struct sockaddr_in *)addr2; + + if (a->sin_addr.s_addr == b->sin_addr.s_addr && + a->sin_port == b->sin_port) + return true; + break; + + case AF_INET6: + a6 = (struct sockaddr_in6 *)addr1; + b6 = (struct sockaddr_in6 *)addr2; + + /* LINKLOCAL addresses must have matching scope_id */ + if (ipv6_addr_src_scope(&a6->sin6_addr) == + IPV6_ADDR_SCOPE_LINKLOCAL && + a6->sin6_scope_id != b6->sin6_scope_id) + return false; + + if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && + a6->sin6_port == b6->sin6_port) + return true; + break; + + default: + dprintk("%s: unhandled address family: %u\n", + __func__, addr1->sa_family); + return false; + } + + return false; +} + +static bool +_same_data_server_addrs_locked(const struct list_head *dsaddrs1, + const struct list_head *dsaddrs2) +{ + struct nfs4_pnfs_ds_addr *da1, *da2; + + /* step through both lists, comparing as we go */ + for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), + da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); + da1 != NULL && da2 != NULL; + da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), + da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { + if (!same_sockaddr((struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) + return false; + } + if (da1 == NULL && da2 == NULL) + return true; + + return false; +} + +/* + * Lookup DS by addresses. 
nfs4_ds_cache_lock is held + */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(const struct list_head *dsaddrs) +{ + struct nfs4_pnfs_ds *ds; + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + return ds; + return NULL; +} + +/* + * Create an rpc connection to the nfs4_pnfs_ds data server + * Currently only supports IPv4 and IPv6 addresses + */ +static int +nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) +{ + struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs4_pnfs_ds_addr *da; + int status = 0; + + dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, + mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); + + list_for_each_entry(da, &ds->ds_addrs, da_node) { + dprintk("%s: DS %s: trying address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + + clp = nfs4_set_ds_client(mds_srv->nfs_client, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + dataserver_timeo, dataserver_retrans); + if (!IS_ERR(clp)) + break; + } + + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); + if (status) + goto out_put; + + smp_wmb(); + ds->ds_clp = clp; + dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); +out: + return status; +out_put: + nfs_put_client(clp); + goto out; +} + +static void +destroy_ds(struct nfs4_pnfs_ds *ds) +{ + struct nfs4_pnfs_ds_addr *da; + + dprintk("--> %s\n", __func__); + ifdebug(FACILITY) + print_ds(ds); + + if (ds->ds_clp) + nfs_put_client(ds->ds_clp); + + while (!list_empty(&ds->ds_addrs)) { + da = list_first_entry(&ds->ds_addrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + kfree(ds->ds_remotestr); + kfree(ds); +} + +void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + struct nfs4_pnfs_ds *ds; + int i; + + 
nfs4_print_deviceid(&dsaddr->id_node.deviceid); + + for (i = 0; i < dsaddr->ds_num; i++) { + ds = dsaddr->ds_list[i]; + if (ds != NULL) { + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } + } + } + kfree(dsaddr->stripe_indices); + kfree(dsaddr); +} + +/* + * Create a string with a human readable address and port to avoid + * complicated setup around many dprinks. + */ +static char * +nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da; + char *remotestr; + size_t len; + char *p; + + len = 3; /* '{', '}' and eol */ + list_for_each_entry(da, dsaddrs, da_node) { + len += strlen(da->da_remotestr) + 1; /* string plus comma */ + } + + remotestr = kzalloc(len, gfp_flags); + if (!remotestr) + return NULL; + + p = remotestr; + *(p++) = '{'; + len--; + list_for_each_entry(da, dsaddrs, da_node) { + size_t ll = strlen(da->da_remotestr); + + if (ll > len) + goto out_err; + + memcpy(p, da->da_remotestr, ll); + p += ll; + len -= ll; + + if (len < 1) + goto out_err; + (*p++) = ','; + len--; + } + if (len < 2) + goto out_err; + *(p++) = '}'; + *p = '\0'; + return remotestr; +out_err: + kfree(remotestr); + return NULL; +} + +static struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; + char *remotestr; + + if (list_empty(dsaddrs)) { + dprintk("%s: no addresses defined\n", __func__); + goto out; + } + + ds = kzalloc(sizeof(*ds), gfp_flags); + if (!ds) + goto out; + + /* this is only used for debugging, so it's ok if its NULL */ + remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); + + spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup_locked(dsaddrs); + if (tmp_ds == NULL) { + INIT_LIST_HEAD(&ds->ds_addrs); + list_splice_init(dsaddrs, &ds->ds_addrs); + ds->ds_remotestr = remotestr; + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + 
ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + dprintk("%s add new data server %s\n", __func__, + ds->ds_remotestr); + } else { + kfree(remotestr); + kfree(ds); + atomic_inc(&tmp_ds->ds_count); + dprintk("%s data server %s found, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_remotestr, + atomic_read(&tmp_ds->ds_count)); + ds = tmp_ds; + } + spin_unlock(&nfs4_ds_cache_lock); +out: + return ds; +} + +/* + * Currently only supports ipv4, ipv6 and one multi-path address. + */ +static struct nfs4_pnfs_ds_addr * +decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da = NULL; + char *buf, *portstr; + __be16 port; + int nlen, rlen; + int tmp[2]; + __be32 *p; + char *netid, *match_netid; + size_t len, match_netid_len; + char *startsep = ""; + char *endsep = ""; + + + /* r_netid */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; + nlen = be32_to_cpup(p++); + + p = xdr_inline_decode(streamp, nlen); + if (unlikely(!p)) + goto out_err; + + netid = kmalloc(nlen+1, gfp_flags); + if (unlikely(!netid)) + goto out_err; + + netid[nlen] = '\0'; + memcpy(netid, p, nlen); + + /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_free_netid; + rlen = be32_to_cpup(p); + + p = xdr_inline_decode(streamp, rlen); + if (unlikely(!p)) + goto out_free_netid; + + /* port is ".ABC.DEF", 8 chars max */ + if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { + dprintk("%s: Invalid address, length %d\n", __func__, + rlen); + goto out_free_netid; + } + buf = kmalloc(rlen + 1, gfp_flags); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_free_netid; + } + buf[rlen] = '\0'; + memcpy(buf, p, rlen); + + /* replace port '.' 
with '-' */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot in port\n", + __func__); + goto out_free_buf; + } + *portstr = '-'; + + /* find '.' between address and port */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot between address and " + "port\n", __func__); + goto out_free_buf; + } + *portstr = '\0'; + + da = kzalloc(sizeof(*da), gfp_flags); + if (unlikely(!da)) + goto out_free_buf; + + INIT_LIST_HEAD(&da->da_node); + + if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, + sizeof(da->da_addr))) { + dprintk("%s: error parsing address %s\n", __func__, buf); + goto out_free_da; + } + + portstr++; + sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); + port = htons((tmp[0] << 8) | (tmp[1])); + + switch (da->da_addr.ss_family) { + case AF_INET: + ((struct sockaddr_in *)&da->da_addr)->sin_port = port; + da->da_addrlen = sizeof(struct sockaddr_in); + match_netid = "tcp"; + match_netid_len = 3; + break; + + case AF_INET6: + ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; + da->da_addrlen = sizeof(struct sockaddr_in6); + match_netid = "tcp6"; + match_netid_len = 4; + startsep = "["; + endsep = "]"; + break; + + default: + dprintk("%s: unsupported address family: %u\n", + __func__, da->da_addr.ss_family); + goto out_free_da; + } + + if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { + dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", + __func__, netid, match_netid); + goto out_free_da; + } + + /* save human readable address */ + len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; + da->da_remotestr = kzalloc(len, gfp_flags); + + /* NULL is ok, only used for dprintk */ + if (da->da_remotestr) + snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, + buf, endsep, ntohs(port)); + + dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); + kfree(buf); + kfree(netid); + return da; + +out_free_da: + kfree(da); +out_free_buf: + 
dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); + kfree(buf); +out_free_netid: + kfree(netid); +out_err: + return NULL; +} + +/* Decode opaque device data and return the result */ +static struct nfs4_file_layout_dsaddr* +decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) +{ + int i; + u32 cnt, num; + u8 *indexp; + __be32 *p; + u8 *stripe_indices; + u8 max_stripe_index; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct list_head dsaddrs; + struct nfs4_pnfs_ds_addr *da; + + /* set up xdr stream */ + scratch = alloc_page(gfp_flags); + if (!scratch) + goto out_err; + + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* Get the stripe count (number of stripe index) */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_scratch; + + cnt = be32_to_cpup(p); + dprintk("%s stripe count %d\n", __func__, cnt); + if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { + printk(KERN_WARNING "NFS: %s: stripe count %d greater than " + "supported maximum %d\n", __func__, + cnt, NFS4_PNFS_MAX_STRIPE_CNT); + goto out_err_free_scratch; + } + + /* read stripe indices */ + stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags); + if (!stripe_indices) + goto out_err_free_scratch; + + p = xdr_inline_decode(&stream, cnt << 2); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + indexp = &stripe_indices[0]; + max_stripe_index = 0; + for (i = 0; i < cnt; i++) { + *indexp = be32_to_cpup(p++); + max_stripe_index = max(max_stripe_index, *indexp); + indexp++; + } + + /* Check the multipath list count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + num = be32_to_cpup(p); + dprintk("%s ds_num %u\n", __func__, num); + if (num > NFS4_PNFS_MAX_MULTI_CNT) { + printk(KERN_WARNING "NFS: %s: multipath count %d greater than " + "supported 
maximum %d\n", __func__, + num, NFS4_PNFS_MAX_MULTI_CNT); + goto out_err_free_stripe_indices; + } + + /* validate stripe indices are all < num */ + if (max_stripe_index >= num) { + printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n", + __func__, max_stripe_index, num); + goto out_err_free_stripe_indices; + } + + dsaddr = kzalloc(sizeof(*dsaddr) + + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), + gfp_flags); + if (!dsaddr) + goto out_err_free_stripe_indices; + + dsaddr->stripe_count = cnt; + dsaddr->stripe_indices = stripe_indices; + stripe_indices = NULL; + dsaddr->ds_num = num; + nfs4_init_deviceid_node(&dsaddr->id_node, + NFS_SERVER(ino)->pnfs_curr_ld, + NFS_SERVER(ino)->nfs_client, + &pdev->dev_id); + + INIT_LIST_HEAD(&dsaddrs); + + for (i = 0; i < dsaddr->ds_num; i++) { + int j; + u32 mp_count; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + + mp_count = be32_to_cpup(p); /* multipath count */ + for (j = 0; j < mp_count; j++) { + da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, + &stream, gfp_flags); + if (da) + list_add_tail(&da->da_node, &dsaddrs); + } + if (list_empty(&dsaddrs)) { + dprintk("%s: no suitable DS addresses found\n", + __func__); + goto out_err_free_deviceid; + } + + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + if (!dsaddr->ds_list[i]) + goto out_err_drain_dsaddrs; + + /* If DS was already in cache, free ds addrs */ + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + } + + __free_page(scratch); + return dsaddr; + +out_err_drain_dsaddrs: + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } +out_err_free_deviceid: + nfs4_fl_free_deviceid(dsaddr); + /* stripe_indicies was part of dsaddr */ + goto out_err_free_scratch; 
+out_err_free_stripe_indices: + kfree(stripe_indices); +out_err_free_scratch: + __free_page(scratch); +out_err: + dprintk("%s ERROR: returning NULL\n", __func__); + return NULL; +} + +/* + * Decode the opaque device specified in 'dev' and add it to the cache of + * available devices. + */ +static struct nfs4_file_layout_dsaddr * +decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct nfs4_file_layout_dsaddr *n, *new; + + new = decode_device(inode, dev, gfp_flags); + if (!new) { + printk(KERN_WARNING "NFS: %s: Could not decode or add device\n", + __func__); + return NULL; + } + + d = nfs4_insert_deviceid_node(&new->id_node); + n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + if (n != new) { + nfs4_fl_free_deviceid(new); + return n; + } + + return new; +} + +/* + * Retrieve the information for dev_id, add it to the list + * of available devices, and return it. + */ +struct nfs4_file_layout_dsaddr * +filelayout_get_device_info(struct inode *inode, + struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, + gfp_t gfp_flags) +{ + struct pnfs_device *pdev = NULL; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + int rc, i; + struct nfs_server *server = NFS_SERVER(inode); + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = nfs_page_array_len(0, max_resp_sz); + dprintk("%s inode %p max_resp_sz %u max_pages %d\n", + __func__, inode, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); + if (pdev == NULL) + return NULL; + + pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); + if (pages == NULL) { + kfree(pdev); + return NULL; + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_free; + } + + 
memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = LAYOUT_NFSV4_1_FILES; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = max_resp_sz; + pdev->mincount = 0; + pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; + + rc = nfs4_proc_getdeviceinfo(server, pdev, cred); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + dsaddr = decode_and_add_device(inode, pdev, gfp_flags); +out_free: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(pdev); + dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); + return dsaddr; +} + +void +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + nfs4_put_deviceid_node(&dsaddr->id_node); +} + +/* + * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit + * Then: ((res + fsi) % dsaddr->stripe_count) + */ +u32 +nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u64 tmp; + + tmp = offset - flseg->pattern_offset; + do_div(tmp, flseg->stripe_unit); + tmp += flseg->first_stripe_index; + return do_div(tmp, flseg->dsaddr->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) +{ + return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; +} + +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u32 i; + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + else + i = nfs4_fl_calc_ds_index(lseg, j); + } else + i = j; + return flseg->fh_array[i]; +} + +static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +{ + might_sleep(); + wait_on_bit(&ds->ds_state, 
NFS4DS_CONNECTING, + nfs_wait_bit_killable, TASK_KILLABLE); +} + +static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) +{ + smp_mb__before_atomic(); + clear_bit(NFS4DS_CONNECTING, &ds->ds_state); + smp_mb__after_atomic(); + wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); +} + + +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; + struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; + struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); + struct nfs4_pnfs_ds *ret = ds; + + if (ds == NULL) { + printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", + __func__, ds_idx); + filelayout_mark_devid_invalid(devid); + goto out; + } + smp_rmb(); + if (ds->ds_clp) + goto out_test_devid; + + if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { + struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); + int err; + + err = nfs4_ds_connect(s, ds); + if (err) + nfs4_mark_deviceid_unavailable(devid); + nfs4_clear_ds_conn_bit(ds); + } else { + /* Either ds is connected, or ds is NULL */ + nfs4_wait_ds_connect(ds); + } +out_test_devid: + if (filelayout_test_devid_unavailable(devid)) + ret = NULL; +out: + return ret; +} + +module_param(dataserver_retrans, uint, 0644); +MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " + "retries a request before it attempts further " + " recovery action."); +module_param(dataserver_timeo, uint, 0644); +MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " + "NFSv4.1 client waits for a response from a " + " data server before it retries an NFS request."); diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c new file mode 100644 index 00000000000..7cf2c4699b0 --- /dev/null +++ b/fs/nfs/fscache-index.c @@ -0,0 +1,337 @@ +/* NFS FS-Cache index structure definition + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/in6.h> + +#include "internal.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +/* + * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks + * the cookie for the top-level index object for NFS into here. The top-level + * index can then have other cache objects inserted into it. + */ +struct fscache_netfs nfs_fscache_netfs = { + .name = "nfs", + .version = 0, +}; + +/* + * Register NFS for caching + */ +int nfs_fscache_register(void) +{ + return fscache_register_netfs(&nfs_fscache_netfs); +} + +/* + * Unregister NFS for caching + */ +void nfs_fscache_unregister(void) +{ + fscache_unregister_netfs(&nfs_fscache_netfs); +} + +/* + * Layout of the key for an NFS server cache object.
+ */
+struct nfs_server_key {
+	uint16_t	nfsversion;		/* NFS protocol version */
+	uint16_t	family;			/* address family */
+	uint16_t	port;			/* IP port */
+	union {
+		struct in_addr	ipv4_addr;	/* IPv4 address */
+		struct in6_addr	ipv6_addr;	/* IPv6 address */
+	} addr[0];
+};
+
+/*
+ * Generate a key to describe a server in the main NFS index
+ * - The key is the NFS protocol version, the address family, the port and
+ *   the IPv4 or IPv6 address taken from the client's cl_addr
+ * - The fixed part of the key is zeroed before it is filled in, so bytes that
+ *   are never explicitly assigned do not carry stale buffer contents
+ * - We return the length of the key, or 0 if we can't generate one
+ *   (unknown address family)
+ * - NOTE(review): bufmax is not consulted here - confirm the caller's buffer
+ *   always holds sizeof(struct nfs_server_key) plus an IPv6 address
+ */
+static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
+				   void *buffer, uint16_t bufmax)
+{
+	const struct nfs_client *clp = cookie_netfs_data;
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+	const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+	struct nfs_server_key *key = buffer;
+	uint16_t len = sizeof(struct nfs_server_key);
+
+	/*
+	 * Clear the key first: zeroing after the stores below would wipe out
+	 * nfsversion and family, so servers differing only in protocol
+	 * version or address family would end up with the same key.
+	 */
+	memset(key, 0, len);
+
+	key->nfsversion = clp->rpc_ops->version;
+	key->family = clp->cl_addr.ss_family;
+
+	switch (clp->cl_addr.ss_family) {
+	case AF_INET:
+		key->port = sin->sin_port;
+		key->addr[0].ipv4_addr = sin->sin_addr;
+		len += sizeof(key->addr[0].ipv4_addr);
+		break;
+
+	case AF_INET6:
+		key->port = sin6->sin6_port;
+		key->addr[0].ipv6_addr = sin6->sin6_addr;
+		len += sizeof(key->addr[0].ipv6_addr);
+		break;
+
+	default:
+		printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+		       clp->cl_addr.ss_family);
+		len = 0;
+		break;
+	}
+
+	return len;
+}
+
+/*
+ * Define the server object for FS-Cache. This is used to describe a server
+ * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and
+ * server address parameters.
+ */ +const struct fscache_cookie_def nfs_fscache_server_index_def = { + .name = "NFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_server_get_key, +}; + +/* + * Generate a key to describe a superblock key in the main NFS index + */ +static uint16_t nfs_super_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_fscache_key *key; + const struct nfs_server *nfss = cookie_netfs_data; + uint16_t len; + + key = nfss->fscache_key; + len = sizeof(key->key) + key->key.uniq_len; + if (len > bufmax) { + len = 0; + } else { + memcpy(buffer, &key->key, sizeof(key->key)); + memcpy(buffer + sizeof(key->key), + key->key.uniquifier, key->key.uniq_len); + } + + return len; +} + +/* + * Define the superblock object for FS-Cache. This is used to describe a + * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS + * parameters that might cause a separate superblock. + */ +const struct fscache_cookie_def nfs_fscache_super_index_def = { + .name = "NFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = nfs_super_get_key, +}; + +/* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. 
+ */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + loff_t size; + u64 change_attr; +}; + +/* + * Generate a key to describe an NFS inode in an NFS server's index + */ +static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + uint16_t nsize; + + /* use the inode's NFS filehandle as the key */ + nsize = nfsi->fh.size; + memcpy(buffer, nfsi->fh.data, nsize); + return nsize; +} + +/* + * Get certain file attributes from the netfs data + * - This function can be absent for an index + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct nfs_inode *nfsi = cookie_netfs_data; + + *size = nfsi->vfs_inode.i_size; +} + +/* + * Get the auxiliary data from netfs data + * - This function can be absent if the index carries no state data + * - Should store the auxiliary data in the buffer + * - Should return the amount of data stored + * - Not permitted to return an error + * - The netfs data from the cookie being used as the source is presented + */ +static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct nfs_fscache_inode_auxdata auxdata; + const struct nfs_inode *nfsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->vfs_inode.i_version; + + if (bufmax > sizeof(auxdata)) + bufmax = sizeof(auxdata); + + memcpy(buffer, &auxdata, bufmax); + return bufmax; +} + +/* + * Consult the netfs about the state of an object + * - This function can be absent if the index carries
no state data + * - The netfs data from the cookie being used as the target is + * presented, as is the auxiliary data + */ +static +enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_inode *nfsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.size = nfsi->vfs_inode.i_size; + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = nfsi->vfs_inode.i_version; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Indication from FS-Cache that the cookie is no longer cached + * - This function is called when the backing store currently caching a cookie + * is removed + * - The netfs should use this to clean up any markers indicating cached pages + * - This is mandatory for any object that may have data + */ +static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct nfs_inode *nfsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); + + for (;;) { + /* grab a bunch of pages to unmark */ + nr_pages = pagevec_lookup(&pvec, + nfsi->vfs_inode.i_mapping, + first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +/* + * Get an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. 
+ * - The read context is passed back to NFS in the event that a data read on the + * cache fails with EIO - in which case the server must be contacted to + * retrieve the data, which requires the read context for security. + */ +static void nfs_fh_get_context(void *cookie_netfs_data, void *context) +{ + get_nfs_open_context(context); +} + +/* + * Release an extra reference on a read context. + * - This function can be absent if the completion function doesn't require a + * context. + */ +static void nfs_fh_put_context(void *cookie_netfs_data, void *context) +{ + if (context) + put_nfs_open_context(context); +} + +/* + * Define the inode object for FS-Cache. This is used to describe an inode + * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for + * an inode. + * + * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime + * held in the cache auxiliary data for the data storage object with those in + * the inode struct in memory. + */ +const struct fscache_cookie_def nfs_fscache_inode_object_def = { + .name = "NFS.fh", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = nfs_fscache_inode_get_key, + .get_attr = nfs_fscache_inode_get_attr, + .get_aux = nfs_fscache_inode_get_aux, + .check_aux = nfs_fscache_inode_check_aux, + .now_uncached = nfs_fscache_inode_now_uncached, + .get_context = nfs_fh_get_context, + .put_context = nfs_fh_put_context, +}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c new file mode 100644 index 00000000000..3ef01f0ba0b --- /dev/null +++ b/fs/nfs/fscache.c @@ -0,0 +1,439 @@ +/* NFS filesystem cache interface + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/in6.h> +#include <linux/seq_file.h> +#include <linux/slab.h> + +#include "internal.h" +#include "iostat.h" +#include "fscache.h" + +#define NFSDBG_FACILITY NFSDBG_FSCACHE + +static struct rb_root nfs_fscache_keys = RB_ROOT; +static DEFINE_SPINLOCK(nfs_fscache_keys_lock); + +/* + * Get the per-client index cookie for an NFS client if the appropriate mount + * flag was set + * - We always try and get an index cookie for the client, but get filehandle + * cookies on a per-superblock basis, depending on the mount flags + */ +void nfs_fscache_get_client_cookie(struct nfs_client *clp) +{ + /* create a cache index for looking up filehandles */ + clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, + &nfs_fscache_server_index_def, + clp, true); + dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", + clp, clp->fscache); +} + +/* + * Dispose of a per-client cookie + */ +void nfs_fscache_release_client_cookie(struct nfs_client *clp) +{ + dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", + clp, clp->fscache); + + fscache_relinquish_cookie(clp->fscache, 0); + clp->fscache = NULL; +} + +/* + * Get the cache cookie for an NFS superblock. We have to handle + * uniquification here because the cache doesn't do it for us. + * + * The default uniquifier is just an empty string, but it may be overridden + * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent + * superblock across an automount point of some nature. 
+ */ +void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) +{ + struct nfs_fscache_key *key, *xkey; + struct nfs_server *nfss = NFS_SB(sb); + struct rb_node **p, *parent; + int diff; + + if (!uniq) { + uniq = ""; + ulen = 1; + } + + key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + if (!key) + return; + + key->nfs_client = nfss->nfs_client; + key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; + key->key.nfs_server.flags = nfss->flags; + key->key.nfs_server.rsize = nfss->rsize; + key->key.nfs_server.wsize = nfss->wsize; + key->key.nfs_server.acregmin = nfss->acregmin; + key->key.nfs_server.acregmax = nfss->acregmax; + key->key.nfs_server.acdirmin = nfss->acdirmin; + key->key.nfs_server.acdirmax = nfss->acdirmax; + key->key.nfs_server.fsid = nfss->fsid; + key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; + + key->key.uniq_len = ulen; + memcpy(key->key.uniquifier, uniq, ulen); + + spin_lock(&nfs_fscache_keys_lock); + p = &nfs_fscache_keys.rb_node; + parent = NULL; + while (*p) { + parent = *p; + xkey = rb_entry(parent, struct nfs_fscache_key, node); + + if (key->nfs_client < xkey->nfs_client) + goto go_left; + if (key->nfs_client > xkey->nfs_client) + goto go_right; + + diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + + if (key->key.uniq_len == 0) + goto non_unique; + diff = memcmp(key->key.uniquifier, + xkey->key.uniquifier, + key->key.uniq_len); + if (diff < 0) + goto go_left; + if (diff > 0) + goto go_right; + goto non_unique; + + go_left: + p = &(*p)->rb_left; + continue; + go_right: + p = &(*p)->rb_right; + } + + rb_link_node(&key->node, parent, p); + rb_insert_color(&key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + nfss->fscache_key = key; + + /* create a cache index for looking up filehandles */ + nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, + &nfs_fscache_super_index_def, + nfss, true); + 
dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + return; + +non_unique: + spin_unlock(&nfs_fscache_keys_lock); + kfree(key); + nfss->fscache_key = NULL; + nfss->fscache = NULL; + printk(KERN_WARNING "NFS:" + " Cache request denied due to non-unique superblock keys\n"); +} + +/* + * release a per-superblock cookie + */ +void nfs_fscache_release_super_cookie(struct super_block *sb) +{ + struct nfs_server *nfss = NFS_SB(sb); + + dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", + nfss, nfss->fscache); + + fscache_relinquish_cookie(nfss->fscache, 0); + nfss->fscache = NULL; + + if (nfss->fscache_key) { + spin_lock(&nfs_fscache_keys_lock); + rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); + spin_unlock(&nfs_fscache_keys_lock); + kfree(nfss->fscache_key); + nfss->fscache_key = NULL; + } +} + +/* + * Initialise the per-inode cache cookie pointer for an NFS inode. + */ +void nfs_fscache_init_inode(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->fscache = NULL; + if (!S_ISREG(inode->i_mode)) + return; + nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, + &nfs_fscache_inode_object_def, + nfsi, false); +} + +/* + * Release a per-inode cookie. + */ +void nfs_fscache_clear_inode(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); + + dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); + + fscache_relinquish_cookie(cookie, false); + nfsi->fscache = NULL; +} + +static bool nfs_fscache_can_enable(void *data) +{ + struct inode *inode = data; + + return !inode_is_open_for_write(inode); +} + +/* + * Enable or disable caching for a file that is being opened as appropriate. + * The cookie is allocated when the inode is initialised, but is not enabled at + * that time. Enablement is deferred to file-open time to avoid stat() and + * access() thrashing the cache. 
+ * + * For now, with NFS, only regular files that are open read-only will be able + * to use the cache. + * + * We enable the cache for an inode if we open it read-only and it isn't + * currently open for writing. We disable the cache if the inode is open + * write-only. + * + * The caller uses the file struct to pin i_writecount on the inode before + * calling us when a file is opened for writing, so we can make use of that. + * + * Note that this may be invoked multiple times in parallel by parallel + * nfs_open() functions. + */ +void nfs_fscache_open_file(struct inode *inode, struct file *filp) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = nfs_i_fscache(inode); + + if (!fscache_cookie_valid(cookie)) + return; + + if (inode_is_open_for_write(inode)) { + dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); + clear_bit(NFS_INO_FSCACHE, &nfsi->flags); + fscache_disable_cookie(cookie, true); + fscache_uncache_all_inode_pages(cookie, inode); + } else { + dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); + fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); + if (fscache_cookie_enabled(cookie)) + set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + } +} +EXPORT_SYMBOL_GPL(nfs_fscache_open_file); + +/* + * Release the caching state associated with a page, if the page isn't busy + * interacting with the cache. + * - Returns true (can release page) or false (page busy). 
+ */ +int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host); + + BUG_ON(!cookie); + dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", + cookie, page, NFS_I(page->mapping->host)); + + if (!fscache_maybe_release_page(cookie, page, gfp)) + return 0; + + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } + + return 1; +} + +/* + * Release the caching state associated with a page if undergoing complete page + * invalidation. + */ +void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct fscache_cookie *cookie = nfs_i_fscache(inode); + + BUG_ON(!cookie); + + dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", + cookie, page, NFS_I(inode)); + + fscache_wait_on_page_write(cookie, page); + + BUG_ON(!PageLocked(page)); + fscache_uncache_page(cookie, page); + nfs_add_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED, 1); +} + +/* + * Handle completion of a page being read from the cache. + * - Called in process (keventd) context. 
+ */ +static void nfs_readpage_from_fscache_complete(struct page *page, + void *context, + int error) +{ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", + page, context, error); + + /* if the read completes with an error, we just unlock the page and let + * the VM reissue the readpage */ + if (!error) { + SetPageUptodate(page); + unlock_page(page); + } else { + error = nfs_readpage_async(context, page->mapping->host, page); + if (error) + unlock_page(page); + } +} + +/* + * Retrieve a page from fscache + */ +int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, struct page *page) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", + nfs_i_fscache(inode), page, page->index, page->flags, inode); + + ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), + page, + nfs_readpage_from_fscache_complete, + ctx, + GFP_KERNEL); + + switch (ret) { + case 0: /* read BIO submitted (page in fscache) */ + dfprintk(FSCACHE, + "NFS: readpage_from_fscache: BIO submitted\n"); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1); + return ret; + + case -ENOBUFS: /* inode not in cache */ + case -ENODATA: /* page not in cache */ + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + dfprintk(FSCACHE, + "NFS: readpage_from_fscache %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + } + return ret; +} + +/* + * Retrieve a set of pages from fscache + */ +int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + unsigned npages = *nr_pages; + int ret; + + dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", + nfs_i_fscache(inode), npages, inode); + + ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode), + mapping, 
pages, nr_pages, + nfs_readpage_from_fscache_complete, + ctx, + mapping_gfp_mask(mapping)); + if (*nr_pages < npages) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, + npages); + if (*nr_pages > 0) + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, + *nr_pages); + + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + BUG_ON(!list_empty(pages)); + BUG_ON(*nr_pages != 0); + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: submitted\n"); + + return ret; + + case -ENOBUFS: /* some pages aren't cached and can't be */ + case -ENODATA: /* some pages aren't cached */ + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); + return 1; + + default: + dfprintk(FSCACHE, + "NFS: nfs_getpages_from_fscache: ret %d\n", ret); + } + + return ret; +} + +/* + * Store a newly fetched page in fscache + * - PG_fscache must be set on the page + */ +void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +{ + int ret; + + dfprintk(FSCACHE, + "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", + nfs_i_fscache(inode), page, page->index, page->flags, sync); + + ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL); + dfprintk(FSCACHE, + "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", + page, page->index, page->flags, ret); + + if (ret != 0) { + fscache_uncache_page(nfs_i_fscache(inode), page); + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); + nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + } else { + nfs_add_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1); + } +} diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h new file mode 100644 index 00000000000..d7fe3e799f2 --- /dev/null +++ b/fs/nfs/fscache.h @@ -0,0 +1,229 @@ +/* NFS filesystem cache interface definitions + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _NFS_FSCACHE_H +#define _NFS_FSCACHE_H + +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/fscache.h> + +#ifdef CONFIG_NFS_FSCACHE + +/* + * set of NFS FS-Cache objects that form a superblock key + */ +struct nfs_fscache_key { + struct rb_node node; + struct nfs_client *nfs_client; /* the server */ + + /* the elements of the unique key - as used by nfs_compare_super() and + * nfs_compare_mount_options() to distinguish superblocks */ + struct { + struct { + unsigned long s_flags; /* various flags + * (& NFS_MS_MASK) */ + } super; + + struct { + struct nfs_fsid fsid; + int flags; + unsigned int rsize; /* read size */ + unsigned int wsize; /* write size */ + unsigned int acregmin; /* attr cache timeouts */ + unsigned int acregmax; + unsigned int acdirmin; + unsigned int acdirmax; + } nfs_server; + + struct { + rpc_authflavor_t au_flavor; + } rpc_auth; + + /* uniquifier - can be used if nfs_server.flags includes + * NFS_MOUNT_UNSHARED */ + u8 uniq_len; + char uniquifier[0]; + } key; +}; + +/* + * fscache-index.c + */ +extern struct fscache_netfs nfs_fscache_netfs; +extern const struct fscache_cookie_def nfs_fscache_server_index_def; +extern const struct fscache_cookie_def nfs_fscache_super_index_def; +extern const struct fscache_cookie_def nfs_fscache_inode_object_def; + +extern int nfs_fscache_register(void); +extern void nfs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void nfs_fscache_get_client_cookie(struct nfs_client *); +extern void nfs_fscache_release_client_cookie(struct nfs_client *); + +extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); 
+extern void nfs_fscache_release_super_cookie(struct super_block *); + +extern void nfs_fscache_init_inode(struct inode *); +extern void nfs_fscache_clear_inode(struct inode *); +extern void nfs_fscache_open_file(struct inode *, struct file *); + +extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); +extern int nfs_fscache_release_page(struct page *, gfp_t); + +extern int __nfs_readpage_from_fscache(struct nfs_open_context *, + struct inode *, struct page *); +extern int __nfs_readpages_from_fscache(struct nfs_open_context *, + struct inode *, struct address_space *, + struct list_head *, unsigned *); +extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); + +/* + * wait for a page to complete writing to the cache + */ +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) +{ + if (PageFsCache(page)) + fscache_wait_on_page_write(nfsi->fscache, page); +} + +/* + * release the caching state associated with a page if undergoing complete page + * invalidation + */ +static inline void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __nfs_fscache_invalidate_page(page, inode); +} + +/* + * Retrieve a page from an inode data storage object. + */ +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpage_from_fscache(ctx, inode, page); + return -ENOBUFS; +} + +/* + * Retrieve a set of pages from an inode data storage object. 
+ */ +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (NFS_I(inode)->fscache) + return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +/* + * Store a page newly fetched from the server in an inode data storage object + * in the cache. + */ +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, + int sync) +{ + if (PageFsCache(page)) + __nfs_readpage_to_fscache(inode, page, sync); +} + +/* + * Invalidate the contents of fscache for this inode. This will not sleep. + */ +static inline void nfs_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(NFS_I(inode)->fscache); +} + +/* + * Wait for an object to finish being invalidated. + */ +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) +{ + fscache_wait_on_invalidate(NFS_I(inode)->fscache); +} + +/* + * indicate the client caching state as readable text + */ +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) + return "yes"; + return "no "; +} + +#else /* CONFIG_NFS_FSCACHE */ +static inline int nfs_fscache_register(void) { return 0; } +static inline void nfs_fscache_unregister(void) {} + +static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} +static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} + +static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} + +static inline void nfs_fscache_init_inode(struct inode *inode) {} +static inline void nfs_fscache_clear_inode(struct inode *inode) {} +static inline void nfs_fscache_open_file(struct inode *inode, + struct file *filp) {} + +static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* True: may release page */ +} +static inline 
void nfs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, + struct page *page) {} + +static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct page *page) +{ + return -ENOBUFS; +} +static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, + struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} +static inline void nfs_readpage_to_fscache(struct inode *inode, + struct page *page, int sync) {} + + +static inline void nfs_fscache_invalidate(struct inode *inode) {} +static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {} + +static inline const char *nfs_server_fscache_state(struct nfs_server *server) +{ + return "no "; +} + +#endif /* CONFIG_NFS_FSCACHE */ +#endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index fae97196daa..b94f80420a5 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -23,21 +23,15 @@ #include <linux/sunrpc/stats.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> -#include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/namei.h> -#include <linux/mnt_namespace.h> #include <linux/security.h> -#include <asm/system.h> #include <asm/uaccess.h> -#include "nfs4_fs.h" -#include "delegation.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -50,13 +44,10 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i { /* The mntroot acts as the dummy root dentry for this superblock */ if (sb->s_root == NULL) { - sb->s_root = d_alloc_root(inode); - if (sb->s_root == NULL) { - iput(inode); + sb->s_root = d_make_root(inode); + if (sb->s_root == NULL) return -ENOMEM; - } - /* Circumvent igrab(): we know the inode is not 
being freed */ - atomic_inc(&inode->i_count); + ihold(inode); /* * Ensure that this dentry is invisible to d_find_alias(). * Otherwise, it may be spliced into the tree by @@ -65,9 +56,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. */ - spin_lock(&dcache_lock); - list_del_init(&sb->s_root->d_alias); - spin_unlock(&dcache_lock); + spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&sb->s_root->d_lock); + hlist_del_init(&sb->s_root->d_alias); + spin_unlock(&sb->s_root->d_lock); + spin_unlock(&sb->s_root->d_inode->i_lock); } return 0; } @@ -75,222 +68,66 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i /* * get an NFS2/NFS3 root dentry from the root filehandle */ -struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) +struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, + const char *devname) { struct nfs_server *server = NFS_SB(sb); struct nfs_fsinfo fsinfo; - struct nfs_fattr fattr; - struct dentry *mntroot; + struct dentry *ret; struct inode *inode; + void *name = kstrdup(devname, GFP_KERNEL); int error; - /* get the actual root for this mount */ - fsinfo.fattr = &fattr; - - error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); - if (error < 0) { - dprintk("nfs_get_root: getattr error = %d\n", -error); - return ERR_PTR(error); - } - - inode = nfs_fhget(sb, mntfh, fsinfo.fattr); - if (IS_ERR(inode)) { - dprintk("nfs_get_root: get root inode failed\n"); - return ERR_CAST(inode); - } - - error = nfs_superblock_set_dummy_root(sb, inode); - if (error != 0) - return ERR_PTR(error); - - /* root dentries normally start off anonymous and get spliced in later - * if the dentry tree reaches them; however if the dentry already - * exists, we'll pick it up at this point and use it as the root - */ - mntroot = d_alloc_anon(inode); - if 
(!mntroot) { - iput(inode); - dprintk("nfs_get_root: get root dentry failed\n"); + if (!name) return ERR_PTR(-ENOMEM); - } - - security_d_instantiate(mntroot, inode); - - if (!mntroot->d_op) - mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; - - return mntroot; -} - -#ifdef CONFIG_NFS_V4 - -/* - * Do a simple pathwalk from the root FH of the server to the nominated target - * of the mountpoint - * - give error on symlinks - * - give error on ".." occurring in the path - * - follow traversals - */ -int nfs4_path_walk(struct nfs_server *server, - struct nfs_fh *mntfh, - const char *path) -{ - struct nfs_fsinfo fsinfo; - struct nfs_fattr fattr; - struct nfs_fh lastfh; - struct qstr name; - int ret; - - dprintk("--> nfs4_path_walk(,,%s)\n", path); - - fsinfo.fattr = &fattr; - nfs_fattr_init(&fattr); - - /* Eat leading slashes */ - while (*path == '/') - path++; - - /* Start by getting the root filehandle from the server */ - ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); - if (ret < 0) { - dprintk("nfs4_get_root: getroot error = %d\n", -ret); - return ret; - } - - if (fattr.type != NFDIR) { - printk(KERN_ERR "nfs4_get_root:" - " getroot encountered non-directory\n"); - return -ENOTDIR; - } - - /* FIXME: It is quite valid for the server to return a referral here */ - if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { - printk(KERN_ERR "nfs4_get_root:" - " getroot obtained referral\n"); - return -EREMOTE; - } - -next_component: - dprintk("Next: %s\n", path); - /* extract the next bit of the path */ - if (!*path) - goto path_walk_complete; - - name.name = path; - while (*path && *path != '/') - path++; - name.len = path - (const char *) name.name; - - if (name.len > NFS4_MAXNAMLEN) - return -ENAMETOOLONG; - -eat_dot_dir: - while (*path == '/') - path++; - - if (path[0] == '.' && (path[1] == '/' || !path[1])) { - path += 2; - goto eat_dot_dir; - } - - /* FIXME: Why shouldn't the user be able to use ".." in the path? */ - if (path[0] == '.' 
&& path[1] == '.' && (path[2] == '/' || !path[2]) - ) { - printk(KERN_ERR "nfs4_get_root:" - " Mount path contains reference to \"..\"\n"); - return -EINVAL; - } - - /* lookup the next FH in the sequence */ - memcpy(&lastfh, mntfh, sizeof(lastfh)); - - dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); - - ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name, - mntfh, &fattr); - if (ret < 0) { - dprintk("nfs4_get_root: getroot error = %d\n", -ret); - return ret; - } - - if (fattr.type != NFDIR) { - printk(KERN_ERR "nfs4_get_root:" - " lookupfh encountered non-directory\n"); - return -ENOTDIR; - } - - /* FIXME: Referrals are quite valid here too */ - if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { - printk(KERN_ERR "nfs4_get_root:" - " lookupfh obtained referral\n"); - return -EREMOTE; - } - - goto next_component; - -path_walk_complete: - memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); - dprintk("<-- nfs4_path_walk() = 0\n"); - return 0; -} - -/* - * get an NFS4 root dentry from the root filehandle - */ -struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) -{ - struct nfs_server *server = NFS_SB(sb); - struct nfs_fattr fattr; - struct dentry *mntroot; - struct inode *inode; - int error; - - dprintk("--> nfs4_get_root()\n"); - - /* get the info about the server and filesystem */ - error = nfs4_server_capabilities(server, mntfh); - if (error < 0) { - dprintk("nfs_get_root: getcaps error = %d\n", - -error); - return ERR_PTR(error); + /* get the actual root for this mount */ + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) { + kfree(name); + return ERR_PTR(-ENOMEM); } - /* get the actual root for this mount */ - error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); if (error < 0) { dprintk("nfs_get_root: getattr error = %d\n", -error); - return ERR_PTR(error); + ret = ERR_PTR(error); + goto out; } - 
inode = nfs_fhget(sb, mntfh, &fattr); + inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL); if (IS_ERR(inode)) { dprintk("nfs_get_root: get root inode failed\n"); - return ERR_CAST(inode); + ret = ERR_CAST(inode); + goto out; } error = nfs_superblock_set_dummy_root(sb, inode); - if (error != 0) - return ERR_PTR(error); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - mntroot = d_alloc_anon(inode); - if (!mntroot) { - iput(inode); + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { dprintk("nfs_get_root: get root dentry failed\n"); - return ERR_PTR(-ENOMEM); + goto out; } - security_d_instantiate(mntroot, inode); - - if (!mntroot->d_op) - mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; - - dprintk("<-- nfs4_get_root()\n"); - return mntroot; + security_d_instantiate(ret, inode); + spin_lock(&ret->d_lock); + if (IS_ROOT(ret) && !ret->d_fsdata && + !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + ret->d_fsdata = name; + name = NULL; + } + spin_unlock(&ret->d_lock); +out: + kfree(name); + nfs_free_fattr(fsinfo.fattr); + return ret; } - -#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 86147b0ab2c..567983d2c0e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -33,486 +33,755 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ - -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/init.h> #include <linux/types.h> -#include <linux/slab.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/sched.h> - -#include <linux/sunrpc/clnt.h> -#include <linux/workqueue.h> +#include <linux/parser.h> +#include <linux/fs.h> +#include <linux/nfs_idmap.h> +#include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> - #include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/key.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> +#include <keys/user-type.h> +#include <linux/module.h> -#include <linux/nfs_idmap.h> -#include "nfs4_fs.h" +#include "internal.h" +#include "netns.h" +#include "nfs4trace.h" -#define IDMAP_HASH_SZ 128 +#define NFS_UINT_MAXLEN 11 -/* Default cache timeout is 10 minutes */ -unsigned int nfs_idmap_cache_timeout = 600 * HZ; +static const struct cred *id_resolver_cache; +static struct key_type key_type_id_resolver_legacy; -static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) +struct idmap_legacy_upcalldata { + struct rpc_pipe_msg pipe_msg; + struct idmap_msg idmap_msg; + struct key_construction *key_cons; + struct idmap *idmap; +}; + +struct idmap { + struct rpc_pipe_dir_object idmap_pdo; + struct rpc_pipe *idmap_pipe; + struct idmap_legacy_upcalldata *idmap_upcall_data; + struct mutex idmap_mutex; +}; + +/** + * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields + * @fattr: fully initialised struct nfs_fattr + * @owner_name: owner name string cache + * @group_name: group name string cache + */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name) { - char *endp; - int num = simple_strtol(val, &endp, 0); - int jif = num * HZ; - if (endp == val || *endp || num < 0 || jif < num) - return -EINVAL; - *((int *)kp->arg) = jif; - return 0; + fattr->owner_name = owner_name; + fattr->group_name = group_name; } 
-module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, - &nfs_idmap_cache_timeout, 0644); +static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; + kfree(fattr->owner_name->data); +} -struct idmap_hashent { - unsigned long ih_expires; - __u32 ih_id; - size_t ih_namelen; - char ih_name[IDMAP_NAMESZ]; -}; +static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; + kfree(fattr->group_name->data); +} -struct idmap_hashtable { - __u8 h_type; - struct idmap_hashent h_entries[IDMAP_HASH_SZ]; -}; +static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *owner = fattr->owner_name; + kuid_t uid; + + if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) + return false; + if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { + fattr->uid = uid; + fattr->valid |= NFS_ATTR_FATTR_OWNER; + } + return true; +} -struct idmap { - struct dentry *idmap_dentry; - wait_queue_head_t idmap_wq; - struct idmap_msg idmap_im; - struct mutex idmap_lock; /* Serializes upcalls */ - struct mutex idmap_im_lock; /* Protects the hashtable */ - struct idmap_hashtable idmap_user_hash; - struct idmap_hashtable idmap_group_hash; -}; +static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *group = fattr->group_name; + kgid_t gid; + + if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) + return false; + if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { + fattr->gid = gid; + fattr->valid |= NFS_ATTR_FATTR_GROUP; + } + return true; +} -static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, - char __user *, size_t); -static ssize_t idmap_pipe_downcall(struct file *, const char __user *, - size_t); -static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); +/** + * nfs_fattr_free_names - free up the NFSv4 owner and 
group strings + * @fattr: a fully initialised nfs_fattr structure + */ +void nfs_fattr_free_names(struct nfs_fattr *fattr) +{ + if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) + nfs_fattr_free_owner_name(fattr); + if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) + nfs_fattr_free_group_name(fattr); +} -static unsigned int fnvhash32(const void *, size_t); +/** + * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free + * @server: pointer to the filesystem nfs_server structure + * @fattr: a fully initialised nfs_fattr structure + * + * This helper maps the cached NFSv4 owner/group strings in fattr into + * their numeric uid/gid equivalents, and then frees the cached strings. + */ +void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) +{ + if (nfs_fattr_map_owner_name(server, fattr)) + nfs_fattr_free_owner_name(fattr); + if (nfs_fattr_map_group_name(server, fattr)) + nfs_fattr_free_group_name(fattr); +} -static struct rpc_pipe_ops idmap_upcall_ops = { - .upcall = idmap_pipe_upcall, - .downcall = idmap_pipe_downcall, - .destroy_msg = idmap_pipe_destroy_msg, +static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (kstrtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} + +static struct key_type key_type_id_resolver = { + .name = "id_resolver", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, }; -int -nfs_idmap_new(struct nfs_client *clp) +static int nfs_idmap_init_keyring(void) { - struct idmap *idmap; - int error; + struct cred *cred; + struct key *keyring; + int ret = 0; 
- BUG_ON(clp->cl_idmap != NULL); + printk(KERN_NOTICE "NFS: Registering the %s key type\n", + key_type_id_resolver.name); - idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); - if (idmap == NULL) + cred = prepare_kernel_cred(NULL); + if (!cred) return -ENOMEM; - idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", - idmap, &idmap_upcall_ops, 0); - if (IS_ERR(idmap->idmap_dentry)) { - error = PTR_ERR(idmap->idmap_dentry); - kfree(idmap); - return error; + keyring = keyring_alloc(".id_resolver", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; } - mutex_init(&idmap->idmap_lock); - mutex_init(&idmap->idmap_im_lock); - init_waitqueue_head(&idmap->idmap_wq); - idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; - idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; - clp->cl_idmap = idmap; + ret = register_key_type(&key_type_id_resolver_legacy); + if (ret < 0) + goto failed_reg_legacy; + + set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; return 0; + +failed_reg_legacy: + unregister_key_type(&key_type_id_resolver); +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; } -void -nfs_idmap_delete(struct nfs_client *clp) +static void nfs_idmap_quit_keyring(void) { - struct idmap *idmap = clp->cl_idmap; - - if (!idmap) - return; - rpc_unlink(idmap->idmap_dentry); - clp->cl_idmap = NULL; - kfree(idmap); + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + unregister_key_type(&key_type_id_resolver_legacy); + put_cred(id_resolver_cache); } /* - * Helper routines for manipulating the hashtable + * Assemble the description to 
pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. */ -static inline struct idmap_hashent * -idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len) +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) { - return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ]; + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!*desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; } -static struct idmap_hashent * -idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) +static struct key *nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, struct idmap *idmap) { - struct idmap_hashent *he = idmap_name_hash(h, name, len); + char *desc; + struct key *rkey; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + return ERR_PTR(ret); + + rkey = request_key(&key_type_id_resolver, desc, ""); + if (IS_ERR(rkey)) { + mutex_lock(&idmap->idmap_mutex); + rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, + desc, "", 0, idmap); + mutex_unlock(&idmap->idmap_mutex); + } - if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) - return NULL; - if (time_after(jiffies, he->ih_expires)) - return NULL; - return he; + kfree(desc); + return rkey; } -static inline struct idmap_hashent * -idmap_id_hash(struct idmap_hashtable* h, __u32 id) +static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, + const char *type, void *data, + size_t data_size, struct idmap *idmap) { - return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; + const struct 
cred *saved_cred; + struct key *rkey; + struct user_key_payload *payload; + ssize_t ret; + + saved_cred = override_creds(id_resolver_cache); + rkey = nfs_idmap_request_key(name, namelen, type, idmap); + revert_creds(saved_cred); + + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.rcudata); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; } -static struct idmap_hashent * -idmap_lookup_id(struct idmap_hashtable *h, __u32 id) +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, + size_t buflen, struct idmap *idmap) { - struct idmap_hashent *he = idmap_id_hash(h, id); - if (he->ih_id != id || he->ih_namelen == 0) - return NULL; - if (time_after(jiffies, he->ih_expires)) - return NULL; - return he; + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); + if (ret < 0) + return -EINVAL; + return ret; } -/* - * Routines for allocating new entries in the hashtable. - * For now, we just have 1 entry per bucket, so it's all - * pretty trivial. 
- */ -static inline struct idmap_hashent * -idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len) +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, + __u32 *id, struct idmap *idmap) { - return idmap_name_hash(h, name, len); + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = kstrtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; } -static inline struct idmap_hashent * -idmap_alloc_id(struct idmap_hashtable *h, __u32 id) +/* idmap classic begins here */ + +enum { + Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err +}; + +static const match_table_t nfs_idmap_tokens = { + { Opt_find_uid, "uid:%s" }, + { Opt_find_gid, "gid:%s" }, + { Opt_find_user, "user:%s" }, + { Opt_find_group, "group:%s" }, + { Opt_find_err, NULL } +}; + +static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); +static ssize_t idmap_pipe_downcall(struct file *, const char __user *, + size_t); +static void idmap_release_pipe(struct inode *); +static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); + +static const struct rpc_pipe_ops idmap_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = idmap_pipe_downcall, + .release_pipe = idmap_release_pipe, + .destroy_msg = idmap_pipe_destroy_msg, +}; + +static struct key_type key_type_id_resolver_legacy = { + .name = "id_legacy", + .instantiate = user_instantiate, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, + .request_key = nfs_idmap_legacy_upcall, +}; + +static void nfs_idmap_pipe_destroy(struct dentry *dir, + struct rpc_pipe_dir_object *pdo) { - return idmap_id_hash(h, id); + struct idmap *idmap = pdo->pdo_data; + struct rpc_pipe *pipe = 
idmap->idmap_pipe; + + if (pipe->dentry) { + rpc_unlink(pipe->dentry); + pipe->dentry = NULL; + } } -static void -idmap_update_entry(struct idmap_hashent *he, const char *name, - size_t namelen, __u32 id) +static int nfs_idmap_pipe_create(struct dentry *dir, + struct rpc_pipe_dir_object *pdo) { - he->ih_id = id; - memcpy(he->ih_name, name, namelen); - he->ih_name[namelen] = '\0'; - he->ih_namelen = namelen; - he->ih_expires = jiffies + nfs_idmap_cache_timeout; + struct idmap *idmap = pdo->pdo_data; + struct rpc_pipe *pipe = idmap->idmap_pipe; + struct dentry *dentry; + + dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + pipe->dentry = dentry; + return 0; } -/* - * Name -> ID - */ -static int -nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h, - const char *name, size_t namelen, __u32 *id) +static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { + .create = nfs_idmap_pipe_create, + .destroy = nfs_idmap_pipe_destroy, +}; + +int +nfs_idmap_new(struct nfs_client *clp) { - struct rpc_pipe_msg msg; - struct idmap_msg *im; - struct idmap_hashent *he; - DECLARE_WAITQUEUE(wq, current); - int ret = -EIO; + struct idmap *idmap; + struct rpc_pipe *pipe; + int error; - im = &idmap->idmap_im; + idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); + if (idmap == NULL) + return -ENOMEM; - /* - * String sanity checks - * Note that the userland daemon expects NUL terminated strings - */ - for (;;) { - if (namelen == 0) - return -EINVAL; - if (name[namelen-1] != '\0') - break; - namelen--; + rpc_init_pipe_dir_object(&idmap->idmap_pdo, + &nfs_idmap_pipe_dir_object_ops, + idmap); + + pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); + if (IS_ERR(pipe)) { + error = PTR_ERR(pipe); + goto err; } - if (namelen >= IDMAP_NAMESZ) - return -EINVAL; + idmap->idmap_pipe = pipe; + mutex_init(&idmap->idmap_mutex); - mutex_lock(&idmap->idmap_lock); - mutex_lock(&idmap->idmap_im_lock); + error = 
rpc_add_pipe_dir_object(clp->cl_net, + &clp->cl_rpcclient->cl_pipedir_objects, + &idmap->idmap_pdo); + if (error) + goto err_destroy_pipe; - he = idmap_lookup_name(h, name, namelen); - if (he != NULL) { - *id = he->ih_id; - ret = 0; + clp->cl_idmap = idmap; + return 0; +err_destroy_pipe: + rpc_destroy_pipe_data(idmap->idmap_pipe); +err: + kfree(idmap); + return error; +} + +void +nfs_idmap_delete(struct nfs_client *clp) +{ + struct idmap *idmap = clp->cl_idmap; + + if (!idmap) + return; + clp->cl_idmap = NULL; + rpc_remove_pipe_dir_object(clp->cl_net, + &clp->cl_rpcclient->cl_pipedir_objects, + &idmap->idmap_pdo); + rpc_destroy_pipe_data(idmap->idmap_pipe); + kfree(idmap); +} + +int nfs_idmap_init(void) +{ + int ret; + ret = nfs_idmap_init_keyring(); + if (ret != 0) goto out; - } +out: + return ret; +} - memset(im, 0, sizeof(*im)); - memcpy(im->im_name, name, namelen); +void nfs_idmap_quit(void) +{ + nfs_idmap_quit_keyring(); +} - im->im_type = h->h_type; - im->im_conv = IDMAP_CONV_NAMETOID; +static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, + struct idmap_msg *im, + struct rpc_pipe_msg *msg) +{ + substring_t substr; + int token, ret; + + im->im_type = IDMAP_TYPE_GROUP; + token = match_token(desc, nfs_idmap_tokens, &substr); + + switch (token) { + case Opt_find_uid: + im->im_type = IDMAP_TYPE_USER; + case Opt_find_gid: + im->im_conv = IDMAP_CONV_NAMETOID; + ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); + break; - memset(&msg, 0, sizeof(msg)); - msg.data = im; - msg.len = sizeof(*im); + case Opt_find_user: + im->im_type = IDMAP_TYPE_USER; + case Opt_find_group: + im->im_conv = IDMAP_CONV_IDTONAME; + ret = match_int(&substr, &im->im_id); + break; - add_wait_queue(&idmap->idmap_wq, &wq); - if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { - remove_wait_queue(&idmap->idmap_wq, &wq); + default: + ret = -EINVAL; goto out; } - set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&idmap->idmap_im_lock); - schedule(); - 
__set_current_state(TASK_RUNNING); - remove_wait_queue(&idmap->idmap_wq, &wq); - mutex_lock(&idmap->idmap_im_lock); + msg->data = im; + msg->len = sizeof(struct idmap_msg); - if (im->im_status & IDMAP_STATUS_SUCCESS) { - *id = im->im_id; - ret = 0; - } - - out: - memset(im, 0, sizeof(*im)); - mutex_unlock(&idmap->idmap_im_lock); - mutex_unlock(&idmap->idmap_lock); +out: return ret; } -/* - * ID -> Name - */ -static int -nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, - __u32 id, char *name) +static bool +nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, + struct idmap_legacy_upcalldata *data) { - struct rpc_pipe_msg msg; - struct idmap_msg *im; - struct idmap_hashent *he; - DECLARE_WAITQUEUE(wq, current); - int ret = -EIO; - unsigned int len; + if (idmap->idmap_upcall_data != NULL) { + WARN_ON_ONCE(1); + return false; + } + idmap->idmap_upcall_data = data; + return true; +} + +static void +nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) +{ + struct key_construction *cons = idmap->idmap_upcall_data->key_cons; - im = &idmap->idmap_im; + kfree(idmap->idmap_upcall_data); + idmap->idmap_upcall_data = NULL; + complete_request_key(cons, ret); +} - mutex_lock(&idmap->idmap_lock); - mutex_lock(&idmap->idmap_im_lock); +static void +nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) +{ + if (idmap->idmap_upcall_data != NULL) + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +} - he = idmap_lookup_id(h, id); - if (he) { - memcpy(name, he->ih_name, he->ih_namelen); - ret = he->ih_namelen; - goto out; - } +static int nfs_idmap_legacy_upcall(struct key_construction *cons, + const char *op, + void *aux) +{ + struct idmap_legacy_upcalldata *data; + struct rpc_pipe_msg *msg; + struct idmap_msg *im; + struct idmap *idmap = (struct idmap *)aux; + struct key *key = cons->key; + int ret = -ENOMEM; - memset(im, 0, sizeof(*im)); - im->im_type = h->h_type; - im->im_conv = IDMAP_CONV_IDTONAME; - im->im_id = id; + /* msg and im are freed in 
idmap_pipe_destroy_msg */ + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out1; - memset(&msg, 0, sizeof(msg)); - msg.data = im; - msg.len = sizeof(*im); + msg = &data->pipe_msg; + im = &data->idmap_msg; + data->idmap = idmap; + data->key_cons = cons; - add_wait_queue(&idmap->idmap_wq, &wq); + ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); + if (ret < 0) + goto out2; - if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { - remove_wait_queue(&idmap->idmap_wq, &wq); - goto out; - } + ret = -EAGAIN; + if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) + goto out2; - set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&idmap->idmap_im_lock); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&idmap->idmap_wq, &wq); - mutex_lock(&idmap->idmap_im_lock); - - if (im->im_status & IDMAP_STATUS_SUCCESS) { - if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) - goto out; - memcpy(name, im->im_name, len); - ret = len; - } + ret = rpc_queue_upcall(idmap->idmap_pipe, msg); + if (ret < 0) + nfs_idmap_abort_pipe_upcall(idmap, ret); - out: - memset(im, 0, sizeof(*im)); - mutex_unlock(&idmap->idmap_im_lock); - mutex_unlock(&idmap->idmap_lock); + return ret; +out2: + kfree(data); +out1: + complete_request_key(cons, ret); return ret; } -/* RPC pipefs upcall/downcall routines */ -static ssize_t -idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, - char __user *dst, size_t buflen) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { - char *data = (char *)msg->data + msg->copied; - size_t mlen = min(msg->len, buflen); - unsigned long left; + return key_instantiate_and_link(key, data, datalen, + id_resolver_cache->thread_keyring, + authkey); +} - left = copy_to_user(dst, data, mlen); - if (left == mlen) { - msg->errno = -EFAULT; - return -EFAULT; - } +static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, + struct idmap_msg *upcall, + struct key 
*key, struct key *authkey) +{ + char id_str[NFS_UINT_MAXLEN]; + size_t len; + int ret = -ENOKEY; - mlen -= left; - msg->copied += mlen; - msg->errno = 0; - return mlen; + /* ret = -ENOKEY */ + if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) + goto out; + switch (im->im_conv) { + case IDMAP_CONV_NAMETOID: + if (strcmp(upcall->im_name, im->im_name) != 0) + break; + /* Note: here we store the NUL terminator too */ + len = sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); + break; + case IDMAP_CONV_IDTONAME: + if (upcall->im_id != im->im_id) + break; + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); + break; + default: + ret = -EINVAL; + } +out: + return ret; } static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { - struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode); + struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; - struct idmap_msg im_in, *im = &idmap->idmap_im; - struct idmap_hashtable *h; - struct idmap_hashent *he = NULL; + struct key_construction *cons; + struct idmap_msg im; size_t namelen_in; - int ret; - - if (mlen != sizeof(im_in)) - return -ENOSPC; + int ret = -ENOKEY; - if (copy_from_user(&im_in, src, mlen) != 0) - return -EFAULT; + /* If instantiation is successful, anyone waiting for key construction + * will have been woken up and someone else may now have used + * idmap_key_cons - so after this point we may no longer touch it. 
+ */ + if (idmap->idmap_upcall_data == NULL) + goto out_noupcall; - mutex_lock(&idmap->idmap_im_lock); + cons = idmap->idmap_upcall_data->key_cons; - ret = mlen; - im->im_status = im_in.im_status; - /* If we got an error, terminate now, and wake up pending upcalls */ - if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) { - wake_up(&idmap->idmap_wq); + if (mlen != sizeof(im)) { + ret = -ENOSPC; goto out; } - /* Sanity checking of strings */ - ret = -EINVAL; - namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ); - if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) + if (copy_from_user(&im, src, mlen) != 0) { + ret = -EFAULT; goto out; + } - switch (im_in.im_type) { - case IDMAP_TYPE_USER: - h = &idmap->idmap_user_hash; - break; - case IDMAP_TYPE_GROUP: - h = &idmap->idmap_group_hash; - break; - default: - goto out; + if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { + ret = -ENOKEY; + goto out; } - switch (im_in.im_conv) { - case IDMAP_CONV_IDTONAME: - /* Did we match the current upcall? */ - if (im->im_conv == IDMAP_CONV_IDTONAME - && im->im_type == im_in.im_type - && im->im_id == im_in.im_id) { - /* Yes: copy string, including the terminating '\0' */ - memcpy(im->im_name, im_in.im_name, namelen_in); - im->im_name[namelen_in] = '\0'; - wake_up(&idmap->idmap_wq); - } - he = idmap_alloc_id(h, im_in.im_id); - break; - case IDMAP_CONV_NAMETOID: - /* Did we match the current upcall? 
*/ - if (im->im_conv == IDMAP_CONV_NAMETOID - && im->im_type == im_in.im_type - && strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in - && memcmp(im->im_name, im_in.im_name, namelen_in) == 0) { - im->im_id = im_in.im_id; - wake_up(&idmap->idmap_wq); - } - he = idmap_alloc_name(h, im_in.im_name, namelen_in); - break; - default: + namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); + if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { + ret = -EINVAL; goto out; +} + + ret = nfs_idmap_read_and_verify_message(&im, + &idmap->idmap_upcall_data->idmap_msg, + cons->key, cons->authkey); + if (ret >= 0) { + key_set_timeout(cons->key, nfs_idmap_cache_timeout); + ret = mlen; } - /* If the entry is valid, also copy it to the cache */ - if (he != NULL) - idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); - ret = mlen; out: - mutex_unlock(&idmap->idmap_im_lock); + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +out_noupcall: return ret; } static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) { - struct idmap_msg *im = msg->data; - struct idmap *idmap = container_of(im, struct idmap, idmap_im); + struct idmap_legacy_upcalldata *data = container_of(msg, + struct idmap_legacy_upcalldata, + pipe_msg); + struct idmap *idmap = data->idmap; - if (msg->errno >= 0) - return; - mutex_lock(&idmap->idmap_im_lock); - im->im_status = IDMAP_STATUS_LOOKUPFAIL; - wake_up(&idmap->idmap_wq); - mutex_unlock(&idmap->idmap_im_lock); + if (msg->errno) + nfs_idmap_abort_pipe_upcall(idmap, msg->errno); } -/* - * Fowler/Noll/Vo hash - * http://www.isthe.com/chongo/tech/comp/fnv/ - */ - -#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */ -#define FNV_1_32 ((unsigned int)0x811c9dc5) /* 2166136261 */ - -static unsigned int fnvhash32(const void *buf, size_t buflen) +static void +idmap_release_pipe(struct inode *inode) { - const unsigned char *p, *end = (const unsigned char *)buf + buflen; - unsigned int hash = FNV_1_32; - - for (p = buf; p < end; p++) { - hash *= FNV_P_32; - hash ^= 
(unsigned int)*p; - } + struct rpc_inode *rpci = RPC_I(inode); + struct idmap *idmap = (struct idmap *)rpci->private; - return hash; + nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) { - struct idmap *idmap = clp->cl_idmap; - - return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); + struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; + + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); + if (ret == 0) { + *uid = make_kuid(&init_user_ns, id); + if (!uid_valid(*uid)) + ret = -ERANGE; + } + trace_nfs4_map_name_to_uid(name, namelen, id, ret); + return ret; } -int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) { - struct idmap *idmap = clp->cl_idmap; - - return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); + struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; + + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); + if (ret == 0) { + *gid = make_kgid(&init_user_ns, id); + if (!gid_valid(*gid)) + ret = -ERANGE; + } + trace_nfs4_map_group_to_gid(name, namelen, id, ret); + return ret; } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) +int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) { - struct idmap *idmap = clp->cl_idmap; - - return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + __u32 id; + + id = from_kuid(&init_user_ns, uid); + if (!(server->caps 
& NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); + if (ret < 0) + ret = nfs_map_numeric_to_string(id, buf, buflen); + trace_nfs4_map_uid_to_name(buf, ret, id, ret); + return ret; } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) +int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) { - struct idmap *idmap = clp->cl_idmap; - - return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + __u32 id; + + id = from_kgid(&init_user_ns, gid); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); + if (ret < 0) + ret = nfs_map_numeric_to_string(id, buf, buflen); + trace_nfs4_map_gid_to_group(buf, ret, id, ret); + return ret; } - diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a4c7cf2bff3..9927913c97c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -5,7 +5,7 @@ * * nfs inode and superblock handling functions * - * Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some + * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some * experimental NFS changes. Modularisation taken straight from SYS5 fs. * * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. 
@@ -30,15 +30,15 @@ #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> -#include <linux/smp_lock.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/nfs_xdr.h> +#include <linux/slab.h> +#include <linux/compat.h> +#include <linux/freezer.h> -#include <asm/system.h> #include <asm/uaccess.h> #include "nfs4_fs.h" @@ -46,19 +46,23 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" +#include "pnfs.h" +#include "nfs.h" +#include "netns.h" + +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_VFS #define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 /* Default is to see 64-bit inode numbers */ -static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; +static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); -static void nfs_zap_acl_cache(struct inode *); - static struct kmem_cache * nfs_inode_cachep; static inline unsigned long @@ -68,6 +72,19 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) } /** + * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +int nfs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + freezable_schedule_unsafe(); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); + +/** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid * @@ -76,7 +93,11 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) */ u64 nfs_compat_user_ino64(u64 fileid) { - int ino; +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif if (enable_ino64) return fileid; @@ -86,31 +107,30 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) +int nfs_drop_inode(struct inode 
*inode) { - int ret; - - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret == 0) - ret = nfs_commit_inode(inode, FLUSH_SYNC); - } else - ret = nfs_commit_inode(inode, 0); - if (ret >= 0) - return 0; - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; + return NFS_STALE(inode) || generic_drop_inode(inode); } +EXPORT_SYMBOL_GPL(nfs_drop_inode); void nfs_clear_inode(struct inode *inode) { /* * The following should never happen... */ - BUG_ON(nfs_have_writebacks(inode)); - BUG_ON(!list_empty(&NFS_I(inode)->open_files)); + WARN_ON_ONCE(nfs_have_writebacks(inode)); + WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); + nfs_fscache_clear_inode(inode); +} +EXPORT_SYMBOL_GPL(nfs_clear_inode); + +void nfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + nfs_clear_inode(inode); } /** @@ -118,19 +138,26 @@ void nfs_clear_inode(struct inode *inode) */ int nfs_sync_mapping(struct address_space *mapping) { - int ret; + int ret = 0; - if (mapping->nrpages == 0) - return 0; - unmap_mapping_range(mapping, 0, 0, 0); - ret = filemap_write_and_wait(mapping); - if (ret != 0) - goto out; - ret = nfs_wb_all(mapping->host); -out: + if (mapping->nrpages != 0) { + unmap_mapping_range(mapping, 0, 0, 0); + ret = nfs_wb_all(mapping->host); + } return ret; } +static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; + nfsi->cache_validity |= flags; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); +} + /* * Invalidate the local caches */ @@ -144,11 +171,19 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); - if (S_ISREG(mode) || S_ISDIR(mode) || 
S_ISLNK(mode)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; - else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; + memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE); + } else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE); + nfs_zap_label_cache_locked(nfsi); } void nfs_zap_caches(struct inode *inode) @@ -162,12 +197,12 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); spin_unlock(&inode->i_lock); } } -static void nfs_zap_acl_cache(struct inode *inode) +void nfs_zap_acl_cache(struct inode *inode) { void (*clear_acl_cache)(struct inode *); @@ -178,13 +213,15 @@ static void nfs_zap_acl_cache(struct inode *inode) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_invalidate_atime); /* * Invalidate, but do not unhash, the inode. 
@@ -216,6 +253,8 @@ nfs_find_actor(struct inode *inode, void *opaque) if (NFS_FILEID(inode) != fattr->fileid) return 0; + if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) + return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; if (is_bad_inode(inode) || NFS_STALE(inode)) @@ -234,15 +273,74 @@ nfs_init_locked(struct inode *inode, void *opaque) return 0; } -/* Don't use READDIRPLUS on directories that we believe are too large */ -#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE) +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static void nfs_clear_label_invalid(struct inode *inode) +{ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; + spin_unlock(&inode->i_lock); +} + +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + int error; + + if (label == NULL) + return; + + if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { + error = security_inode_notifysecctx(inode, label->label, + label->len); + if (error) + printk(KERN_ERR "%s() %s %d " + "security_inode_notifysecctx() %d\n", + __func__, + (char *)label->label, + label->len, error); + nfs_clear_label_invalid(inode); + } +} + +struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) +{ + struct nfs4_label *label = NULL; + int minor_version = server->nfs_client->cl_minorversion; + + if (minor_version < 2) + return label; + + if (!(server->caps & NFS_CAP_SECURITY_LABEL)) + return label; + + label = kzalloc(sizeof(struct nfs4_label), flags); + if (label == NULL) + return ERR_PTR(-ENOMEM); + + label->label = kzalloc(NFS4_MAXLABELLEN, flags); + if (label->label == NULL) { + kfree(label); + return ERR_PTR(-ENOMEM); + } + label->len = NFS4_MAXLABELLEN; + + return label; +} +EXPORT_SYMBOL_GPL(nfs4_label_alloc); +#else +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ +} +#endif +EXPORT_SYMBOL_GPL(nfs_setsecurity); /* * This is our front-end to iget that 
looks up inodes by file handle * instead of inode number. */ struct inode * -nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs_find_desc desc = { .fh = fh, @@ -251,13 +349,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR) == 0) - goto out_no_inode; + nfs_attr_check_mountpoint(sb, fattr); - if (!fattr->nlink) { - printk("NFS: Buggy server - nlink == 0!\n"); + if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && + !nfs_attr_use_mounted_on_fileid(fattr)) + goto out_no_inode; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; - } hash = nfs_fattr_to_ino_t(fattr); @@ -278,65 +376,108 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. 
*/ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; - if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) - && fattr->size <= NFS_LIMIT_READDIRPLUS) - set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + inode->i_data.a_ops = &nfs_dir_aops; /* Deal with crossing mountpoints */ - if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || + fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else inode->i_op = &nfs_mountpoint_inode_operations; inode->i_fop = NULL; - set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags); + inode->i_flags |= S_AUTOMOUNT; } } else if (S_ISLNK(inode->i_mode)) inode->i_op = &nfs_symlink_inode_operations; else init_special_inode(inode, inode->i_mode, fattr->rdev); + memset(&inode->i_atime, 0, sizeof(inode->i_atime)); + memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + inode->i_version = 0; + inode->i_size = 0; + clear_nlink(inode); + inode->i_uid = make_kuid(&init_user_ns, -2); + inode->i_gid = make_kgid(&init_user_ns, -2); + inode->i_blocks = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + nfsi->write_io = 0; + nfsi->read_io = 0; + nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = now; - nfsi->cache_change_attribute = now; - inode->i_atime = fattr->atime; - inode->i_mtime = fattr->mtime; - inode->i_ctime = fattr->ctime; - if (fattr->valid & NFS_ATTR_FATTR_V4) - nfsi->change_attr = fattr->change_attr; - inode->i_size = nfs_size_to_loff_t(fattr->size); - 
inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + nfsi->attr_gencount = fattr->gencount; + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) + inode->i_version = fattr->change_attr; + else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_SIZE) + inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); + if (fattr->valid & NFS_ATTR_FATTR_NLINK) + set_nlink(inode, fattr->nlink); + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_OWNER) + inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_GROUP) + inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } + + nfs_setsecurity(inode, 
fattr, label); + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->access_cache = RB_ROOT; + nfs_fscache_init_inode(inode); + unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", + dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), + nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); out: @@ -346,15 +487,16 @@ out_no_inode: dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); goto out; } +EXPORT_SYMBOL_GPL(nfs_fhget); -#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET) +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int nfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - struct nfs_fattr fattr; - int error; + struct nfs_fattr *fattr; + int error = -ENOMEM; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -369,26 +511,63 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) /* Optimization: if the end result is no change, don't RPC */ attr->ia_valid &= NFS_VALID_ATTRS; - if (attr->ia_valid == 0) + if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; - lock_kernel(); + trace_nfs_setattr_enter(inode); + /* Write all dirty data */ if (S_ISREG(inode->i_mode)) { - filemap_write_and_wait(inode->i_mapping); + nfs_inode_dio_wait(inode); nfs_wb_all(inode); } + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; /* * Return any delegations if we're going to change ACLs */ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) - nfs_inode_return_delegation(inode); - error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); + NFS_PROTO(inode)->return_delegation(inode); + error = 
NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) - nfs_refresh_inode(inode, &fattr); - unlock_kernel(); + error = nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: + trace_nfs_setattr_exit(inode, error); return error; } +EXPORT_SYMBOL_GPL(nfs_setattr); + +/** + * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This is a copy of the common vmtruncate, but with the locking + * corrected to take into account the fact that NFS requires + * inode->i_size to be updated under the inode->i_lock. + */ +static int nfs_vmtruncate(struct inode * inode, loff_t offset) +{ + int err; + + err = inode_newsize_ok(inode, offset); + if (err) + goto out; + + spin_lock(&inode->i_lock); + i_size_write(inode, offset); + /* Optimisation */ + if (offset == 0) + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + + truncate_pagecache(inode, offset); +out: + return err; +} /** * nfs_setattr_update_inode - Update inode metadata after a setattr call. 
@@ -401,6 +580,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) { if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -410,46 +590,34 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL); spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); - inode->i_size = attr->ia_size; - vmtruncate(inode, attr->ia_size); + nfs_vmtruncate(inode, attr->ia_size); } } +EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); -static int nfs_wait_schedule(void *word) +static void nfs_request_parent_use_readdirplus(struct dentry *dentry) { - if (signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} + struct dentry *parent; -/* - * Wait for the inode to get unlocked. 
- */ -static int nfs_wait_on_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int error; - - error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING, - nfs_wait_schedule, TASK_KILLABLE); - - return error; + parent = dget_parent(dentry); + nfs_force_use_readdirplus(parent->d_inode); + dput(parent); } -static void nfs_wake_up_inode(struct inode *inode) +static bool nfs_need_revalidate_inode(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - - clear_bit(NFS_INO_REVALIDATING, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING); + if (NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) + return true; + if (nfs_attribute_cache_expired(inode)) + return true; + return false; } int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) @@ -458,17 +626,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err; - /* - * Flush out writes to the server in order to update c/mtime. - * - * Hold the i_mutex to suspend application writes temporarily; - * this prevents long-running writing applications from blocking - * nfs_wb_nocommit. - */ + trace_nfs_getattr_enter(inode); + /* Flush out writes to the server in order to update c/mtime. 
*/ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); - nfs_wb_nocommit(inode); - mutex_unlock(&inode->i_mutex); + nfs_inode_dio_wait(inode); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto out; } /* @@ -484,60 +648,168 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) need_atime = 0; - if (need_atime) - err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - err = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (need_atime || nfs_need_revalidate_inode(inode)) { + struct nfs_server *server = NFS_SERVER(inode); + + if (server->caps & NFS_CAP_READDIRPLUS) + nfs_request_parent_use_readdirplus(dentry); + err = __nfs_revalidate_inode(server, inode); + } if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); } +out: + trace_nfs_getattr_exit(inode, err); return err; } +EXPORT_SYMBOL_GPL(nfs_getattr); + +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner.l_owner = current->files; + l_ctx->lockowner.l_pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); + nfs_iocounter_init(&l_ctx->io_count); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *head = &ctx->lock_context; + struct nfs_lock_context *pos = head; + + do { + if (pos->lockowner.l_owner != current->files) + continue; + if (pos->lockowner.l_pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new 
= kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = ctx->dentry->d_inode; + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} + +/** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context + * @is_sync: is this a synchronous close + * + * always ensure that the attributes are up to date if we're mounted + * with close-to-open semantics + */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync) +{ + struct inode *inode; + struct nfs_server *server; + + if (!(ctx->mode & FMODE_WRITE)) + return; + if (!is_sync) + return; + inode = ctx->dentry->d_inode; + if (!list_empty(&NFS_I(inode)->open_files)) + return; + server = NFS_SERVER(inode); + if (server->flags & NFS_MOUNT_NOCTO) + return; + nfs_revalidate_inode(server, inode); +} +EXPORT_SYMBOL_GPL(nfs_close_context); -static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred) +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) { struct nfs_open_context *ctx; + struct rpc_cred *cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return ERR_CAST(cred); ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (ctx != NULL) { - ctx->path.dentry = dget(dentry); - ctx->path.mnt = mntget(mnt); - ctx->cred = get_rpccred(cred); - ctx->state = NULL; - ctx->lockowner = current->files; - ctx->error = 0; - ctx->dir_cookie = 0; - 
atomic_set(&ctx->count, 1); + if (!ctx) { + put_rpccred(cred); + return ERR_PTR(-ENOMEM); } + nfs_sb_active(dentry->d_sb); + ctx->dentry = dget(dentry); + ctx->cred = cred; + ctx->state = NULL; + ctx->mode = f_mode; + ctx->flags = 0; + ctx->error = 0; + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; + INIT_LIST_HEAD(&ctx->list); + ctx->mdsthreshold = NULL; return ctx; } +EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->count); + atomic_inc(&ctx->lock_context.count); return ctx; } +EXPORT_SYMBOL_GPL(get_nfs_open_context); -static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) +static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode = ctx->dentry->d_inode; + struct super_block *sb = ctx->dentry->d_sb; - if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) + if (!list_empty(&ctx->list)) { + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; + list_del(&ctx->list); + spin_unlock(&inode->i_lock); + } else if (!atomic_dec_and_test(&ctx->lock_context.count)) return; - list_del(&ctx->list); - spin_unlock(&inode->i_lock); - if (ctx->state != NULL) { - if (wait) - nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); - else - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); - } + if (inode != NULL) + NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); - dput(ctx->path.dentry); - mntput(ctx->path.mnt); + dput(ctx->dentry); + nfs_sb_deactive(sb); + kfree(ctx->mdsthreshold); kfree(ctx); } @@ -545,31 +817,35 @@ void put_nfs_open_context(struct nfs_open_context *ctx) { __put_nfs_open_context(ctx, 0); } - -static void put_nfs_open_context_sync(struct nfs_open_context *ctx) -{ - __put_nfs_open_context(ctx, 1); -} 
+EXPORT_SYMBOL_GPL(put_nfs_open_context); /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages */ -static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = ctx->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); - filp->private_data = get_nfs_open_context(ctx); spin_lock(&inode->i_lock); list_add(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); + +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +{ + filp->private_data = get_nfs_open_context(ctx); + if (list_empty(&ctx->list)) + nfs_inode_attach_open_context(ctx); +} +EXPORT_SYMBOL_GPL(nfs_file_set_open_context); /* * Given an inode, search for an open context with the desired characteristics */ -struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode) +struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; @@ -578,10 +854,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c list_for_each_entry(pos, &nfsi->open_files, list) { if (cred != NULL && pos->cred != cred) continue; - if ((pos->mode & mode) == mode) { - ctx = get_nfs_open_context(pos); - break; - } + if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) + continue; + ctx = get_nfs_open_context(pos); + break; } spin_unlock(&inode->i_lock); return ctx; @@ -589,15 +865,16 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c static void nfs_file_clear_open_context(struct file *filp) { - struct inode *inode = filp->f_path.dentry->d_inode; struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { + struct 
inode *inode = ctx->dentry->d_inode; + filp->private_data = NULL; spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); spin_unlock(&inode->i_lock); - put_nfs_open_context_sync(ctx); + __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); } } @@ -607,18 +884,13 @@ static void nfs_file_clear_open_context(struct file *filp) int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - struct rpc_cred *cred; - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); - if (IS_ERR(cred)) - return PTR_ERR(cred); - ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); - put_rpccred(cred); - if (ctx == NULL) - return -ENOMEM; - ctx->mode = filp->f_mode; + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); + nfs_fscache_open_file(inode, filp); return 0; } @@ -636,63 +908,68 @@ int __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { int status = -ESTALE; - struct nfs_fattr fattr; + struct nfs4_label *label = NULL; + struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); - dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", + inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); + + trace_nfs_revalidate_inode_enter(inode); - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - lock_kernel(); if (is_bad_inode(inode)) - goto out_nowait; + goto out; if (NFS_STALE(inode)) - goto out_nowait; + goto out; - status = nfs_wait_on_inode(inode); - if (status < 0) + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) goto out; - status = -ESTALE; - if (NFS_STALE(inode)) + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); + + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(label)) { + status = PTR_ERR(label); goto 
out; + } - status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); if (status != 0) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); if (status == -ESTALE) { nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } - goto out; + goto err_out; } - spin_lock(&inode->i_lock); - status = nfs_update_inode(inode, &fattr); + status = nfs_refresh_inode(inode, fattr); if (status) { - spin_unlock(&inode->i_lock); - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); - goto out; + (unsigned long long)NFS_FILEID(inode), status); + goto err_out; } - spin_unlock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", - inode->i_sb->s_id, - (long long)NFS_FILEID(inode)); + nfs_setsecurity(inode, fattr, label); - out: - nfs_wake_up_inode(inode); + dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode)); - out_nowait: - unlock_kernel(); +err_out: + nfs4_label_free(label); +out: + nfs_free_fattr(fattr); + trace_nfs_revalidate_inode_exit(inode, status); return status; } @@ -700,9 +977,14 @@ int nfs_attribute_timeout(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); - if (nfs_have_delegation(inode, FMODE_READ)) + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); +} + +int nfs_attribute_cache_expired(struct inode *inode) +{ + if 
(nfs_have_delegated_attributes(inode)) return 0; - return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); + return nfs_attribute_timeout(inode); } /** @@ -714,116 +996,152 @@ int nfs_attribute_timeout(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) - && !nfs_attribute_timeout(inode)) + if (!nfs_need_revalidate_inode(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); } +EXPORT_SYMBOL_GPL(nfs_revalidate_inode); -static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) +static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); - + int ret; + if (mapping->nrpages != 0) { - int ret = invalidate_inode_pages2(mapping); + if (S_ISREG(inode->i_mode)) { + ret = nfs_sync_mapping(mapping); + if (ret < 0) + return ret; + } + ret = invalidate_inode_pages2(mapping); if (ret < 0) return ret; } - spin_lock(&inode->i_lock); - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); + } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + nfs_fscache_wait_on_invalidate(inode); + + dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode)); return 0; } -static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) +static bool nfs_mapping_need_revalidate_inode(struct inode *inode) { - int ret = 0; - - mutex_lock(&inode->i_mutex); - if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) { - ret 
= nfs_sync_mapping(mapping); - if (ret == 0) - ret = nfs_invalidate_mapping_nolock(inode, mapping); - } - mutex_unlock(&inode->i_mutex); - return ret; + if (nfs_have_delegated_attributes(inode)) + return false; + return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) + || nfs_attribute_timeout(inode) + || NFS_STALE(inode); } /** - * nfs_revalidate_mapping_nolock - Revalidate the pagecache + * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping */ -int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; int ret = 0; - if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { + /* swapfiles are not supposed to be shared. */ + if (IS_SWAPFILE(inode)) + goto out; + + if (nfs_mapping_need_revalidate_inode(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) goto out; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - ret = nfs_invalidate_mapping_nolock(inode, mapping); -out: - return ret; -} - -/** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - * - * This version of the function will take the inode->i_mutex and attempt to - * flush out all dirty data if it needs to invalidate the page cache. 
- */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int ret = 0; - if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { - ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (ret < 0) + /* + * We must clear NFS_INO_INVALID_DATA first to ensure that + * invalidations that come in while we're shooting down the mappings + * are respected. But, that leaves a race window where one revalidator + * can clear the flag, and then another checks it before the mapping + * gets invalidated. Fix that by serializing access to this part of + * the function. + * + * At the same time, we need to allow other tasks to see whether we + * might be in the middle of invalidating the pages, so we only set + * the bit lock here if it looks like we're going to be doing that. + */ + for (;;) { + ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) goto out; + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + break; + spin_unlock(&inode->i_lock); + goto out; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - ret = nfs_invalidate_mapping(inode, mapping); + + set_bit(NFS_INO_INVALIDATING, bitlock); + smp_wmb(); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); + trace_nfs_invalidate_mapping_enter(inode); + ret = nfs_invalidate_mapping(inode, mapping); + trace_nfs_invalidate_mapping_exit(inode, ret); + + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; } -static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); + 
unsigned long ret = 0; - if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && - nfsi->change_attr == fattr->pre_change_attr) { - nfsi->change_attr = fattr->change_attr; + if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) + && (fattr->valid & NFS_ATTR_FATTR_CHANGE) + && inode->i_version == fattr->pre_change_attr) { + inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); + ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_WCC) != 0) { - if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; - } - if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) && - nfsi->npages == 0) - inode->i_size = nfs_size_to_loff_t(fattr->size); + if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) + && (fattr->valid & NFS_ATTR_FATTR_CTIME) + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + ret |= NFS_INO_INVALID_ATTR; + } + + if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) + && (fattr->valid & NFS_ATTR_FATTR_MTIME) + && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + if (S_ISDIR(inode->i_mode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); + ret |= NFS_INO_INVALID_ATTR; } + if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) + && (fattr->valid & NFS_ATTR_FATTR_SIZE) + && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) + && nfsi->npages == 0) { + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + ret |= NFS_INO_INVALID_ATTR; + } + + return ret; } /** @@ -842,52 +1160,232 @@ static int 
nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat unsigned long invalid = 0; + if (nfs_have_delegated_attributes(inode)) + return 0; /* Has the inode gone and changed behind our back? */ - if (nfsi->fileid != fattr->fileid - || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + return -EIO; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - } - - /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && - nfsi->change_attr != fattr->change_attr) + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR; - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->npages == 0) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize && nfsi->npages == 0) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } /* Have any file permissions changed? 
*/ - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) - || inode->i_uid != fattr->uid - || inode->i_gid != fattr->gid) + if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? */ - if (inode->i_nlink != fattr->nlink) + if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_ATTR; - if (!timespec_equal(&inode->i_atime, &fattr->atime)) + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - nfsi->cache_validity |= invalid; - else - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; } +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) + return 0; + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return 0; + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} + +static atomic_long_t nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + return atomic_long_read(&nfs_attr_generation_counter); +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + return 
atomic_long_inc_return(&nfs_attr_generation_counter); +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + fattr->owner_name = NULL; + fattr->group_name = NULL; +} +EXPORT_SYMBOL_GPL(nfs_fattr_init); + +struct nfs_fattr *nfs_alloc_fattr(void) +{ + struct nfs_fattr *fattr; + + fattr = kmalloc(sizeof(*fattr), GFP_NOFS); + if (fattr != NULL) + nfs_fattr_init(fattr); + return fattr; +} +EXPORT_SYMBOL_GPL(nfs_alloc_fattr); + +struct nfs_fh *nfs_alloc_fhandle(void) +{ + struct nfs_fh *fh; + + fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + if (fh != NULL) + fh->size = 0; + return fh; +} +EXPORT_SYMBOL_GPL(nfs_alloc_fhandle); + +#ifdef NFS_DEBUG +/* + * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle + * in the same way that wireshark does + * + * @fh: file handle + * + * For debugging only. + */ +u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) +{ + /* wireshark uses 32-bit AUTODIN crc and does a bitwise + * not on the result */ + return nfs_fhandle_hash(fh); +} +EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); + +/* + * _nfs_display_fhandle - display an NFS file handle on the console + * + * @fh: file handle to display + * @caption: display caption + * + * For debugging only. 
+ */ +void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) +{ + unsigned short i; + + if (fh == NULL || fh->size == 0) { + printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh); + return; + } + + printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n", + caption, fh, fh->size, _nfs_display_fhandle_hash(fh)); + for (i = 0; i < fh->size; i += 16) { + __be32 *pos = (__be32 *)&fh->data[i]; + + switch ((fh->size - i - 1) >> 2) { + case 0: + printk(KERN_DEFAULT " %08x\n", + be32_to_cpup(pos)); + break; + case 1: + printk(KERN_DEFAULT " %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1)); + break; + case 2: + printk(KERN_DEFAULT " %08x %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1), + be32_to_cpup(pos + 2)); + break; + default: + printk(KERN_DEFAULT " %08x %08x %08x %08x\n", + be32_to_cpup(pos), be32_to_cpup(pos + 1), + be32_to_cpup(pos + 2), be32_to_cpup(pos + 3)); + } + } +} +EXPORT_SYMBOL_GPL(_nfs_display_fhandle); +#endif + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. 
+ * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +/* + * Don't trust the change_attribute, mtime, ctime or size if + * a pnfs LAYOUTCOMMIT is outstanding + */ +static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, + struct nfs_fattr *fattr) +{ + if (pnfs_layoutcommit_outstanding(inode)) + fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | + NFS_ATTR_FATTR_MTIME | + NFS_ATTR_FATTR_CTIME | + NFS_ATTR_FATTR_SIZE); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + int ret; + + trace_nfs_refresh_inode_enter(inode); + + nfs_inode_attrs_handle_layoutcommit(inode, fattr); + + if (nfs_inode_attrs_need_update(inode, fattr)) + ret = nfs_update_inode(inode, fattr); + else + ret = nfs_check_inode_attributes(inode, fattr); + + trace_nfs_refresh_inode_exit(inode, ret); + return ret; +} + /** * nfs_refresh_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -900,20 +1398,29 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); - if (time_after(fattr->time_start, nfsi->last_updated)) - status = nfs_update_inode(inode, fattr); - else - status = nfs_check_inode_attributes(inode, fattr); - + status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); + return status; } +EXPORT_SYMBOL_GPL(nfs_refresh_inode); + +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + unsigned long invalid = 
NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, invalid); + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} /** * nfs_post_op_update_inode - try to update the inode attribute cache @@ -931,15 +1438,15 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + int status; spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); - return nfs_refresh_inode(inode, fattr); + + return status; } +EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache @@ -954,20 +1461,44 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && - (fattr->valid & NFS_ATTR_WCC_V4) == 0) { - fattr->pre_change_attr = NFS_I(inode)->change_attr; - fattr->valid |= NFS_ATTR_WCC_V4; + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME); + goto out_noforce; } - if ((fattr->valid & NFS_ATTR_FATTR) != 0 && - (fattr->valid & NFS_ATTR_WCC) == 0) { + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { + fattr->pre_change_attr = inode->i_version; + fattr->valid |= 
NFS_ATTR_FATTR_PRECHANGE; + } + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->valid |= NFS_ATTR_FATTR_PRECTIME; + } + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); - fattr->pre_size = inode->i_size; - fattr->valid |= NFS_ATTR_WCC; + fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } - return nfs_post_op_update_inode(inode, fattr); + if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && + (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { + fattr->pre_size = i_size_read(inode); + fattr->valid |= NFS_ATTR_FATTR_PRESIZE; + } +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; } +EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); /* * Many nfs protocol calls return the new file attributes after @@ -988,25 +1519,39 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) loff_t cur_isize, new_isize; unsigned long invalid = 0; unsigned long now = jiffies; + unsigned long save_cache_validity; - dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", - __FUNCTION__, inode->i_sb->s_id, inode->i_ino, + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", + __func__, inode->i_sb->s_id, inode->i_ino, + nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if (nfsi->fileid != fattr->fileid) - goto out_fileid; + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) { + printk(KERN_ERR "NFS: server %s error: fileid changed\n" + "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, + inode->i_sb->s_id, (long long)nfsi->fileid, + (long long)fattr->fileid); + goto out_err; + } /* * Make sure the inode's type hasn't changed. 
*/ - if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) - goto out_changed; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + /* + * Big trouble! The inode has become a different object. + */ + printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", + __func__, inode->i_ino, inode->i_mode, fattr->mode); + goto out_err; + } server = NFS_SERVER(inode); /* Update the fsid? */ - if (S_ISDIR(inode->i_mode) && + if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && - !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags)) + !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; /* @@ -1014,107 +1559,161 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); + invalid |= nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ - if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { - /* NFSv2/v3: Check if the mtime agrees */ - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { - dprintk("NFS: mtime change on server for file %s/%ld\n", + if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (inode->i_version != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + inode->i_version = fattr->change_attr; } - /* If ctime has changed we 
should definitely clear access+acl caches */ - if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - } else if (nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); - } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + nfsi->cache_validity |= save_cache_validity; + + if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + } else if (server->caps & NFS_CAP_MTIME) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + } else if (server->caps & NFS_CAP_CTIME) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ - new_isize = nfs_size_to_loff_t(fattr->size); - cur_isize = i_size_read(inode); - if (new_isize != cur_isize) { - /* Do we perhaps have any outstanding writes, or has - * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { - inode->i_size = new_isize; - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + new_isize = nfs_size_to_loff_t(fattr->size); + cur_isize = i_size_read(inode); + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes, or has + * the file grown beyond our last write? 
*/ + if ((nfsi->npages == 0) || new_isize > cur_isize) { + i_size_write(inode, new_isize); + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid &= ~NFS_INO_REVAL_PAGECACHE; + } + dprintk("NFS: isize change on server for file %s/%ld " + "(%Ld to %Ld)\n", + inode->i_sb->s_id, + inode->i_ino, + (long long)cur_isize, + (long long)new_isize); } - dprintk("NFS: isize change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - } - - - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - nfsi->change_attr = fattr->change_attr; - - if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || - inode->i_uid != fattr->uid || - inode->i_gid != fattr->gid) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - - inode->i_mode = fattr->mode; - inode->i_nlink = fattr->nlink; - inode->i_uid = fattr->uid; - inode->i_gid = fattr->gid; + } else + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + + + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_MODE) { + if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + umode_t newmode = inode->i_mode & S_IFMT; + newmode |= fattr->mode & S_IALLUGO; + inode->i_mode = newmode; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + } + } else if (server->caps & NFS_CAP_MODE) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { + if (!uid_eq(inode->i_uid, 
fattr->uid)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_uid = fattr->uid; + } + } else if (server->caps & NFS_CAP_OWNER) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { + if (!gid_eq(inode->i_gid, fattr->gid)) { + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + inode->i_gid = fattr->gid; + } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + + if (fattr->valid & NFS_ATTR_FATTR_NLINK) { + if (inode->i_nlink != fattr->nlink) { + invalid |= NFS_INO_INVALID_ATTR; + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + set_nlink(inode, fattr->nlink); + } + } else if (server->caps & NFS_CAP_NLINK) + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); - if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else { - inode->i_blocks = fattr->du.nfs2.blocks; } + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - nfsi->last_updated = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { - if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo 
<<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } - /* - * Avoid jiffy wraparound issues with nfsi->last_updated - */ - if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now)) - nfsi->last_updated = nfsi->read_cache_jiffies; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; - if (!nfs_have_delegation(inode, FMODE_READ) || - (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) - nfsi->cache_validity |= invalid; - nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED; + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || + (save_cache_validity & NFS_INO_REVAL_FORCED)) + nfs_set_cache_invalid(inode, invalid); return 0; - out_changed: - /* - * Big trouble! The inode has become a different object. - */ - printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", - __FUNCTION__, inode->i_ino, inode->i_mode, fattr->mode); out_err: /* * No need to worry about unhashing the dentry, as the @@ -1123,32 +1722,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfs_invalidate_inode(inode); return -ESTALE; - - out_fileid: - printk(KERN_ERR "NFS: server %s error: fileid changed\n" - "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", - NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, - (long long)nfsi->fileid, (long long)fattr->fileid); - goto out_err; } - -#ifdef CONFIG_NFS_V4 - -/* - * Clean out any remaining NFSv4 state that might be left over due - * to open() calls that passed nfs_atomic_lookup, but failed to call - * nfs_open(). - */ -void nfs4_clear_inode(struct inode *inode) -{ - /* If we are holding a delegation, return it! 
*/ - nfs_inode_return_delegation_noreclaim(inode); - /* First call standard NFS clear_inode() code */ - nfs_clear_inode(inode); -} -#endif - struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; @@ -1157,32 +1732,37 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; -#ifdef CONFIG_NFS_V3_ACL - nfsi->acl_access = ERR_PTR(-EAGAIN); - nfsi->acl_default = ERR_PTR(-EAGAIN); -#endif -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ return &nfsi->vfs_inode; } +EXPORT_SYMBOL_GPL(nfs_alloc_inode); -void nfs_destroy_inode(struct inode *inode) +static void nfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} +EXPORT_SYMBOL_GPL(nfs_destroy_inode); + static inline void nfs4_init_once(struct nfs_inode *nfsi) { -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) INIT_LIST_HEAD(&nfsi->open_states); nfsi->delegation = NULL; nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); + nfsi->layout = NULL; #endif } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct nfs_inode *nfsi = (struct nfs_inode *) foo; @@ -1190,9 +1770,10 @@ static void init_once(struct kmem_cache * cachep, void *foo) INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); - INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); - nfsi->ncommit = 0; + INIT_LIST_HEAD(&nfsi->commit_info.list); nfsi->npages = 0; + nfsi->commit_info.ncommit = 0; + atomic_set(&nfsi->commit_info.rpcs_out, 0); atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); init_waitqueue_head(&nfsi->waitqueue); @@ -1214,9 +1795,66 @@ static int __init 
nfs_init_inodecache(void) static void nfs_destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(nfs_inode_cachep); } +struct workqueue_struct *nfsiod_workqueue; +EXPORT_SYMBOL_GPL(nfsiod_workqueue); + +/* + * start up the nfsiod workqueue + */ +static int nfsiod_start(void) +{ + struct workqueue_struct *wq; + dprintk("RPC: creating workqueue nfsiod\n"); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); + if (wq == NULL) + return -ENOMEM; + nfsiod_workqueue = wq; + return 0; +} + +/* + * Destroy the nfsiod workqueue + */ +static void nfsiod_stop(void) +{ + struct workqueue_struct *wq; + + wq = nfsiod_workqueue; + if (wq == NULL) + return; + nfsiod_workqueue = NULL; + destroy_workqueue(wq); +} + +int nfs_net_id; +EXPORT_SYMBOL_GPL(nfs_net_id); + +static int nfs_net_init(struct net *net) +{ + nfs_clients_init(net); + return 0; +} + +static void nfs_net_exit(struct net *net) +{ + nfs_cleanup_cb_ident_idr(net); +} + +static struct pernet_operations nfs_net_ops = { + .init = nfs_net_init, + .exit = nfs_net_exit, + .id = &nfs_net_id, + .size = sizeof(struct nfs_net), +}; + /* * Initialize NFS */ @@ -1224,52 +1862,71 @@ static int __init init_nfs_fs(void) { int err; + err = register_pernet_subsys(&nfs_net_ops); + if (err < 0) + goto out9; + + err = nfs_fscache_register(); + if (err < 0) + goto out8; + + err = nfsiod_start(); + if (err) + goto out7; + err = nfs_fs_proc_init(); if (err) - goto out5; + goto out6; err = nfs_init_nfspagecache(); if (err) - goto out4; + goto out5; err = nfs_init_inodecache(); if (err) - goto out3; + goto out4; err = nfs_init_readpagecache(); if (err) - goto out2; + goto out3; err = nfs_init_writepagecache(); if (err) - goto out1; + goto out2; err = nfs_init_directcache(); if (err) - goto out0; + goto out1; #ifdef CONFIG_PROC_FS - rpc_proc_register(&nfs_rpcstat); + rpc_proc_register(&init_net, &nfs_rpcstat); #endif if ((err = 
register_nfs_fs()) != 0) - goto out; + goto out0; + return 0; -out: +out0: #ifdef CONFIG_PROC_FS - rpc_proc_unregister("nfs"); + rpc_proc_unregister(&init_net, "nfs"); #endif nfs_destroy_directcache(); -out0: - nfs_destroy_writepagecache(); out1: - nfs_destroy_readpagecache(); + nfs_destroy_writepagecache(); out2: - nfs_destroy_inodecache(); + nfs_destroy_readpagecache(); out3: - nfs_destroy_nfspagecache(); + nfs_destroy_inodecache(); out4: - nfs_fs_proc_exit(); + nfs_destroy_nfspagecache(); out5: + nfs_fs_proc_exit(); +out6: + nfsiod_stop(); +out7: + nfs_fscache_unregister(); +out8: + unregister_pernet_subsys(&nfs_net_ops); +out9: return err; } @@ -1280,11 +1937,14 @@ static void __exit exit_nfs_fs(void) nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); + nfs_fscache_unregister(); + unregister_pernet_subsys(&nfs_net_ops); #ifdef CONFIG_PROC_FS - rpc_proc_unregister("nfs"); + rpc_proc_unregister(&init_net, "nfs"); #endif unregister_nfs_fs(); nfs_fs_proc_exit(); + nfsiod_stop(); } /* Not quite true; I just maintain it */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 931992763e6..f415cbf9f6c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -2,8 +2,12 @@ * NFS internal definitions */ +#include "nfs4_fs.h" #include <linux/mount.h> #include <linux/security.h> +#include <linux/crc32.h> + +#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) struct nfs_string; @@ -15,6 +19,23 @@ struct nfs_string; */ #define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) +{ + if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) + fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; +} + +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) +{ + if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || + (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && + ((fattr->valid & 
NFS_ATTR_FATTR_V4_REFERRAL) == 0))) + return 0; + + fattr->fileid = fattr->mounted_on_fileid; + return 1; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; @@ -28,27 +49,60 @@ struct nfs_clone_mount { }; /* + * Note: RFC 1813 doesn't limit the number of auth flavors that + * a server can return, so make something up. + */ +#define NFS_MAX_SECFLAVORS (12) + +/* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* + * Maximum number of pages that readdir can use for creating + * a vmapped array of pages. + */ +#define NFS_MAX_READDIR_PAGES 8 + +struct nfs_client_initdata { + unsigned long init_flags; + const char *hostname; + const struct sockaddr *addr; + size_t addrlen; + struct nfs_subversion *nfs_mod; + int proto; + u32 minorversion; + struct net *net; +}; + +/* * In-kernel mount arguments */ struct nfs_parsed_mount_data { int flags; - int rsize, wsize; - int timeo, retrans; - int acregmin, acregmax, + unsigned int rsize, wsize; + unsigned int timeo, retrans; + unsigned int acregmin, acregmax, acdirmin, acdirmax; - int namlen; + unsigned int namlen; + unsigned int options; unsigned int bsize; - unsigned int auth_flavor_len; - rpc_authflavor_t auth_flavors[1]; + struct nfs_auth_info auth_info; + rpc_authflavor_t selected_flavor; char *client_address; + unsigned int version; + unsigned int minorversion; + char *fscache_uniq; + bool need_mount; struct { struct sockaddr_storage address; size_t addrlen; char *hostname; - unsigned int version; - unsigned short port; - int protocol; + u32 version; + int port; + unsigned short protocol; } mount_server; struct { @@ -56,30 +110,88 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; char *export_path; - int protocol; + int port; + unsigned short protocol; } nfs_server; struct security_mnt_opts lsm_opts; + struct net *net; }; -/* client.c */ -extern struct rpc_program nfs_program; +/* mount_clnt.c */ +struct nfs_mount_request { 
+ struct sockaddr *sap; + size_t salen; + char *hostname; + char *dirpath; + u32 version; + unsigned short protocol; + struct nfs_fh *fh; + int noresvport; + unsigned int *auth_flav_len; + rpc_authflavor_t *auth_flavs; + struct net *net; +}; + +struct nfs_mount_info { + void (*fill_super)(struct super_block *, struct nfs_mount_info *); + int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); + struct nfs_parsed_mount_data *parsed; + struct nfs_clone_mount *cloned; + struct nfs_fh *mntfh; +}; + +extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); +/* client.c */ +extern const struct rpc_program nfs_program; +extern void nfs_clients_init(struct net *net); +extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); +int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, + const struct rpc_timeout *, const char *, + rpc_authflavor_t); +int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); +void nfs_server_insert_lists(struct nfs_server *); +void nfs_server_remove_lists(struct nfs_server *); +void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); +int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, + rpc_authflavor_t); +struct nfs_server *nfs_alloc_server(void); +void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); + +extern void nfs_cleanup_cb_ident_idr(struct net *); extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs_find_client(const struct sockaddr *, u32); -extern struct nfs_client *nfs_find_client_next(struct nfs_client *); -extern struct nfs_server *nfs_create_server( - const struct nfs_parsed_mount_data *, - struct nfs_fh *); +extern void nfs_free_client(struct nfs_client *); +extern struct nfs_client 
*nfs4_find_client_ident(struct net *, int); +extern struct nfs_client * +nfs4_find_client_sessionid(struct net *, const struct sockaddr *, + struct nfs4_sessionid *, u32); +extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, + struct nfs_subversion *); extern struct nfs_server *nfs4_create_server( - const struct nfs_parsed_mount_data *, - struct nfs_fh *); + struct nfs_mount_info *, + struct nfs_subversion *); extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, struct nfs_fh *); +extern int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen, + struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, - struct nfs_fattr *); + struct nfs_fattr *, + rpc_authflavor_t); +extern int nfs_wait_client_init_complete(const struct nfs_client *clp); +extern void nfs_mark_client_ready(struct nfs_client *clp, int state); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto, + unsigned int ds_timeo, + unsigned int ds_retrans); +extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, + struct inode *); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -93,20 +205,22 @@ static inline void nfs_fs_proc_exit(void) } #endif -/* nfs4namespace.c */ -#ifdef CONFIG_NFS_V4 -extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); -#else -static inline -struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) -{ - return ERR_PTR(-ENOENT); -} +#ifdef CONFIG_NFS_V4_1 +int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); +#endif + +/* nfs3client.c */ +#if IS_ENABLED(CONFIG_NFS_V3) +struct nfs_server *nfs3_create_server(struct nfs_mount_info *, 
struct nfs_subversion *); +struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); #endif /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; +struct nfs_pageio_descriptor; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); @@ -115,84 +229,278 @@ extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); extern void nfs_destroy_writepagecache(void); -#ifdef CONFIG_NFS_DIRECTIO extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); -#else -#define nfs_init_directcache() (0) -#define nfs_destroy_directcache() do {} while(0) -#endif +extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, + void (*release)(struct nfs_pgio_header *hdr)); +void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); +int nfs_iocounter_wait(struct nfs_io_counter *c); + +extern const struct nfs_pageio_ops nfs_pgio_rw_ops; +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); +void nfs_rw_header_free(struct nfs_pgio_header *); +void nfs_pgio_data_release(struct nfs_pgio_data *); +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, + const struct rpc_call_ops *, int, int); +void nfs_free_request(struct nfs_page *req); + +static inline void nfs_iocounter_init(struct nfs_io_counter *c) +{ + c->flags = 0; + atomic_set(&c->io_count, 0); +} /* nfs2xdr.c */ -extern int nfs_stat_to_errno(int); extern struct rpc_procinfo nfs_procedures[]; -extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int); +extern int nfs2_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); /* nfs3xdr.c */ extern struct rpc_procinfo nfs3_procedures[]; -extern __be32 
*nfs3_decode_dirent(__be32 *, struct nfs_entry *, int); +extern int nfs3_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); /* nfs4xdr.c */ -#ifdef CONFIG_NFS_V4 -extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); +#if IS_ENABLED(CONFIG_NFS_V4) +extern int nfs4_decode_dirent(struct xdr_stream *, + struct nfs_entry *, int); +#endif +#ifdef CONFIG_NFS_V4_1 +extern const u32 nfs41_maxread_overhead; +extern const u32 nfs41_maxwrite_overhead; +extern const u32 nfs41_maxgetdevinfo_overhead; #endif /* nfs4proc.c */ -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) extern struct rpc_procinfo nfs4_procedures[]; #endif +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline void nfs4_label_free(struct nfs4_label *label) +{ + if (label) { + kfree(label->label); + kfree(label); + } + return; +} + +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ + if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) + nfsi->cache_validity |= NFS_INO_INVALID_LABEL; +} +#else +static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } +static inline void nfs4_label_free(void *label) {} +static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) +{ +} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ + +/* proc.c */ +void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern struct nfs_client *nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr); + /* dir.c */ -extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); +extern void nfs_force_use_readdirplus(struct inode *dir); +extern unsigned long nfs_access_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); +struct dentry *nfs_lookup(struct inode 
*, struct dentry *, unsigned int); +int nfs_create(struct inode *, struct dentry *, umode_t, bool); +int nfs_mkdir(struct inode *, struct dentry *, umode_t); +int nfs_rmdir(struct inode *, struct dentry *); +int nfs_unlink(struct inode *, struct dentry *); +int nfs_symlink(struct inode *, struct dentry *, const char *); +int nfs_link(struct dentry *, struct inode *, struct dentry *); +int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); + +/* file.c */ +int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); +loff_t nfs_file_llseek(struct file *, loff_t, int); +int nfs_file_flush(struct file *, fl_owner_t); +ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); +ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, + size_t, unsigned int); +int nfs_file_mmap(struct file *, struct vm_area_struct *); +ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); +int nfs_file_release(struct inode *, struct file *); +int nfs_lock(struct file *, int, struct file_lock *); +int nfs_flock(struct file *, int, struct file_lock *); +int nfs_check_flags(int); +int nfs_setlease(struct file *, long, struct file_lock **); /* inode.c */ +extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); +extern int nfs_write_inode(struct inode *, struct writeback_control *); +extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); -#ifdef CONFIG_NFS_V4 -extern void nfs4_clear_inode(struct inode *); -#endif +extern void nfs_evict_inode(struct inode *); +void nfs_zap_acl_cache(struct inode *inode); +extern int nfs_wait_bit_killable(void *word); /* super.c */ +extern const struct super_operations nfs_sops; +extern struct file_system_type nfs_fs_type; extern struct file_system_type 
nfs_xdev_fs_type; -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) extern struct file_system_type nfs4_xdev_fs_type; extern struct file_system_type nfs4_referral_fs_type; #endif +bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); +struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, + struct nfs_subversion *); +void nfs_initialise_sb(struct super_block *); +int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); +int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); +struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *, + struct nfs_mount_info *, struct nfs_subversion *); +struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *); +struct dentry * nfs_xdev_mount_common(struct file_system_type *, int, + const char *, struct nfs_mount_info *); +void nfs_kill_super(struct super_block *); +void nfs_fill_super(struct super_block *, struct nfs_mount_info *); extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); -extern void nfs_sb_active(struct nfs_server *server); -extern void nfs_sb_deactive(struct nfs_server *server); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ -extern char *nfs_path(const char *base, - const struct dentry *droot, - const struct dentry *dentry, - char *buffer, ssize_t buflen); +#define NFS_PATH_CANONICAL 1 +extern char *nfs_path(char **p, struct dentry *dentry, + char *buffer, ssize_t buflen, unsigned flags); +extern struct vfsmount *nfs_d_automount(struct path *path); +struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); +struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); /* getroot.c */ -extern struct dentry *nfs_get_root(struct 
super_block *, struct nfs_fh *); -#ifdef CONFIG_NFS_V4 -extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); +extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, + const char *); +#if IS_ENABLED(CONFIG_NFS_V4) +extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, + const char *); + +extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool); +#endif + +struct nfs_pgio_completion_ops; +/* read.c */ +extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode, bool force_mds, + const struct nfs_pgio_completion_ops *compl_ops); +extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); -extern int nfs4_path_walk(struct nfs_server *server, - struct nfs_fh *mntfh, - const char *path); +/* super.c */ +void nfs_clone_super(struct super_block *, struct nfs_mount_info *); +void nfs_umount_begin(struct super_block *); +int nfs_statfs(struct dentry *, struct kstatfs *); +int nfs_show_options(struct seq_file *, struct dentry *); +int nfs_show_devname(struct seq_file *, struct dentry *); +int nfs_show_path(struct seq_file *, struct dentry *); +int nfs_show_stats(struct seq_file *, struct dentry *); +void nfs_put_super(struct super_block *); +int nfs_remount(struct super_block *sb, int *flags, char *raw_data); + +/* write.c */ +extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags, bool force_mds, + const struct nfs_pgio_completion_ops *compl_ops); +extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_commit_free(struct nfs_commit_data *p); +extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct rpc_clnt *clnt, + struct nfs_commit_data *data, + const struct 
rpc_call_ops *call_ops, + int how, int flags); +extern void nfs_init_commit(struct nfs_commit_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); +int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max); +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); +int nfs_scan_commit(struct inode *inode, struct list_head *dst, + struct nfs_commit_info *cinfo); +void nfs_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); +int nfs_generic_commit_list(struct inode *inode, struct list_head *head, + int how, struct nfs_commit_info *cinfo); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); +void nfs_commitdata_release(struct nfs_commit_data *data); +void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo); +void nfs_request_remove_commit_list(struct nfs_page *req, + struct nfs_commit_info *cinfo); +void nfs_init_cinfo(struct nfs_commit_info *cinfo, + struct inode *inode, + struct nfs_direct_req *dreq); +int nfs_key_timeout_notify(struct file *filp, struct inode *inode); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); + +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *, enum migrate_mode); +#else +#define nfs_migrate_page NULL #endif +/* unlink.c */ +extern struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)); +extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); + +/* direct.c */ +void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, + struct nfs_direct_req *dreq); +static inline void nfs_inode_dio_wait(struct inode *inode) +{ + 
inode_dio_wait(inode); +} +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); + +/* nfs4proc.c */ +extern void __nfs4_read_done_cb(struct nfs_pgio_data *); +extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr); +extern int nfs40_walk_client_list(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred); +extern int nfs41_walk_client_list(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred); + /* * Determine the device name as a string */ -static inline char *nfs_devname(const struct vfsmount *mnt_parent, - const struct dentry *dentry, +static inline char *nfs_devname(struct dentry *dentry, char *buffer, ssize_t buflen) { - return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, - dentry, buffer, buflen); + char *dummy; + return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); } /* @@ -255,19 +563,29 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) static inline unsigned int nfs_page_length(struct page *page) { - loff_t i_size = i_size_read(page->mapping->host); + loff_t i_size = i_size_read(page_file_mapping(page)->host); if (i_size > 0) { + pgoff_t page_index = page_file_index(page); pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; - if (page->index < end_index) + if (page_index < end_index) return PAGE_CACHE_SIZE; - if (page->index == end_index) + if (page_index == end_index) return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; } return 0; } /* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ @@ -278,3 +596,34 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } +/* + * Convert a struct timespec into a 64-bit change attribute + * + * This does 
approximately the same thing as timespec_to_ns(), + * but for calculation efficiency, we multiply the seconds by + * 1024*1024*1024. + */ +static inline +u64 nfs_timespec_to_change_attr(const struct timespec *ts) +{ + return ((u64)ts->tv_sec << 30) + ts->tv_nsec; +} + +#ifdef CONFIG_CRC32 +/** + * nfs_fhandle_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". + */ +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ + return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); +} +#else +static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) +{ + return 0; +} +#endif diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 6350ecbde58..c5832487c45 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -5,160 +5,67 @@ * * Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com> * - * NFS client per-mount statistics provide information about the health of - * the NFS client and the health of each NFS mount point. Generally these - * are not for detailed problem diagnosis, but simply to indicate that there - * is a problem. - * - * These counters are not meant to be human-readable, but are meant to be - * integrated into system monitoring tools such as "sar" and "iostat". As - * such, the counters are sampled by the tools over time, and are never - * zeroed after a file system is mounted. Moving averages can be computed - * by the tools by taking the difference between two instantaneous samples - * and dividing that by the time between the samples. */ #ifndef _NFS_IOSTAT #define _NFS_IOSTAT -#define NFS_IOSTAT_VERS "1.0" - -/* - * NFS byte counters - * - * 1. SERVER - the number of payload bytes read from or written to the - * server by the NFS client via an NFS READ or WRITE request. - * - * 2. NORMAL - the number of bytes read or written by applications via - * the read(2) and write(2) system call interfaces. - * - * 3. 
DIRECT - the number of bytes read or written from files opened - * with the O_DIRECT flag. - * - * These counters give a view of the data throughput into and out of the NFS - * client. Comparing the number of bytes requested by an application with the - * number of bytes the client requests from the server can provide an - * indication of client efficiency (per-op, cache hits, etc). - * - * These counters can also help characterize which access methods are in - * use. DIRECT by itself shows whether there is any O_DIRECT traffic. - * NORMAL + DIRECT shows how much data is going through the system call - * interface. A large amount of SERVER traffic without much NORMAL or - * DIRECT traffic shows that applications are using mapped files. - * - * NFS page counters - * - * These count the number of pages read or written via nfs_readpage(), - * nfs_readpages(), or their write equivalents. - */ -enum nfs_stat_bytecounters { - NFSIOS_NORMALREADBYTES = 0, - NFSIOS_NORMALWRITTENBYTES, - NFSIOS_DIRECTREADBYTES, - NFSIOS_DIRECTWRITTENBYTES, - NFSIOS_SERVERREADBYTES, - NFSIOS_SERVERWRITTENBYTES, - NFSIOS_READPAGES, - NFSIOS_WRITEPAGES, - __NFSIOS_BYTESMAX, -}; - -/* - * NFS event counters - * - * These counters provide a low-overhead way of monitoring client activity - * without enabling NFS trace debugging. The counters show the rate at - * which VFS requests are made, and how often the client invalidates its - * data and attribute caches. This allows system administrators to monitor - * such things as how close-to-open is working, and answer questions such - * as "why are there so many GETATTR requests on the wire?" - * - * They also count anamolous events such as short reads and writes, silly - * renames due to close-after-delete, and operations that change the size - * of a file (such operations can often be the source of data corruption - * if applications aren't using file locking properly). 
- */ -enum nfs_stat_eventcounters { - NFSIOS_INODEREVALIDATE = 0, - NFSIOS_DENTRYREVALIDATE, - NFSIOS_DATAINVALIDATE, - NFSIOS_ATTRINVALIDATE, - NFSIOS_VFSOPEN, - NFSIOS_VFSLOOKUP, - NFSIOS_VFSACCESS, - NFSIOS_VFSUPDATEPAGE, - NFSIOS_VFSREADPAGE, - NFSIOS_VFSREADPAGES, - NFSIOS_VFSWRITEPAGE, - NFSIOS_VFSWRITEPAGES, - NFSIOS_VFSGETDENTS, - NFSIOS_VFSSETATTR, - NFSIOS_VFSFLUSH, - NFSIOS_VFSFSYNC, - NFSIOS_VFSLOCK, - NFSIOS_VFSRELEASE, - NFSIOS_CONGESTIONWAIT, - NFSIOS_SETATTRTRUNC, - NFSIOS_EXTENDWRITE, - NFSIOS_SILLYRENAME, - NFSIOS_SHORTREAD, - NFSIOS_SHORTWRITE, - NFSIOS_DELAY, - __NFSIOS_COUNTSMAX, -}; - -#ifdef __KERNEL__ - #include <linux/percpu.h> #include <linux/cache.h> +#include <linux/nfs_iostat.h> struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; +#ifdef CONFIG_NFS_FSCACHE + unsigned long long fscache[__NFSIOS_FSCACHEMAX]; +#endif unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; -static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat) +static inline void nfs_inc_server_stats(const struct nfs_server *server, + enum nfs_stat_eventcounters stat) { - struct nfs_iostats *iostats; - int cpu; - - cpu = get_cpu(); - iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->events[stat] ++; - put_cpu_no_resched(); + this_cpu_inc(server->io_stats->events[stat]); } -static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat) +static inline void nfs_inc_stats(const struct inode *inode, + enum nfs_stat_eventcounters stat) { nfs_inc_server_stats(NFS_SERVER(inode), stat); } -static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend) +static inline void nfs_add_server_stats(const struct nfs_server *server, + enum nfs_stat_bytecounters stat, + long addend) { - struct nfs_iostats *iostats; - int cpu; - - cpu = get_cpu(); - iostats = per_cpu_ptr(server->io_stats, cpu); - iostats->bytes[stat] += addend; - 
put_cpu_no_resched(); + this_cpu_add(server->io_stats->bytes[stat], addend); } -static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend) +static inline void nfs_add_stats(const struct inode *inode, + enum nfs_stat_bytecounters stat, + long addend) { nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } -static inline struct nfs_iostats *nfs_alloc_iostats(void) +#ifdef CONFIG_NFS_FSCACHE +static inline void nfs_add_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat, + long addend) +{ + this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); +} +#endif + +static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) { return alloc_percpu(struct nfs_iostats); } -static inline void nfs_free_iostats(struct nfs_iostats *stats) +static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) { if (stats != NULL) free_percpu(stats); } -#endif -#endif +#endif /* _NFS_IOSTAT */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502c..99a45283b9e 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -14,12 +14,121 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> #include <linux/nfs_fs.h> +#include "internal.h" -#ifdef RPC_DEBUG +#ifdef NFS_DEBUG # define NFSDBG_FACILITY NFSDBG_MOUNT #endif -static struct rpc_program mnt_program; +/* + * Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4 + */ +#define MNTPATHLEN (1024) + +/* + * XDR data type sizes + */ +#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN)) +#define MNT_status_sz (1) +#define MNT_fhs_status_sz (1) +#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE) +#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE)) +#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS) + +/* + * XDR argument and result sizes + */ +#define MNT_enc_dirpath_sz encode_dirpath_sz +#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz) +#define MNT_dec_mountres3_sz (MNT_status_sz + 
MNT_fhandle_sz + \ + MNT_authflav3_sz) + +/* + * Defined by RFC 1094, section A.5 + */ +enum { + MOUNTPROC_NULL = 0, + MOUNTPROC_MNT = 1, + MOUNTPROC_DUMP = 2, + MOUNTPROC_UMNT = 3, + MOUNTPROC_UMNTALL = 4, + MOUNTPROC_EXPORT = 5, +}; + +/* + * Defined by RFC 1813, section 5.2 + */ +enum { + MOUNTPROC3_NULL = 0, + MOUNTPROC3_MNT = 1, + MOUNTPROC3_DUMP = 2, + MOUNTPROC3_UMNT = 3, + MOUNTPROC3_UMNTALL = 4, + MOUNTPROC3_EXPORT = 5, +}; + +static const struct rpc_program mnt_program; + +/* + * Defined by OpenGroup XNFS Version 3W, chapter 8 + */ +enum mountstat { + MNT_OK = 0, + MNT_EPERM = 1, + MNT_ENOENT = 2, + MNT_EACCES = 13, + MNT_EINVAL = 22, +}; + +static struct { + u32 status; + int errno; +} mnt_errtbl[] = { + { .status = MNT_OK, .errno = 0, }, + { .status = MNT_EPERM, .errno = -EPERM, }, + { .status = MNT_ENOENT, .errno = -ENOENT, }, + { .status = MNT_EACCES, .errno = -EACCES, }, + { .status = MNT_EINVAL, .errno = -EINVAL, }, +}; + +/* + * Defined by RFC 1813, section 5.1.5 + */ +enum mountstat3 { + MNT3_OK = 0, /* no error */ + MNT3ERR_PERM = 1, /* Not owner */ + MNT3ERR_NOENT = 2, /* No such file or directory */ + MNT3ERR_IO = 5, /* I/O error */ + MNT3ERR_ACCES = 13, /* Permission denied */ + MNT3ERR_NOTDIR = 20, /* Not a directory */ + MNT3ERR_INVAL = 22, /* Invalid argument */ + MNT3ERR_NAMETOOLONG = 63, /* Filename too long */ + MNT3ERR_NOTSUPP = 10004, /* Operation not supported */ + MNT3ERR_SERVERFAULT = 10006, /* A failure on the server */ +}; + +static struct { + u32 status; + int errno; +} mnt3_errtbl[] = { + { .status = MNT3_OK, .errno = 0, }, + { .status = MNT3ERR_PERM, .errno = -EPERM, }, + { .status = MNT3ERR_NOENT, .errno = -ENOENT, }, + { .status = MNT3ERR_IO, .errno = -EIO, }, + { .status = MNT3ERR_ACCES, .errno = -EACCES, }, + { .status = MNT3ERR_NOTDIR, .errno = -ENOTDIR, }, + { .status = MNT3ERR_INVAL, .errno = -EINVAL, }, + { .status = MNT3ERR_NAMETOOLONG, .errno = -ENAMETOOLONG, }, + { .status = MNT3ERR_NOTSUPP, .errno = -ENOTSUPP, }, + 
{ .status = MNT3ERR_SERVERFAULT, .errno = -EREMOTEIO, }, +}; + +struct mountres { + int errno; + struct nfs_fh *fh; + unsigned int *auth_count; + rpc_authflavor_t *auth_flavors; +}; struct mnt_fhstatus { u32 status; @@ -28,162 +137,387 @@ struct mnt_fhstatus { /** * nfs_mount - Obtain an NFS file handle for the given host and path - * @addr: pointer to server's address - * @len: size of server's address - * @hostname: name of server host, or NULL - * @path: pointer to string containing export path to mount - * @version: mount version to use for this request - * @protocol: transport protocol to use for thie request - * @fh: pointer to location to place returned file handle + * @info: pointer to mount request arguments * - * Uses default timeout parameters specified by underlying transport. + * Uses default timeout parameters specified by underlying transport. On + * successful return, the auth_flavs list and auth_flav_len will be populated + * with the list from the server or a faked-up list if the server didn't + * provide one. */ -int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, - int version, int protocol, struct nfs_fh *fh) +int nfs_mount(struct nfs_mount_request *info) { - struct mnt_fhstatus result = { - .fh = fh + struct mountres result = { + .fh = info->fh, + .auth_count = info->auth_flav_len, + .auth_flavors = info->auth_flavs, }; struct rpc_message msg = { - .rpc_argp = path, + .rpc_argp = info->dirpath, .rpc_resp = &result, }; struct rpc_create_args args = { - .protocol = protocol, - .address = addr, - .addrsize = len, - .servername = hostname, + .net = info->net, + .protocol = info->protocol, + .address = info->sap, + .addrsize = info->salen, + .servername = info->hostname, .program = &mnt_program, - .version = version, + .version = info->version, .authflavor = RPC_AUTH_UNIX, - .flags = 0, }; struct rpc_clnt *mnt_clnt; int status; dprintk("NFS: sending MNT request for %s:%s\n", - (hostname ? 
hostname : "server"), path); + (info->hostname ? info->hostname : "server"), + info->dirpath); + + if (strlen(info->dirpath) > MNTPATHLEN) + return -ENAMETOOLONG; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; mnt_clnt = rpc_create(&args); if (IS_ERR(mnt_clnt)) goto out_clnt_err; - if (version == NFS_MNT3_VERSION) + if (info->version == NFS_MNT3_VERSION) msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; else - msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; + msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT]; - status = rpc_call_sync(mnt_clnt, &msg, 0); + status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT); rpc_shutdown_client(mnt_clnt); if (status < 0) goto out_call_err; - if (result.status != 0) + if (result.errno != 0) goto out_mnt_err; dprintk("NFS: MNT request succeeded\n"); status = 0; + /* + * If the server didn't provide a flavor list, allow the + * client to try any flavor. + */ + if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) { + dprintk("NFS: Faking up auth_flavs list\n"); + info->auth_flavs[0] = RPC_AUTH_NULL; + *info->auth_flav_len = 1; + } out: return status; out_clnt_err: status = PTR_ERR(mnt_clnt); - dprintk("NFS: failed to create RPC client, status=%d\n", status); + dprintk("NFS: failed to create MNT RPC client, status=%d\n", status); goto out; out_call_err: - dprintk("NFS: failed to start MNT request, status=%d\n", status); + dprintk("NFS: MNT request failed, status=%d\n", status); goto out; out_mnt_err: - dprintk("NFS: MNT server returned result %d\n", result.status); - status = -EACCES; + dprintk("NFS: MNT server returned result %d\n", result.errno); + status = result.errno; goto out; } +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. 
+ */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .net = info->net, + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + }; + struct rpc_clnt *clnt; + int status; + + if (strlen(info->dirpath) > MNTPATHLEN) + return; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + /* * XDR encode/decode functions for MOUNT */ -static int xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, - const char *path) + +static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname) +{ + const u32 pathname_len = strlen(pathname); + __be32 *p; + + p = xdr_reserve_space(xdr, 4 + pathname_len); + xdr_encode_opaque(p, pathname, pathname_len); +} + +static void mnt_xdr_enc_dirpath(struct rpc_rqst *req, struct xdr_stream *xdr, + const char *dirpath) +{ + encode_mntdirpath(xdr, dirpath); +} + +/* + * RFC 1094: "A non-zero status indicates some sort of error. 
In this + * case, the status is a UNIX error number." This can be problematic + * if the server and client use different errno values for the same + * error. + * + * However, the OpenGroup XNFS spec provides a simple mapping that is + * independent of local errno values on the server and the client. + */ +static int decode_status(struct xdr_stream *xdr, struct mountres *res) { - p = xdr_encode_string(p, path); + unsigned int i; + u32 status; + __be32 *p; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { + if (mnt_errtbl[i].status == status) { + res->errno = mnt_errtbl[i].errno; + return 0; + } + } + + dprintk("NFS: unrecognized MNT status code: %u\n", status); + res->errno = -EACCES; return 0; } -static int xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, - struct mnt_fhstatus *res) +static int decode_fhandle(struct xdr_stream *xdr, struct mountres *res) { struct nfs_fh *fh = res->fh; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); + return 0; +} - if ((res->status = ntohl(*p++)) == 0) { - fh->size = NFS2_FHSIZE; - memcpy(fh->data, p, NFS2_FHSIZE); +static int mnt_xdr_dec_mountres(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + return decode_fhandle(xdr, res); +} + +static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) +{ + unsigned int i; + u32 status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + status = be32_to_cpup(p); + + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { + if (mnt3_errtbl[i].status == status) { + res->errno = mnt3_errtbl[i].errno; + return 0; + } } + + 
dprintk("NFS: unrecognized MNT3 status code: %u\n", status); + res->errno = -EACCES; return 0; } -static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, - struct mnt_fhstatus *res) +static int decode_fhandle3(struct xdr_stream *xdr, struct mountres *res) { struct nfs_fh *fh = res->fh; + u32 size; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + + size = be32_to_cpup(p); + if (size > NFS3_FHSIZE || size == 0) + return -EIO; + + p = xdr_inline_decode(xdr, size); + if (unlikely(p == NULL)) + return -EIO; + + fh->size = size; + memcpy(fh->data, p, size); + return 0; +} + +static int decode_auth_flavors(struct xdr_stream *xdr, struct mountres *res) +{ + rpc_authflavor_t *flavors = res->auth_flavors; + unsigned int *count = res->auth_count; + u32 entries, i; + __be32 *p; + + if (*count == 0) + return 0; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + return -EIO; + entries = be32_to_cpup(p); + dprintk("NFS: received %u auth flavors\n", entries); + if (entries > NFS_MAX_SECFLAVORS) + entries = NFS_MAX_SECFLAVORS; + + p = xdr_inline_decode(xdr, 4 * entries); + if (unlikely(p == NULL)) + return -EIO; + + if (entries > *count) + entries = *count; - if ((res->status = ntohl(*p++)) == 0) { - int size = ntohl(*p++); - if (size <= NFS3_FHSIZE) { - fh->size = size; - memcpy(fh->data, p, size); - } else - res->status = -EBADHANDLE; + for (i = 0; i < entries; i++) { + flavors[i] = be32_to_cpup(p++); + dprintk("NFS: auth flavor[%u]: %d\n", i, flavors[i]); } + *count = i; + return 0; } -#define MNT_dirpath_sz (1 + 256) -#define MNT_fhstatus_sz (1 + 8) -#define MNT_fhstatus3_sz (1 + 16) +static int mnt_xdr_dec_mountres3(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct mountres *res) +{ + int status; + + status = decode_fhs_status(xdr, res); + if (unlikely(status != 0 || res->errno != 0)) + return status; + status = decode_fhandle3(xdr, res); + if (unlikely(status != 0)) { + res->errno = -EBADHANDLE; + return 
0; + } + return decode_auth_flavors(xdr, res); +} static struct rpc_procinfo mnt_procedures[] = { - [MNTPROC_MNT] = { - .p_proc = MNTPROC_MNT, - .p_encode = (kxdrproc_t) xdr_encode_dirpath, - .p_decode = (kxdrproc_t) xdr_decode_fhstatus, - .p_arglen = MNT_dirpath_sz, - .p_replen = MNT_fhstatus_sz, - .p_statidx = MNTPROC_MNT, + [MOUNTPROC_MNT] = { + .p_proc = MOUNTPROC_MNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres_sz, + .p_statidx = MOUNTPROC_MNT, .p_name = "MOUNT", }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, }; static struct rpc_procinfo mnt3_procedures[] = { [MOUNTPROC3_MNT] = { .p_proc = MOUNTPROC3_MNT, - .p_encode = (kxdrproc_t) xdr_encode_dirpath, - .p_decode = (kxdrproc_t) xdr_decode_fhstatus3, - .p_arglen = MNT_dirpath_sz, - .p_replen = MNT_fhstatus3_sz, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_decode = (kxdrdproc_t)mnt_xdr_dec_mountres3, + .p_arglen = MNT_enc_dirpath_sz, + .p_replen = MNT_dec_mountres3_sz, .p_statidx = MOUNTPROC3_MNT, .p_name = "MOUNT", }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdreproc_t)mnt_xdr_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, }; -static struct rpc_version mnt_version1 = { +static const struct rpc_version mnt_version1 = { .number = 1, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt_procedures), .procs = mnt_procedures, }; -static struct rpc_version mnt_version3 = { +static const struct rpc_version mnt_version3 = { .number = 3, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt3_procedures), .procs = mnt3_procedures, }; -static struct rpc_version *mnt_version[] = { +static const struct rpc_version *mnt_version[] = { NULL, &mnt_version1, NULL, @@ -192,7 +526,7 @@ static 
struct rpc_version *mnt_version[] = { static struct rpc_stat mnt_stats; -static struct rpc_program mnt_program = { +static const struct rpc_program mnt_program = { .name = "mount", .number = NFS_MNT_PROGRAM, .nrvers = ARRAY_SIZE(mnt_version), diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 607f6eb9cdb..b5a0afc3ee1 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -7,54 +7,65 @@ * NFS namespace */ +#include <linux/module.h> #include <linux/dcache.h> +#include <linux/gfp.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/nfs_fs.h> #include <linux/string.h> #include <linux/sunrpc/clnt.h> #include <linux/vfs.h> +#include <linux/sunrpc/gss_api.h> #include "internal.h" #define NFSDBG_FACILITY NFSDBG_VFS static void nfs_expire_automounts(struct work_struct *work); -LIST_HEAD(nfs_automount_list); +static LIST_HEAD(nfs_automount_list); static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); int nfs_mountpoint_expiry_timeout = 500 * HZ; -static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, - const struct dentry *dentry, - struct nfs_fh *fh, - struct nfs_fattr *fattr); - /* * nfs_path - reconstruct the path given an arbitrary dentry - * @base - arbitrary string to prepend to the path - * @droot - pointer to root dentry for mountpoint + * @base - used to return pointer to the end of devname part of path * @dentry - pointer to dentry * @buffer - result buffer * @buflen - length of buffer + * @flags - options (see below) * - * Helper function for constructing the path from the - * root dentry to an arbitrary hashed dentry. + * Helper function for constructing the server pathname + * by arbitrary hashed dentry. * * This is mainly for use in figuring out the path on the - * server side when automounting on top of an existing partition. + * server side when automounting on top of an existing partition + * and in generating /proc/mounts and friends. 
+ * + * Supported flags: + * NFS_PATH_CANONICAL: ensure there is exactly one slash after + * the original device (export) name + * (if unset, the original name is returned verbatim) */ -char *nfs_path(const char *base, - const struct dentry *droot, - const struct dentry *dentry, - char *buffer, ssize_t buflen) +char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, + unsigned flags) { - char *end = buffer+buflen; + char *end; int namelen; + unsigned seq; + const char *base; +rename_retry: + end = buffer+buflen; *--end = '\0'; buflen--; - spin_lock(&dcache_lock); - while (!IS_ROOT(dentry) && dentry != droot) { + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + while (1) { + spin_lock(&dentry->d_lock); + if (IS_ROOT(dentry)) + break; namelen = dentry->d_name.len; buflen -= namelen + 1; if (buflen < 0) @@ -62,29 +73,60 @@ char *nfs_path(const char *base, end -= namelen; memcpy(end, dentry->d_name.name, namelen); *--end = '/'; + spin_unlock(&dentry->d_lock); dentry = dentry->d_parent; } - spin_unlock(&dcache_lock); + if (read_seqretry(&rename_lock, seq)) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + if ((flags & NFS_PATH_CANONICAL) && *end != '/') { + if (--buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + goto Elong; + } + *--end = '/'; + } + *p = end; + base = dentry->d_fsdata; + if (!base) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + WARN_ON(1); + return end; + } namelen = strlen(base); - /* Strip off excess slashes in base string */ - while (namelen > 0 && base[namelen - 1] == '/') - namelen--; + if (flags & NFS_PATH_CANONICAL) { + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + } buflen -= namelen; - if (buflen < 0) + if (buflen < 0) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); goto Elong; + } end -= namelen; memcpy(end, base, namelen); + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); return 
end; Elong_unlock: - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; Elong: return ERR_PTR(-ENAMETOOLONG); } +EXPORT_SYMBOL_GPL(nfs_path); /* - * nfs_follow_mountpoint - handle crossing a mountpoint on the server - * @dentry - dentry of mountpoint - * @nd - nameidata info + * nfs_d_automount - Handle crossing a mountpoint on the server + * @path - The mountpoint * * When we encounter a mountpoint on the server, we want to set up * a mountpoint on the client too, to prevent inode numbers from @@ -94,77 +136,72 @@ Elong: * situation, and that different filesystems may want to use * different security flavours. */ -static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(dentry->d_inode); - struct dentry *parent; - struct nfs_fh fh; - struct nfs_fattr fattr; - int err; + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); + struct nfs_fh *fh = NULL; + struct nfs_fattr *fattr = NULL; - dprintk("--> nfs_follow_mountpoint()\n"); + dprintk("--> nfs_d_automount()\n"); - BUG_ON(IS_ROOT(dentry)); - dprintk("%s: enter\n", __FUNCTION__); - dput(nd->path.dentry); - nd->path.dentry = dget(dentry); + mnt = ERR_PTR(-ESTALE); + if (IS_ROOT(path->dentry)) + goto out_nofree; - /* Look it up again */ - parent = dget_parent(nd->path.dentry); - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, - &nd->path.dentry->d_name, - &fh, &fattr); - dput(parent); - if (err != 0) - goto out_err; + mnt = ERR_PTR(-ENOMEM); + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fh == NULL || fattr == NULL) + goto out; - if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) - mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); - else - mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, - &fattr); - err = PTR_ERR(mnt); + dprintk("%s: enter\n", 
__func__); + + mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr); if (IS_ERR(mnt)) - goto out_err; - - mntget(mnt); - err = do_add_mount(mnt, nd, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, - &nfs_automount_list); - if (err < 0) { - mntput(mnt); - if (err == -EBUSY) - goto out_follow; - goto out_err; - } - mntput(nd->path.mnt); - dput(nd->path.dentry); - nd->path.mnt = mnt; - nd->path.dentry = dget(mnt->mnt_root); + goto out; + + dprintk("%s: done, success\n", __func__); + mntget(mnt); /* prevent immediate expiration */ + mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + out: - dprintk("%s: done, returned %d\n", __FUNCTION__, err); - - dprintk("<-- nfs_follow_mountpoint() = %d\n", err); - return ERR_PTR(err); -out_err: - path_put(&nd->path); - goto out; -out_follow: - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) - ; - err = 0; - goto out; + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out_nofree: + if (IS_ERR(mnt)) + dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); + else + dprintk("<-- %s() = %p\n", __func__, mnt); + return mnt; +} + +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_getattr(mnt, dentry, stat); + generic_fillattr(dentry->d_inode, stat); + return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_setattr(dentry, attr); + return -EACCES; } const struct inode_operations nfs_mountpoint_inode_operations = { - .follow_link = nfs_follow_mountpoint, .getattr = nfs_getattr, + .setattr = nfs_setattr, }; const struct inode_operations nfs_referral_inode_operations = { - .follow_link = nfs_follow_mountpoint, + .getattr = nfs_namespace_getattr, + .setattr = nfs_namespace_setattr, }; static void 
nfs_expire_automounts(struct work_struct *work) @@ -189,40 +226,26 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, const char *devname, struct nfs_clone_mount *mountdata) { -#ifdef CONFIG_NFS_V4 - struct vfsmount *mnt = NULL; - switch (server->nfs_client->rpc_ops->version) { - case 2: - case 3: - mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); - break; - case 4: - mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata); - } - return mnt; -#else return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); -#endif } /** * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @mnt_parent - mountpoint of parent directory * @dentry - parent directory * @fh - filehandle for new root dentry * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount * */ -static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, - const struct dentry *dentry, - struct nfs_fh *fh, - struct nfs_fattr *fattr) +struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, + struct nfs_fattr *fattr, rpc_authflavor_t authflavor) { struct nfs_clone_mount mountdata = { - .sb = mnt_parent->mnt_sb, + .sb = dentry->d_sb, .dentry = dentry, .fh = fh, .fattr = fattr, + .authflavor = authflavor, }; struct vfsmount *mnt = ERR_PTR(-ENOMEM); char *page = (char *) __get_free_page(GFP_USER); @@ -230,21 +253,37 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, dprintk("--> nfs_do_submount()\n"); - dprintk("%s: submounting on %s/%s\n", __FUNCTION__, - dentry->d_parent->d_name.name, - dentry->d_name.name); + dprintk("%s: submounting on %pd2\n", __func__, + dentry); if (page == NULL) goto out; - devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); + devname = nfs_devname(dentry, page, PAGE_SIZE); mnt = (struct vfsmount *)devname; if (IS_ERR(devname)) goto free_page; - mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), 
devname, &mountdata); + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); free_page: free_page((unsigned long)page); out: - dprintk("%s: done\n", __FUNCTION__); + dprintk("%s: done\n", __func__); dprintk("<-- nfs_do_submount() = %p\n", mnt); return mnt; } +EXPORT_SYMBOL_GPL(nfs_do_submount); + +struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + int err; + struct dentry *parent = dget_parent(dentry); + + /* Look it up again to get its attributes */ + err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL); + dput(parent); + if (err != 0) + return ERR_PTR(err); + + return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor); +} +EXPORT_SYMBOL_GPL(nfs_submount); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h new file mode 100644 index 00000000000..8ee1fab8326 --- /dev/null +++ b/fs/nfs/netns.h @@ -0,0 +1,36 @@ +/* + * NFS-private data for each "struct net". Accessed with net_generic(). 
+ */ + +#ifndef __NFS_NETNS_H__ +#define __NFS_NETNS_H__ + +#include <linux/nfs4.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +struct bl_dev_msg { + int32_t status; + uint32_t major, minor; +}; + +struct nfs_net { + struct cache_detail *nfs_dns_resolve; + struct rpc_pipe *bl_device_pipe; + struct bl_dev_msg bl_mount_reply; + wait_queue_head_t bl_wq; + struct list_head nfs_client_list; + struct list_head nfs_volume_list; +#if IS_ENABLED(CONFIG_NFS_V4) + struct idr cb_ident_idr; /* Protected by nfs_client_lock */ + unsigned short nfs_callback_tcpport; + unsigned short nfs_callback_tcpport6; + int cb_users[NFS4_MAX_MINOR_VERSION + 1]; +#endif + spinlock_t nfs_client_lock; + struct timespec boot_time; +}; + +extern int nfs_net_id; + +#endif diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h new file mode 100644 index 00000000000..43679df56cd --- /dev/null +++ b/fs/nfs/nfs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + * + * Function and structures exported by the NFS module + * for use by NFS version-specific modules. 
+ */ +#ifndef __LINUX_INTERNAL_NFS_H +#define __LINUX_INTERNAL_NFS_H + +#include <linux/fs.h> +#include <linux/sunrpc/sched.h> +#include <linux/nfs_xdr.h> + +struct nfs_subversion { + struct module *owner; /* THIS_MODULE pointer */ + struct file_system_type *nfs_fs; /* NFS filesystem type */ + const struct rpc_version *rpc_vers; /* NFS version information */ + const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ + const struct super_operations *sops; /* NFS Super operations */ + const struct xattr_handler **xattr; /* NFS xattr handlers */ + struct list_head list; /* List of NFS versions */ +}; + +struct nfs_subversion *get_nfs_version(unsigned int); +void put_nfs_version(struct nfs_subversion *); +void register_nfs_version(struct nfs_subversion *); +void unregister_nfs_version(struct nfs_subversion *); + +#endif /* __LINUX_INTERNAL_NFS_H */ diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c new file mode 100644 index 00000000000..0a9782c9171 --- /dev/null +++ b/fs/nfs/nfs2super.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. 
+ */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include "internal.h" +#include "nfs.h" + +static struct nfs_subversion nfs_v2 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs_fs_type, + .rpc_vers = &nfs_version2, + .rpc_ops = &nfs_v2_clientops, + .sops = &nfs_sops, +}; + +static int __init init_nfs_v2(void) +{ + register_nfs_version(&nfs_v2); + return 0; +} + +static void __exit exit_nfs_v2(void) +{ + unregister_nfs_version(&nfs_v2); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v2); +module_exit(exit_nfs_v2); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 1f7ea675e0c..5f61b83f4a1 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -12,8 +12,6 @@ #include <linux/param.h> #include <linux/time.h> #include <linux/mm.h> -#include <linux/slab.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/in.h> @@ -63,653 +61,1061 @@ #define NFS_readdirres_sz (1) #define NFS_statfsres_sz (1+NFS_info_sz) +static int nfs_stat_to_errno(enum nfs_stat); + /* - * Common NFS XDR functions as inlines + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. 
*/ -static inline __be32 * -xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fhandle) +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) { - memcpy(p, fhandle->data, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); -} + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; -static inline __be32 * -xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle) -{ - /* NFSv2 handles have a fixed length */ - fhandle->size = NFS2_FHSIZE; - memcpy(fhandle->data, p, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); } -static inline __be32* -xdr_encode_time(__be32 *p, struct timespec *timep) +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) { - *p++ = htonl(timep->tv_sec); - /* Convert nanoseconds into microseconds */ - *p++ = htonl(timep->tv_nsec ? timep->tv_nsec / 1000 : 0); - return p; + dprintk("NFS: %s prematurely hit the end of our receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); } -static inline __be32* -xdr_encode_current_server_time(__be32 *p, struct timespec *timep) + +/* + * Encode/decode NFSv2 basic data types + * + * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: + * "NFS: Network File System Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. + */ + +/* + * typedef opaque nfsdata<>; + */ +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) { - /* - * Passing the invalid value useconds=1000000 is a - * Sun convention for "set to current server time". 
- * It's needed to make permissions checks for the - * "touch" program across v2 mounts to Solaris and - * Irix boxes work correctly. See description of - * sattr in section 6.1 of "NFS Illustrated" by - * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 - */ - *p++ = htonl(timep->tv_sec); - *p++ = htonl(1000000); - return p; + u32 recvd, count; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + recvd = xdr_read_pages(xdr, count); + if (unlikely(count > recvd)) + goto out_cheating; +out: + result->eof = 0; /* NFSv2 does not pass EOF flag on the wire. */ + result->count = count; + return count; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline __be32* -xdr_decode_time(__be32 *p, struct timespec *timep) +/* + * enum stat { + * NFS_OK = 0, + * NFSERR_PERM = 1, + * NFSERR_NOENT = 2, + * NFSERR_IO = 5, + * NFSERR_NXIO = 6, + * NFSERR_ACCES = 13, + * NFSERR_EXIST = 17, + * NFSERR_NODEV = 19, + * NFSERR_NOTDIR = 20, + * NFSERR_ISDIR = 21, + * NFSERR_FBIG = 27, + * NFSERR_NOSPC = 28, + * NFSERR_ROFS = 30, + * NFSERR_NAMETOOLONG = 63, + * NFSERR_NOTEMPTY = 66, + * NFSERR_DQUOT = 69, + * NFSERR_STALE = 70, + * NFSERR_WFLUSH = 99 + * }; + */ +static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) { - timep->tv_sec = ntohl(*p++); - /* Convert microseconds into nanoseconds */ - timep->tv_nsec = ntohl(*p++) * 1000; - return p; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static __be32 * -xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) -{ - u32 rdev; - fattr->type = (enum nfs_ftype) ntohl(*p++); - fattr->mode = ntohl(*p++); - fattr->nlink = ntohl(*p++); - 
fattr->uid = ntohl(*p++); - fattr->gid = ntohl(*p++); - fattr->size = ntohl(*p++); - fattr->du.nfs2.blocksize = ntohl(*p++); - rdev = ntohl(*p++); - fattr->du.nfs2.blocks = ntohl(*p++); - fattr->fsid.major = ntohl(*p++); - fattr->fsid.minor = 0; - fattr->fileid = ntohl(*p++); - p = xdr_decode_time(p, &fattr->atime); - p = xdr_decode_time(p, &fattr->mtime); - p = xdr_decode_time(p, &fattr->ctime); - fattr->valid |= NFS_ATTR_FATTR; - fattr->rdev = new_decode_dev(rdev); - if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) { - fattr->type = NFFIFO; - fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; - fattr->rdev = 0; - } +/* + * 2.3.2. ftype + * + * enum ftype { + * NFNON = 0, + * NFREG = 1, + * NFDIR = 2, + * NFBLK = 3, + * NFCHR = 4, + * NFLNK = 5 + * }; + * + */ +static __be32 *xdr_decode_ftype(__be32 *p, u32 *type) +{ + *type = be32_to_cpup(p++); + if (unlikely(*type > NF2FIFO)) + *type = NFBAD; return p; } -static inline __be32 * -xdr_encode_sattr(__be32 *p, struct iattr *attr) +/* + * 2.3.3. fhandle + * + * typedef opaque fhandle[FHSIZE]; + */ +static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh) { - const __be32 not_set = __constant_htonl(0xFFFFFFFF); - - *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; - *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; - *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set; - *p++ = (attr->ia_valid & ATTR_SIZE) ? 
htonl(attr->ia_size) : not_set; + __be32 *p; - if (attr->ia_valid & ATTR_ATIME_SET) { - p = xdr_encode_time(p, &attr->ia_atime); - } else if (attr->ia_valid & ATTR_ATIME) { - p = xdr_encode_current_server_time(p, &attr->ia_atime); - } else { - *p++ = not_set; - *p++ = not_set; - } - - if (attr->ia_valid & ATTR_MTIME_SET) { - p = xdr_encode_time(p, &attr->ia_mtime); - } else if (attr->ia_valid & ATTR_MTIME) { - p = xdr_encode_current_server_time(p, &attr->ia_mtime); - } else { - *p++ = not_set; - *p++ = not_set; - } - return p; + p = xdr_reserve_space(xdr, NFS2_FHSIZE); + memcpy(p, fh->data, NFS2_FHSIZE); } -/* - * NFS encode functions - */ -/* - * Encode file handle argument - * GETATTR, READLINK, STATFS - */ -static int -nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) +static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) { - p = xdr_encode_fhandle(p, fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + __be32 *p; + + p = xdr_inline_decode(xdr, NFS2_FHSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, NFS2_FHSIZE); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode SETATTR arguments + * 2.3.4. timeval + * + * struct timeval { + * unsigned int seconds; + * unsigned int useconds; + * }; */ -static int -nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args) +static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + *p++ = cpu_to_be32(timep->tv_sec); + if (timep->tv_nsec != 0) + *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); + else + *p++ = cpu_to_be32(0); + return p; } /* - * Encode directory ops argument - * LOOKUP, RMDIR + * Passing the invalid value useconds=1000000 is a Sun convention for + * "set to current server time". 
It's needed to make permissions checks + * for the "touch" program across v2 mounts to Solaris and Irix servers + * work correctly. See description of sattr in section 6.1 of "NFS + * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. */ -static int -nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args) +static __be32 *xdr_encode_current_server_time(__be32 *p, + const struct timespec *timep) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(1000000); + return p; } -/* - * Encode REMOVE argument - */ -static int -nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name.name, args->name.len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; + return p; } /* - * Arguments to a READ call. Since we read data directly into the page - * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page we want to fetch. + * 2.3.5. 
fattr + * + * struct fattr { + * ftype type; + * unsigned int mode; + * unsigned int nlink; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * unsigned int blocksize; + * unsigned int rdev; + * unsigned int blocks; + * unsigned int fsid; + * unsigned int fileid; + * timeval atime; + * timeval mtime; + * timeval ctime; + * }; + * */ -static int -nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - unsigned int replen; - u32 offset = (u32)args->offset; - u32 count = args->count; + u32 rdev, type; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_V2; + + p = xdr_decode_ftype(p, &type); + + fattr->mode = be32_to_cpup(p++); + fattr->nlink = be32_to_cpup(p++); + fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + if (!uid_valid(fattr->uid)) + goto out_uid; + fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + if (!gid_valid(fattr->gid)) + goto out_gid; + + fattr->size = be32_to_cpup(p++); + fattr->du.nfs2.blocksize = be32_to_cpup(p++); + + rdev = be32_to_cpup(p++); + fattr->rdev = new_decode_dev(rdev); + if (type == (u32)NFCHR && rdev == (u32)NFS2_FIFO_DEV) { + fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; + fattr->rdev = 0; + } - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(offset); - *p++ = htonl(count); - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + fattr->du.nfs2.blocks = be32_to_cpup(p++); + fattr->fsid.major = be32_to_cpup(p++); + fattr->fsid.minor = 0; + fattr->fileid = be32_to_cpup(p++); + + p = xdr_decode_time(p, &fattr->atime); + p = xdr_decode_time(p, &fattr->mtime); + xdr_decode_time(p, &fattr->ctime); + fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); - /* Inline the page array */ - replen = (RPC_REPHDRSIZE 
+ auth->au_rslack + NFS_readres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, - args->pages, args->pgbase, count); - req->rq_rcv_buf.flags |= XDRBUF_READ; return 0; +out_uid: + dprintk("NFS: returned invalid uid\n"); + return -EINVAL; +out_gid: + dprintk("NFS: returned invalid gid\n"); + return -EINVAL; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Decode READ reply + * 2.3.6. sattr + * + * struct sattr { + * unsigned int mode; + * unsigned int uid; + * unsigned int gid; + * unsigned int size; + * timeval atime; + * timeval mtime; + * }; */ -static int -nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) -{ - struct kvec *iov = req->rq_rcv_buf.head; - size_t hdrlen; - u32 count, recvd; - int status; - if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); - p = xdr_decode_fattr(p, res->fattr); +#define NFS2_SATTR_NOT_SET (0xffffffff) - count = ntohl(*p++); - res->eof = 0; - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READ reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READ header is short. 
iovec will be shifted.\n"); - xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); - } +static __be32 *xdr_time_not_set(__be32 *p) +{ + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + return p; +} - recvd = req->rq_rcv_buf.len - hdrlen; - if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - count = recvd; - } +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); + + if (attr->ia_valid & ATTR_MODE) + *p++ = cpu_to_be32(attr->ia_mode); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_UID) + *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_GID) + *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + if (attr->ia_valid & ATTR_SIZE) + *p++ = cpu_to_be32((u32)attr->ia_size); + else + *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); + + if (attr->ia_valid & ATTR_ATIME_SET) + p = xdr_encode_time(p, &attr->ia_atime); + else if (attr->ia_valid & ATTR_ATIME) + p = xdr_encode_current_server_time(p, &attr->ia_atime); + else + p = xdr_time_not_set(p); + if (attr->ia_valid & ATTR_MTIME_SET) + xdr_encode_time(p, &attr->ia_mtime); + else if (attr->ia_valid & ATTR_MTIME) + xdr_encode_current_server_time(p, &attr->ia_mtime); + else + xdr_time_not_set(p); +} - dprintk("RPC: readres OK count %u\n", count); - if (count < res->count) - res->count = count; +/* + * 2.3.7. 
filename + * + * typedef string filename<MAXNAMLEN>; + */ +static void encode_filename(struct xdr_stream *xdr, + const char *name, u32 length) +{ + __be32 *p; - return count; + WARN_ON_ONCE(length > NFS2_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); } +static int decode_filename_inline(struct xdr_stream *xdr, + const char **name, u32 *length) +{ + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} /* - * Write arguments. Splice the buffer to be written into the iovec. + * 2.3.8. path + * + * typedef string path<MAXPATHLEN>; */ -static int -nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - u32 offset = (u32)args->offset; - u32 count = args->count; + __be32 *p; - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(offset); - *p++ = htonl(offset); - *p++ = htonl(count); - *p++ = htonl(count); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(length); + xdr_write_pages(xdr, pages, 0, length); +} - /* Copy the page array */ - xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); - sndbuf->flags |= XDRBUF_WRITE; +static int decode_path(struct xdr_stream *xdr) +{ + u32 length, recvd; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p); + if (unlikely(length >= xdr->buf->page_len || length > 
NFS_MAXPATHLEN)) + goto out_size; + recvd = xdr_read_pages(xdr, length); + if (unlikely(length > recvd)) + goto out_cheating; + xdr_terminate_string(xdr->buf, length); return 0; +out_size: + dprintk("NFS: returned pathname too long: %u\n", length); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "length %u > received %u\n", length, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode create arguments - * CREATE, MKDIR + * 2.3.9. attrstat + * + * union attrstat switch (stat status) { + * case NFS_OK: + * fattr attributes; + * default: + * void; + * }; */ -static int -nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args) +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Encode RENAME arguments + * 2.3.10. diropargs + * + * struct diropargs { + * fhandle dir; + * filename name; + * }; */ -static int -nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args) +static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_fhandle(xdr, fh); + encode_filename(xdr, name, length); } /* - * Encode LINK arguments + * 2.3.11. 
diropres + * + * union diropres switch (stat status) { + * case NFS_OK: + * struct { + * fhandle file; + * fattr attributes; + * } diropok; + * default: + * void; + * }; */ -static int -nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args) +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + int error; + + error = decode_fhandle(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_fattr(xdr, result->fattr); +out: + return error; +} + +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_diropok(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } + /* - * Encode SYMLINK arguments + * NFSv2 XDR encode functions + * + * NFSv2 argument types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". */ -static int -nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args) + +static void nfs2_xdr_enc_fhandle(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - size_t pad; + encode_fhandle(xdr, fh); +} - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - *p++ = htonl(args->pathlen); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); +/* + * 2.2.3. 
sattrargs + * + * struct sattrargs { + * fhandle file; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_sattrargs *args) +{ + encode_fhandle(xdr, args->fh); + encode_sattr(xdr, args->sattr); +} - xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen); +static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_diropargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); +} - /* - * xdr_encode_pages may have added a few bytes to ensure the - * pathname ends on a 4-byte boundary. Start encoding the - * attributes after the pad bytes. - */ - pad = sndbuf->tail->iov_len; - if (pad > 0) - p++; - p = xdr_encode_sattr(p, args->sattr); - sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad; - return 0; +static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readlinkargs *args) +{ + encode_fhandle(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); } /* - * Encode arguments to readdir call + * 2.2.7. 
readargs + * + * struct readargs { + * fhandle file; + * unsigned offset; + * unsigned count; + * unsigned totalcount; + * }; */ -static int -nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) +static void encode_readargs(struct xdr_stream *xdr, + const struct nfs_pgio_args *args) { - struct rpc_task *task = req->rq_task; - struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth; - unsigned int replen; + u32 offset = args->offset; u32 count = args->count; + __be32 *p; - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->cookie); - *p++ = htonl(count); /* see above */ - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + encode_fhandle(xdr, args->fh); - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readdirres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); - return 0; + p = xdr_reserve_space(xdr, 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + *p = cpu_to_be32(count); } -/* - * Decode the result of a readdir call. - * We're not really decoding anymore, we just leave the buffer untouched - * and only check that it is syntactically correct. - * The real decoding happens in nfs_decode_entry below, called directly - * from nfs_readdir for each entry. - */ -static int -nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) -{ - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - struct page **page; - size_t hdrlen; - unsigned int pglen, recvd; - u32 len; - int status, nr; - __be32 *end, *entry, *kaddr; - - if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); - - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READDIR reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READDIR header is short. 
iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } - - pglen = rcvbuf->page_len; - recvd = rcvbuf->len - hdrlen; - if (pglen > recvd) - pglen = recvd; - page = rcvbuf->pages; - kaddr = p = kmap_atomic(*page, KM_USER0); - end = (__be32 *)((char *)p + pglen); - entry = p; - for (nr = 0; *p++; nr++) { - if (p + 2 > end) - goto short_pkt; - p++; /* fileid */ - len = ntohl(*p++); - p += XDR_QUADLEN(len) + 1; /* name plus cookie */ - if (len > NFS2_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)!\n", - len); - goto err_unmap; - } - if (p + 2 > end) - goto short_pkt; - entry = p; - } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; - out: - kunmap_atomic(kaddr, KM_USER0); - return nr; - short_pkt: - entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } - goto out; -err_unmap: - nr = -errno_NFSERR_IO; - goto out; +static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + encode_readargs(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; } -__be32 * -nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +/* + * 2.2.9. 
writeargs + * + * struct writeargs { + * fhandle file; + * unsigned beginoffset; + * unsigned offset; + * unsigned totalcount; + * nfsdata data; + * }; + */ +static void encode_writeargs(struct xdr_stream *xdr, + const struct nfs_pgio_args *args) { - if (!*p++) { - if (!*p) - return ERR_PTR(-EAGAIN); - entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); - } + u32 offset = args->offset; + u32 count = args->count; + __be32 *p; - entry->ino = ntohl(*p++); - entry->len = ntohl(*p++); - entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); - entry->prev_cookie = entry->cookie; - entry->cookie = ntohl(*p++); - entry->eof = !p[0] && p[1]; + encode_fhandle(xdr, args->fh); - return p; + p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(offset); + *p++ = cpu_to_be32(count); + + /* nfsdata */ + *p = cpu_to_be32(count); + xdr_write_pages(xdr, args->pages, args->pgbase, count); +} + +static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + encode_writeargs(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* - * NFS XDR decode functions + * 2.2.10. createargs + * + * struct createargs { + * diropargs where; + * sattr attributes; + * }; */ +static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_createargs *args) +{ + encode_diropargs(xdr, args->fh, args->name, args->len); + encode_sattr(xdr, args->sattr); +} + +static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) +{ + encode_diropargs(xdr, args->fh, args->name.name, args->name.len); +} + /* - * Decode simple status reply + * 2.2.12. 
renameargs + * + * struct renameargs { + * diropargs from; + * diropargs to; + * }; */ -static int -nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy) +static void nfs2_xdr_enc_renameargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - int status; + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; - if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); - return status; + encode_diropargs(xdr, args->old_dir, old->name, old->len); + encode_diropargs(xdr, args->new_dir, new->name, new->len); } /* - * Decode attrstat reply - * GETATTR, SETATTR, WRITE + * 2.2.13. linkargs + * + * struct linkargs { + * fhandle from; + * diropargs to; + * }; */ -static int -nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static void nfs2_xdr_enc_linkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_linkargs *args) { - int status; + encode_fhandle(xdr, args->fromfh); + encode_diropargs(xdr, args->tofh, args->toname, args->tolen); +} - if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); - xdr_decode_fattr(p, fattr); - return 0; +/* + * 2.2.14. symlinkargs + * + * struct symlinkargs { + * diropargs from; + * path to; + * sattr attributes; + * }; + */ +static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_symlinkargs *args) +{ + encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); + encode_path(xdr, args->pages, args->pathlen); + encode_sattr(xdr, args->sattr); } /* - * Decode diropres reply - * LOOKUP, CREATE, MKDIR + * 2.2.17. 
readdirargs + * + * struct readdirargs { + * fhandle dir; + * nfscookie cookie; + * unsigned count; + * }; */ -static int -nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) +static void encode_readdirargs(struct xdr_stream *xdr, + const struct nfs_readdirargs *args) { - int status; + __be32 *p; - if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); - p = xdr_decode_fhandle(p, res->fh); - xdr_decode_fattr(p, res->fattr); - return 0; + encode_fhandle(xdr, args->fh); + + p = xdr_reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(args->cookie); + *p = cpu_to_be32(args->count); +} + +static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_readdirargs *args) +{ + encode_readdirargs(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS_readdirres_sz); } /* - * Encode READLINK args + * NFSv2 XDR decode functions + * + * NFSv2 result types are defined in section 2.2 of RFC 1094: + * "NFS: Network File System Protocol Specification". 
*/ -static int -nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) + +static int nfs2_xdr_dec_stat(struct rpc_rqst *req, struct xdr_stream *xdr, + void *__unused) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - unsigned int replen; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} - p = xdr_encode_fhandle(p, args->fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + return decode_attrstat(xdr, result); +} - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readlinkres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); - return 0; +static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_diropok *result) +{ + return decode_diropres(xdr, result); } /* - * Decode READLINK reply + * 2.2.6. 
readlinkres + * + * union readlinkres switch (stat status) { + * case NFS_OK: + * path data; + * default: + * void; + * }; */ -static int -nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) +static int nfs2_xdr_dec_readlinkres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; - u32 len, recvd; - char *kaddr; - int status; + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_path(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} - if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); - /* Convert length of symlink */ - len = ntohl(*p++); - if (len >= rcvbuf->page_len) { - dprintk("nfs: server returned giant symlink!\n"); - return -ENAMETOOLONG; - } - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READLINK reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } - recvd = req->rq_rcv_buf.len - hdrlen; - if (recvd < len) { - dprintk("NFS: server cheating in readlink reply: " - "count %u > recvd %u\n", len, recvd); - return -EIO; +/* + * 2.2.7. 
readres + * + * union readres switch (stat status) { + * case NFS_OK: + * fattr attributes; + * nfsdata data; + * default: + * void; + * }; + */ +static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_fattr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_nfsdata(xdr, result); +out: + return error; +out_default: + return nfs_stat_to_errno(status); +} + +static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + /* All NFSv2 writes are "file sync" writes */ + result->verf->committed = NFS_FILE_SYNC; + return decode_attrstat(xdr, result->fattr); +} + +/** + * nfs2_decode_dirent - Decode a single NFSv2 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 2.2.17. 
entry + * + * struct entry { + * unsigned fileid; + * filename name; + * nfscookie cookie; + * entry *nextentry; + * }; + */ +int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p++ == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; } - /* NULL terminate the string we got */ - kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->ino = be32_to_cpup(p); + + error = decode_filename_inline(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + /* + * The type (size and byte order) of nfscookie isn't defined in + * RFC 1094. This implementation assumes that it's an XDR uint32. + */ + entry->prev_cookie = entry->cookie; + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + entry->cookie = be32_to_cpup(p); + + entry->d_type = DT_UNKNOWN; + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; } /* - * Decode WRITE reply + * 2.2.17. readdirres + * + * union readdirres switch (stat status) { + * case NFS_OK: + * struct { + * entry *entries; + * bool eof; + * } readdirok; + * default: + * void; + * }; + * + * Read the directory contents into the page cache, but don't + * touch them. The actual decoding is done by nfs2_decode_dirent() + * during subsequent nfs_readdir() calls. 
*/ -static int -nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) +static int decode_readdirok(struct xdr_stream *xdr) +{ + return xdr_read_pages(xdr, xdr->buf->page_len); +} + +static int nfs2_xdr_dec_readdirres(struct rpc_rqst *req, + struct xdr_stream *xdr, void *__unused) { - res->verf->committed = NFS_FILE_SYNC; - return nfs_xdr_attrstat(req, p, res->fattr); + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_readdirok(xdr); +out: + return error; +out_default: + return nfs_stat_to_errno(status); } /* - * Decode STATFS reply + * 2.2.18. statfsres + * + * union statfsres (stat status) { + * case NFS_OK: + * struct { + * unsigned tsize; + * unsigned bsize; + * unsigned blocks; + * unsigned bfree; + * unsigned bavail; + * } info; + * default: + * void; + * }; */ -static int -nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res) +static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) { - int status; - - if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); - - res->tsize = ntohl(*p++); - res->bsize = ntohl(*p++); - res->blocks = ntohl(*p++); - res->bfree = ntohl(*p++); - res->bavail = ntohl(*p++); + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_info_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + result->tsize = be32_to_cpup(p++); + result->bsize = be32_to_cpup(p++); + result->blocks = be32_to_cpup(p++); + result->bfree = be32_to_cpup(p++); + result->bavail = be32_to_cpup(p); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } +static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs2_fsstat *result) +{ + enum nfs_stat status; + int error; + + error = decode_stat(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS_OK) + goto out_default; + error = decode_info(xdr, result); +out: 
+ return error; +out_default: + return nfs_stat_to_errno(status); +} + + /* * We need to translate between nfs status return values and * the local errno values which may not be the same. */ -static struct { +static const struct { int stat; int errno; } nfs_errtbl[] = { { NFS_OK, 0 }, - { NFSERR_PERM, EPERM }, - { NFSERR_NOENT, ENOENT }, - { NFSERR_IO, errno_NFSERR_IO }, - { NFSERR_NXIO, ENXIO }, -/* { NFSERR_EAGAIN, EAGAIN }, */ - { NFSERR_ACCES, EACCES }, - { NFSERR_EXIST, EEXIST }, - { NFSERR_XDEV, EXDEV }, - { NFSERR_NODEV, ENODEV }, - { NFSERR_NOTDIR, ENOTDIR }, - { NFSERR_ISDIR, EISDIR }, - { NFSERR_INVAL, EINVAL }, - { NFSERR_FBIG, EFBIG }, - { NFSERR_NOSPC, ENOSPC }, - { NFSERR_ROFS, EROFS }, - { NFSERR_MLINK, EMLINK }, - { NFSERR_NAMETOOLONG, ENAMETOOLONG }, - { NFSERR_NOTEMPTY, ENOTEMPTY }, - { NFSERR_DQUOT, EDQUOT }, - { NFSERR_STALE, ESTALE }, - { NFSERR_REMOTE, EREMOTE }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, #ifdef EWFLUSH - { NFSERR_WFLUSH, EWFLUSH }, + { NFSERR_WFLUSH, -EWFLUSH }, #endif - { NFSERR_BADHANDLE, EBADHANDLE }, - { NFSERR_NOT_SYNC, ENOTSYNC }, - { NFSERR_BAD_COOKIE, EBADCOOKIE }, - { NFSERR_NOTSUPP, ENOTSUPP }, - { NFSERR_TOOSMALL, ETOOSMALL }, - { NFSERR_SERVERFAULT, ESERVERFAULT }, - { NFSERR_BADTYPE, EBADTYPE }, - { NFSERR_JUKEBOX, EJUKEBOX }, - { -1, EIO } + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, 
-EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } }; -/* - * Convert an NFS error code to a local one. - * This one is used jointly by NFSv2 and NFSv3. +/** + * nfs_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. */ -int -nfs_stat_to_errno(int stat) +static int nfs_stat_to_errno(enum nfs_stat status) { int i; for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == stat) + if (nfs_errtbl[i].stat == (int)status) return nfs_errtbl[i].errno; } - dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); + dprintk("NFS: Unrecognized nfs status value: %u\n", status); return nfs_errtbl[i].errno; } #define PROC(proc, argtype, restype, timer) \ [NFSPROC_##proc] = { \ .p_proc = NFSPROC_##proc, \ - .p_encode = (kxdrproc_t) nfs_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs_xdr_##restype, \ + .p_encode = (kxdreproc_t)nfs2_xdr_enc_##argtype, \ + .p_decode = (kxdrdproc_t)nfs2_xdr_dec_##restype, \ .p_arglen = NFS_##argtype##_sz, \ .p_replen = NFS_##restype##_sz, \ .p_timer = timer, \ @@ -717,24 +1123,24 @@ nfs_stat_to_errno(int stat) .p_name = #proc, \ } struct rpc_procinfo nfs_procedures[] = { - PROC(GETATTR, fhandle, attrstat, 1), - PROC(SETATTR, sattrargs, attrstat, 0), - PROC(LOOKUP, diropargs, diropres, 2), - PROC(READLINK, readlinkargs, readlinkres, 3), - PROC(READ, readargs, readres, 3), - PROC(WRITE, writeargs, writeres, 4), - PROC(CREATE, createargs, diropres, 0), - PROC(REMOVE, removeargs, stat, 0), - PROC(RENAME, renameargs, stat, 0), - PROC(LINK, linkargs, stat, 0), - PROC(SYMLINK, symlinkargs, stat, 0), - PROC(MKDIR, createargs, diropres, 0), - PROC(RMDIR, diropargs, stat, 0), - PROC(READDIR, 
readdirargs, readdirres, 3), - PROC(STATFS, fhandle, statfsres, 0), + PROC(GETATTR, fhandle, attrstat, 1), + PROC(SETATTR, sattrargs, attrstat, 0), + PROC(LOOKUP, diropargs, diropres, 2), + PROC(READLINK, readlinkargs, readlinkres, 3), + PROC(READ, readargs, readres, 3), + PROC(WRITE, writeargs, writeres, 4), + PROC(CREATE, createargs, diropres, 0), + PROC(REMOVE, removeargs, stat, 0), + PROC(RENAME, renameargs, stat, 0), + PROC(LINK, linkargs, stat, 0), + PROC(SYMLINK, symlinkargs, stat, 0), + PROC(MKDIR, createargs, diropres, 0), + PROC(RMDIR, diropargs, stat, 0), + PROC(READDIR, readdirargs, readdirres, 3), + PROC(STATFS, fhandle, statfsres, 0), }; -struct rpc_version nfs_version2 = { +const struct rpc_version nfs_version2 = { .number = 2, .nrprocs = ARRAY_SIZE(nfs_procedures), .procs = nfs_procedures diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9b7362565c0..8f854dde415 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -1,188 +1,18 @@ #include <linux/fs.h> +#include <linux/gfp.h> #include <linux/nfs.h> #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/posix_acl_xattr.h> #include <linux/nfsacl.h> -#define NFSDBG_FACILITY NFSDBG_PROC - -ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int pos=0, len=0; - -# define output(s) do { \ - if (pos + sizeof(s) <= size) { \ - memcpy(buffer + pos, s, sizeof(s)); \ - pos += sizeof(s); \ - } \ - len += sizeof(s); \ - } while(0) - - acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_access"); - posix_acl_release(acl); - } - - if (S_ISDIR(inode->i_mode)) { - acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_default"); - posix_acl_release(acl); - } - } - -# undef output - - if (!buffer || len <= size) - return len; - return -ERANGE; -} - -ssize_t 
nfs3_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error = 0; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = nfs3_proc_getacl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - if (type == ACL_TYPE_ACCESS && acl->a_count == 0) - error = -ENODATA; - else - error = posix_acl_to_xattr(acl, buffer, size); - posix_acl_release(acl); - } else - error = -ENODATA; - - return error; -} - -int nfs3_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = nfs3_proc_setacl(inode, type, acl); - posix_acl_release(acl); - - return error; -} - -int nfs3_removexattr(struct dentry *dentry, const char *name) -{ - struct inode *inode = dentry->d_inode; - int type; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; +#include "internal.h" - return nfs3_proc_setacl(inode, type, NULL); -} - -static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) -{ - if (!IS_ERR(nfsi->acl_access)) { - posix_acl_release(nfsi->acl_access); - nfsi->acl_access = ERR_PTR(-EAGAIN); - } - if (!IS_ERR(nfsi->acl_default)) { - posix_acl_release(nfsi->acl_default); - nfsi->acl_default = ERR_PTR(-EAGAIN); - } -} - -void nfs3_forget_cached_acls(struct inode *inode) -{ - dprintk("NFS: 
nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, - inode->i_ino); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - spin_unlock(&inode->i_lock); -} - -static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) -{ - struct nfs_inode *nfsi = NFS_I(inode); - struct posix_acl *acl = ERR_PTR(-EINVAL); - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - acl = nfsi->acl_access; - break; - - case ACL_TYPE_DEFAULT: - acl = nfsi->acl_default; - break; - - default: - goto out; - } - if (IS_ERR(acl)) - acl = ERR_PTR(-EAGAIN); - else - acl = posix_acl_dup(acl); -out: - spin_unlock(&inode->i_lock); - dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, - inode->i_ino, type, acl); - return acl; -} - -static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, - inode->i_ino, acl, dfacl); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - if (!IS_ERR(acl)) - nfsi->acl_access = posix_acl_dup(acl); - if (!IS_ERR(dfacl)) - nfsi->acl_default = posix_acl_dup(dfacl); - spin_unlock(&inode->i_lock); -} +#define NFSDBG_FACILITY NFSDBG_PROC -struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_fattr fattr; struct page *pages[NFSACL_MAXPAGES] = { }; struct nfs3_getaclargs args = { .fh = NFS_FH(inode), @@ -190,13 +20,12 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) .pages = pages, }; struct nfs3_getaclres res = { - .fattr = &fattr, + NULL, }; struct rpc_message msg = { .rpc_argp = &args, .rpc_resp = &res, }; - struct posix_acl *acl; int status, count; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -205,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int 
type) status = nfs_revalidate_inode(server, inode); if (status < 0) return ERR_PTR(status); - acl = nfs3_get_cached_acl(inode, type); - if (acl != ERR_PTR(-EAGAIN)) - return acl; - acl = NULL; /* * Only get the access acl when explicitly requested: We don't @@ -225,6 +50,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) dprintk("NFS call getacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return ERR_PTR(-ENOMEM); + status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -234,7 +63,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) switch (status) { case 0: - status = nfs_refresh_inode(inode, &fattr); + status = nfs_refresh_inode(inode, res.fattr); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: @@ -251,43 +80,45 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) } if (res.acl_access != NULL) { - if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) || + res.acl_access->a_count == 0) { posix_acl_release(res.acl_access); res.acl_access = NULL; } } - nfs3_cache_acls(inode, - (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), - (res.mask & NFS_DFACL) ? 
res.acl_default : ERR_PTR(-EINVAL)); - switch(type) { - case ACL_TYPE_ACCESS: - acl = res.acl_access; - res.acl_access = NULL; - break; + if (res.mask & NFS_ACL) + set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); + else + forget_cached_acl(inode, ACL_TYPE_ACCESS); - case ACL_TYPE_DEFAULT: - acl = res.acl_default; - res.acl_default = NULL; + if (res.mask & NFS_DFACL) + set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); + else + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + nfs_free_fattr(res.fattr); + if (type == ACL_TYPE_ACCESS) { + posix_acl_release(res.acl_default); + return res.acl_access; + } else { + posix_acl_release(res.acl_access); + return res.acl_default; } getout: posix_acl_release(res.acl_access); posix_acl_release(res.acl_default); - - if (status != 0) { - posix_acl_release(acl); - acl = ERR_PTR(status); - } - return acl; + nfs_free_fattr(res.fattr); + return ERR_PTR(status); } -static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) +static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_fattr fattr; - struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs_fattr *fattr; + struct page *pages[NFSACL_MAXPAGES]; struct nfs3_setaclargs args = { .inode = inode, .mask = NFS_ACL, @@ -298,14 +129,14 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status, count; + int status; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) goto out; - /* We are doing this here, because XDR marshalling can only - return -ENOMEM. */ + /* We are doing this here because XDR marshalling does not + * return any results, it BUGs. 
*/ status = -ENOSPC; if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) goto out; @@ -314,24 +145,40 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, if (S_ISDIR(inode->i_mode)) { args.mask |= NFS_DFACL; args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); } dprintk("NFS call setacl\n"); + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out_freepages; + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + msg.rpc_resp = fattr; status = rpc_call_sync(server->client_acl, &msg, 0); - spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; - spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); - /* pages may have been allocated at the xdr layer. 
*/ - for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) - __free_page(args.pages[count]); - switch (status) { case 0: - status = nfs_refresh_inode(inode, &fattr); - nfs3_cache_acls(inode, acl, dfacl); + status = nfs_refresh_inode(inode, fattr); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: @@ -341,44 +188,53 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, case -ENOTSUPP: status = -EOPNOTSUPP; } + nfs_free_fattr(fattr); +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } out: return status; } -int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + int ret; + ret = __nfs3_proc_setacls(inode, acl, dfacl); + return (ret == -EOPNOTSUPP) ? 0 : ret; + +} + +int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct posix_acl *alloc = NULL, *dfacl = NULL; int status; if (S_ISDIR(inode->i_mode)) { switch(type) { - case ACL_TYPE_ACCESS: - alloc = dfacl = nfs3_proc_getacl(inode, - ACL_TYPE_DEFAULT); - if (IS_ERR(alloc)) - goto fail; - break; - - case ACL_TYPE_DEFAULT: - dfacl = acl; - alloc = acl = nfs3_proc_getacl(inode, - ACL_TYPE_ACCESS); - if (IS_ERR(alloc)) - goto fail; - break; - - default: - return -EINVAL; + case ACL_TYPE_ACCESS: + alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; } - } else if (type != ACL_TYPE_ACCESS) - return -EINVAL; + } if (acl == NULL) { alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(alloc)) goto fail; } - status = nfs3_proc_setacls(inode, acl, dfacl); + status = __nfs3_proc_setacls(inode, acl, dfacl); 
posix_acl_release(alloc); return status; @@ -386,31 +242,51 @@ fail: return PTR_ERR(alloc); } -int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - mode_t mode) +const struct xattr_handler *nfs3_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + NULL, +}; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, + size_t size, ssize_t *result) { - struct posix_acl *dfacl, *acl; - int error = 0; + struct posix_acl *acl; + char *p = data + *result; - dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(dfacl)) { - error = PTR_ERR(dfacl); - return (error == -EOPNOTSUPP) ? 0 : error; - } - if (!dfacl) - return 0; - acl = posix_acl_clone(dfacl, GFP_KERNEL); - error = -ENOMEM; + acl = get_acl(inode, type); if (!acl) - goto out_release_dfacl; - error = posix_acl_create_masq(acl, &mode); - if (error < 0) - goto out_release_acl; - error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? 
- dfacl : NULL); -out_release_acl: + return 0; + posix_acl_release(acl); -out_release_dfacl: - posix_acl_release(dfacl); - return error; + + *result += strlen(name); + *result += 1; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + ssize_t result = 0; + int error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, + POSIX_ACL_XATTR_ACCESS, data, size, &result); + if (error) + return error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, + POSIX_ACL_XATTR_DEFAULT, data, size, &result); + if (error) + return error; + return result; } diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c new file mode 100644 index 00000000000..b3fc65ef39c --- /dev/null +++ b/fs/nfs/nfs3client.c @@ -0,0 +1,65 @@ +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include "internal.h" + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static const struct rpc_version *nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +const struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = ARRAY_SIZE(nfsacl_version), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; + +/* + * Initialise an NFSv3 ACL client connection + */ +static void nfs_init_server_aclclient(struct nfs_server *server) +{ + if (server->flags & NFS_MOUNT_NOACL) + goto out_noacl; + + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + if (IS_ERR(server->client_acl)) + goto out_noacl; + + /* No errors! 
Assume that Sun nfsacls are supported */ + server->caps |= NFS_CAP_ACLS; + return; + +out_noacl: + server->caps &= ~NFS_CAP_ACLS; +} +#else +static inline void nfs_init_server_aclclient(struct nfs_server *server) +{ + server->flags &= ~NFS_MOUNT_NOACL; + server->caps &= ~NFS_CAP_ACLS; +} +#endif + +struct nfs_server *nfs3_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server = nfs_create_server(mount_info, nfs_mod); + /* Create a client RPC handle for the NFS v3 ACL management interface */ + if (!IS_ERR(server)) + nfs_init_server_aclclient(server); + return server; +} + +struct nfs_server *nfs3_clone_server(struct nfs_server *source, + struct nfs_fh *fh, + struct nfs_fattr *fattr, + rpc_authflavor_t flavor) +{ + struct nfs_server *server = nfs_clone_server(source, fh, fattr, flavor); + if (!IS_ERR(server) && !IS_ERR(source->client_acl)) + nfs_init_server_aclclient(server); + return server; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 549dbce714a..f0afa291fd5 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -7,23 +7,25 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/sunrpc/clnt.h> +#include <linux/slab.h> #include <linux/nfs.h> #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/lockd/bind.h> #include <linux/nfs_mount.h> +#include <linux/freezer.h> +#include <linux/xattr.h> #include "iostat.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC -/* A wrapper to handle the EJUKEBOX error message */ +/* A wrapper to handle the EJUKEBOX error messages */ static int nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) { @@ -32,7 +34,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX) break; - schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + 
freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; @@ -45,7 +47,8 @@ nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) { if (task->tk_status != -EJUKEBOX) return 0; - nfs_inc_stats(inode, NFSIOS_DELAY); + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(inode, NFSIOS_DELAY); task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); @@ -63,15 +66,15 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, }; int status; - dprintk("%s: call fsinfo\n", __FUNCTION__); + dprintk("%s: call fsinfo\n", __func__); nfs_fattr_init(info->fattr); status = rpc_call_sync(client, &msg, 0); - dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); - if (!(info->fattr->valid & NFS_ATTR_FATTR)) { + dprintk("%s: reply fsinfo: %d\n", __func__, status); + if (status == 0 && !(info->fattr->valid & NFS_ATTR_FATTR)) { msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; msg.rpc_resp = info->fattr; status = rpc_call_sync(client, &msg, 0); - dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); + dprintk("%s: reply getattr: %d\n", __func__, status); } return status; } @@ -96,7 +99,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], @@ -129,6 +132,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) @@ -139,16 +144,15 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, static int nfs3_proc_lookup(struct inode *dir, struct qstr *name, 
- struct nfs_fh *fhandle, struct nfs_fattr *fattr) + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { - struct nfs_fattr dir_attr; struct nfs3_diropargs arg = { .fh = NFS_FH(dir), .name = name->name, .len = name->len }; struct nfs3_diropres res = { - .dir_attr = &dir_attr, .fh = fhandle, .fattr = fattr }; @@ -160,29 +164,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name, int status; dprintk("NFS call lookup %s\n", name->name); - nfs_fattr_init(&dir_attr); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + return -ENOMEM; + nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_refresh_inode(dir, &dir_attr); + nfs_refresh_inode(dir, res.dir_attr); if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; msg.rpc_argp = fhandle; msg.rpc_resp = fattr; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); } + nfs_free_fattr(res.dir_attr); dprintk("NFS reply lookup: %d\n", status); return status; } static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) { - struct nfs_fattr fattr; struct nfs3_accessargs arg = { .fh = NFS_FH(inode), }; - struct nfs3_accessres res = { - .fattr = &fattr, - }; + struct nfs3_accessres res; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], .rpc_argp = &arg, @@ -190,7 +195,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) .rpc_cred = entry->cred, }; int mode = entry->mask; - int status; + int status = -ENOMEM; dprintk("NFS call access\n"); @@ -207,9 +212,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) if (mode & MAY_EXEC) arg.access |= NFS3_ACCESS_EXECUTE; } - nfs_fattr_init(&fattr); + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_refresh_inode(inode, &fattr); + nfs_refresh_inode(inode, res.fattr); if (status == 0) { 
entry->mask = 0; if (res.access & NFS3_ACCESS_READ) @@ -219,6 +228,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) entry->mask |= MAY_EXEC; } + nfs_free_fattr(res.fattr); +out: dprintk("NFS reply access: %d\n", status); return status; } @@ -226,7 +237,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) static int nfs3_proc_readlink(struct inode *inode, struct page *page, unsigned int pgbase, unsigned int pglen) { - struct nfs_fattr fattr; + struct nfs_fattr *fattr; struct nfs3_readlinkargs args = { .fh = NFS_FH(inode), .pgbase = pgbase, @@ -236,90 +247,133 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], .rpc_argp = &args, - .rpc_resp = &fattr, }; - int status; + int status = -ENOMEM; dprintk("NFS call readlink\n"); - nfs_fattr_init(&fattr); + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_refresh_inode(inode, &fattr); + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: dprintk("NFS reply readlink: %d\n", status); return status; } +struct nfs3_createdata { + struct rpc_message msg; + union { + struct nfs3_createargs create; + struct nfs3_mkdirargs mkdir; + struct nfs3_symlinkargs symlink; + struct nfs3_mknodargs mknod; + } arg; + struct nfs3_diropres res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs_fattr dir_attr; +}; + +static struct nfs3_createdata *nfs3_alloc_createdata(void) +{ + struct nfs3_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.dir_attr = &data->dir_attr; + nfs_fattr_init(data->res.fattr); + 
nfs_fattr_init(data->res.dir_attr); + } + return data; +} + +static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +{ + int status; + + status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); + nfs_post_op_update_inode(dir, data->res.dir_attr); + if (status == 0) + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + return status; +} + +static void nfs3_free_createdata(struct nfs3_createdata *data) +{ + kfree(data); +} + /* * Create a regular file. - * For now, we don't implement O_EXCL. */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_fattr dir_attr; - struct nfs3_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr, - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - mode_t mode = sattr->ia_mode; - int status; + struct posix_acl *default_acl, *acl; + struct nfs3_createdata *data; + int status = -ENOMEM; + + dprintk("NFS call create %pd\n", dentry); + + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; - dprintk("NFS call create %s\n", dentry->d_name.name); - arg.createmode = NFS3_CREATE_UNCHECKED; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE]; + data->arg.create.fh = NFS_FH(dir); + data->arg.create.name = dentry->d_name.name; + data->arg.create.len = dentry->d_name.len; + data->arg.create.sattr = sattr; + + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; if (flags & O_EXCL) { - arg.createmode = NFS3_CREATE_EXCLUSIVE; - arg.verifier[0] = jiffies; - arg.verifier[1] = current->pid; + data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE; + data->arg.create.verifier[0] = 
cpu_to_be32(jiffies); + data->arg.create.verifier[1] = cpu_to_be32(current->pid); } - sattr->ia_mode &= ~current->fs->umask; + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; -again: - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_refresh_inode(dir, &dir_attr); + for (;;) { + status = nfs3_do_create(dir, dentry, data); - /* If the server doesn't support the exclusive creation semantics, - * try again with simple 'guarded' mode. */ - if (status == -ENOTSUPP) { - switch (arg.createmode) { + if (status != -ENOTSUPP) + break; + /* If the server doesn't support the exclusive creation + * semantics, try again with simple 'guarded' mode. */ + switch (data->arg.create.createmode) { case NFS3_CREATE_EXCLUSIVE: - arg.createmode = NFS3_CREATE_GUARDED; + data->arg.create.createmode = NFS3_CREATE_GUARDED; break; case NFS3_CREATE_GUARDED: - arg.createmode = NFS3_CREATE_UNCHECKED; + data->arg.create.createmode = NFS3_CREATE_UNCHECKED; break; case NFS3_CREATE_UNCHECKED: goto out; } - goto again; + nfs_fattr_init(data->res.dir_attr); + nfs_fattr_init(data->res.fattr); } - if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); if (status != 0) - goto out; + goto out_release_acls; /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. 
*/ - if (arg.createmode == NFS3_CREATE_EXCLUSIVE) { + if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { dprintk("NFS call setattr (post-create)\n"); if (!(sattr->ia_valid & ATTR_ATIME_SET)) @@ -330,14 +384,20 @@ again: /* Note: we could use a guarded setattr here, but I'm * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ - status = nfs3_proc_setattr(dentry, &fattr, sattr); - nfs_post_op_update_inode(dentry->d_inode, &fattr); + status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); + nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); + if (status != 0) + goto out_release_acls; } - if (status != 0) - goto out; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: + nfs3_free_createdata(data); dprintk("NFS reply create: %d\n", status); return status; } @@ -347,8 +407,7 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name.len = name->len, - .name.name = name->name, + .name = *name, }; struct nfs_removeres res; struct rpc_message msg = { @@ -356,12 +415,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name) .rpc_argp = &arg, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; dprintk("NFS call remove %s\n", name->name); - nfs_fattr_init(&res.dir_attr); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &res.dir_attr); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_free_fattr(res.dir_attr); +out: dprintk("NFS reply remove: %d\n", status); return status; } @@ -372,6 +436,11 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; } +static void 
nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + rpc_call_start(task); +} + static int nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) { @@ -379,71 +448,65 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (nfs3_async_handle_jukebox(task, dir)) return 0; res = task->tk_msg.rpc_resp; - nfs_post_op_update_inode(dir, &res->dir_attr); + nfs_post_op_update_inode(dir, res->dir_attr); return 1; } +static void +nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; +} + +static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ + rpc_call_start(task); +} + static int -nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_fattr old_dir_attr, new_dir_attr; - struct nfs3_renameargs arg = { - .fromfh = NFS_FH(old_dir), - .fromname = old_name->name, - .fromlen = old_name->len, - .tofh = NFS_FH(new_dir), - .toname = new_name->name, - .tolen = new_name->len - }; - struct nfs3_renameres res = { - .fromattr = &old_dir_attr, - .toattr = &new_dir_attr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; +nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renameres *res; - dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - nfs_fattr_init(&old_dir_attr); - nfs_fattr_init(&new_dir_attr); - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_post_op_update_inode(old_dir, &old_dir_attr); - nfs_post_op_update_inode(new_dir, &new_dir_attr); - dprintk("NFS reply rename: %d\n", status); - return status; + if (nfs3_async_handle_jukebox(task, old_dir)) + return 0; + res = task->tk_msg.rpc_resp; + + nfs_post_op_update_inode(old_dir, res->old_fattr); + 
nfs_post_op_update_inode(new_dir, res->new_fattr); + return 1; } static int nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { - struct nfs_fattr dir_attr, fattr; struct nfs3_linkargs arg = { .fromfh = NFS_FH(inode), .tofh = NFS_FH(dir), .toname = name->name, .tolen = name->len }; - struct nfs3_linkres res = { - .dir_attr = &dir_attr, - .fattr = &fattr - }; + struct nfs3_linkres res; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], .rpc_argp = &arg, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; dprintk("NFS call link %s\n", name->name); - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); + res.fattr = nfs_alloc_fattr(); + res.dir_attr = nfs_alloc_fattr(); + if (res.fattr == NULL || res.dir_attr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - nfs_post_op_update_inode(inode, &fattr); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); +out: + nfs_free_fattr(res.dir_attr); + nfs_free_fattr(res.fattr); dprintk("NFS reply link: %d\n", status); return status; } @@ -452,40 +515,28 @@ static int nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_attr; - struct nfs3_symlinkargs arg = { - .fromfh = NFS_FH(dir), - .fromname = dentry->d_name.name, - .fromlen = dentry->d_name.len, - .pages = &page, - .pathlen = len, - .sattr = sattr - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs3_createdata *data; + int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) return -ENAMETOOLONG; - dprintk("NFS call symlink %s\n", dentry->d_name.name); + dprintk("NFS call symlink %pd\n", dentry); - 
nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + data = nfs3_alloc_createdata(); + if (data == NULL) goto out; - status = nfs_instantiate(dentry, &fhandle, &fattr); + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK]; + data->arg.symlink.fromfh = NFS_FH(dir); + data->arg.symlink.fromname = dentry->d_name.name; + data->arg.symlink.fromlen = dentry->d_name.len; + data->arg.symlink.pages = &page; + data->arg.symlink.pathlen = len; + data->arg.symlink.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); + + nfs3_free_createdata(data); out: dprintk("NFS reply symlink: %d\n", status); return status; @@ -494,42 +545,37 @@ out: static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_attr; - struct nfs3_mkdirargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fhandle, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int mode = sattr->ia_mode; - int status; + struct posix_acl *default_acl, *acl; + struct nfs3_createdata *data; + int status = -ENOMEM; - dprintk("NFS call mkdir %s\n", dentry->d_name.name); + dprintk("NFS call mkdir %pd\n", dentry); - sattr->ia_mode &= ~current->fs->umask; + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) goto out; - status = nfs_instantiate(dentry, &fhandle, &fattr); + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; + 
data->arg.mkdir.fh = NFS_FH(dir); + data->arg.mkdir.name = dentry->d_name.name; + data->arg.mkdir.len = dentry->d_name.len; + data->arg.mkdir.sattr = sattr; + + status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + goto out_release_acls; + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: + nfs3_free_createdata(data); dprintk("NFS reply mkdir: %d\n", status); return status; } @@ -537,7 +583,7 @@ out: static int nfs3_proc_rmdir(struct inode *dir, struct qstr *name) { - struct nfs_fattr dir_attr; + struct nfs_fattr *dir_attr; struct nfs3_diropargs arg = { .fh = NFS_FH(dir), .name = name->name, @@ -546,14 +592,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name) struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], .rpc_argp = &arg, - .rpc_resp = &dir_attr, }; - int status; + int status = -ENOMEM; dprintk("NFS call rmdir %s\n", name->name); - nfs_fattr_init(&dir_attr); + dir_attr = nfs_alloc_fattr(); + if (dir_attr == NULL) + goto out; + + msg.rpc_resp = dir_attr; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); + nfs_post_op_update_inode(dir, dir_attr); + nfs_free_fattr(dir_attr); +out: dprintk("NFS reply rmdir: %d\n", status); return status; } @@ -569,21 +620,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name) */ static int nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; - struct nfs_fattr dir_attr; - __be32 *verf = NFS_COOKIEVERF(dir); + __be32 *verf = NFS_I(dir)->cookieverf; struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, .verf = {verf[0], verf[1]}, .plus = plus, .count = count, - .pages = &page + 
.pages = pages }; struct nfs3_readdirres res = { - .dir_attr = &dir_attr, .verf = verf, .plus = plus }; @@ -593,7 +642,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .rpc_resp = &res, .rpc_cred = cred }; - int status; + int status = -ENOMEM; if (plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; @@ -601,13 +650,19 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, dprintk("NFS call readdir%s %d\n", plus? "plus" : "", (unsigned int) cookie); - nfs_fattr_init(&dir_attr); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_invalidate_atime(dir); + nfs_refresh_inode(dir, res.dir_attr); - nfs_refresh_inode(dir, &dir_attr); - dprintk("NFS reply readdir: %d\n", status); + nfs_free_fattr(res.dir_attr); +out: + dprintk("NFS reply readdir%s: %d\n", + plus? "plus" : "", status); return status; } @@ -615,52 +670,57 @@ static int nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_fh fh; - struct nfs_fattr fattr, dir_attr; - struct nfs3_mknodargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr, - .rdev = rdev - }; - struct nfs3_diropres res = { - .dir_attr = &dir_attr, - .fh = &fh, - .fattr = &fattr - }; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - mode_t mode = sattr->ia_mode; - int status; + struct posix_acl *default_acl, *acl; + struct nfs3_createdata *data; + int status = -ENOMEM; - switch (sattr->ia_mode & S_IFMT) { - case S_IFBLK: arg.type = NF3BLK; break; - case S_IFCHR: arg.type = NF3CHR; break; - case S_IFIFO: arg.type = NF3FIFO; break; - case S_IFSOCK: arg.type = NF3SOCK; break; - default: return -EINVAL; - } - - dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, + dprintk("NFS call mknod %pd %u:%u\n", dentry, MAJOR(rdev), MINOR(rdev)); - 
sattr->ia_mode &= ~current->fs->umask; + data = nfs3_alloc_createdata(); + if (data == NULL) + goto out; - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - if (status != 0) + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) goto out; - status = nfs_instantiate(dentry, &fh, &fattr); - if (status != 0) + + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; + data->arg.mknod.fh = NFS_FH(dir); + data->arg.mknod.name = dentry->d_name.name; + data->arg.mknod.len = dentry->d_name.len; + data->arg.mknod.sattr = sattr; + data->arg.mknod.rdev = rdev; + + switch (sattr->ia_mode & S_IFMT) { + case S_IFBLK: + data->arg.mknod.type = NF3BLK; + break; + case S_IFCHR: + data->arg.mknod.type = NF3CHR; + break; + case S_IFIFO: + data->arg.mknod.type = NF3FIFO; + break; + case S_IFSOCK: + data->arg.mknod.type = NF3SOCK; + break; + default: + status = -EINVAL; goto out; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + } + + status = nfs3_do_create(dir, dentry, data); + if (status != 0) + goto out_release_acls; + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: + nfs3_free_createdata(data); dprintk("NFS reply mknod: %d\n", status); return status; } @@ -679,12 +739,12 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsstat\n"); nfs_fattr_init(stat->fattr); status = rpc_call_sync(server->client, &msg, 0); - dprintk("NFS reply statfs: %d\n", status); + dprintk("NFS reply fsstat: %d\n", status); return status; } static int -nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { struct rpc_message msg = { @@ -696,11 +756,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, 
dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(client, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; } +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + static int nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *info) @@ -719,36 +795,51 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { - if (nfs3_async_handle_jukebox(task, data->inode)) + struct inode *inode = data->header->inode; + + if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; - nfs_invalidate_atime(data->inode); - nfs_refresh_inode(data->inode, &data->fattr); + nfs_invalidate_atime(inode); + nfs_refresh_inode(inode, &data->fattr); return 0; } -static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } -static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { - if (nfs3_async_handle_jukebox(task, data->inode)) + rpc_call_start(task); + return 0; +} + +static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) +{ + struct inode *inode = 
data->header->inode; + + if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); + nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); return 0; } -static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } -static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) +static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ + rpc_call_start(task); +} + +static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data) { if (nfs3_async_handle_jukebox(task, data->inode)) return -EAGAIN; @@ -756,7 +847,7 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; } @@ -764,17 +855,68 @@ static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa static int nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); } +static int nfs3_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static int nfs3_return_delegation(struct inode *inode) +{ + nfs_wb_all(inode); + return 0; +} + +static const struct inode_operations nfs3_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + 
.permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL + .listxattr = nfs3_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif +}; + +static const struct inode_operations nfs3_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +#ifdef CONFIG_NFS_V3_ACL + .listxattr = nfs3_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif +}; + const struct nfs_rpc_ops nfs_v3_clientops = { .version = 3, /* protocol version */ .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs3_proc_get_root, + .submount = nfs_submount, + .try_mount = nfs_try_mount, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, .lookup = nfs3_proc_lookup, @@ -783,8 +925,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .create = nfs3_proc_create, .remove = nfs3_proc_remove, .unlink_setup = nfs3_proc_unlink_setup, + .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare, .unlink_done = nfs3_proc_unlink_done, - .rename = nfs3_proc_rename, + .rename_setup = nfs3_proc_rename_setup, + .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare, + .rename_done = nfs3_proc_rename_done, .link = nfs3_proc_link, .symlink = nfs3_proc_symlink, .mkdir = nfs3_proc_mkdir, @@ -795,14 +940,22 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .fsinfo = nfs3_proc_fsinfo, .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, + .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare, .read_setup = nfs3_proc_read_setup, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, .write_done = nfs3_write_done, .commit_setup = 
nfs3_proc_commit_setup, + .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, .commit_done = nfs3_commit_done, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs3_proc_lock, - .clear_acl_cache = nfs3_forget_cached_acls, + .clear_acl_cache = forget_all_cached_acls, + .close_context = nfs_close_context, + .have_delegation = nfs3_have_delegation, + .return_delegation = nfs3_return_delegation, + .alloc_client = nfs_alloc_client, + .init_client = nfs_init_client, + .free_client = nfs_free_client, + .create_server = nfs3_create_server, + .clone_server = nfs3_clone_server, }; diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c new file mode 100644 index 00000000000..d6a98949af1 --- /dev/null +++ b/fs/nfs/nfs3super.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2012 Netapp, Inc. All rights reserved. + */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include "internal.h" +#include "nfs.h" + +static struct nfs_subversion nfs_v3 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs_fs_type, + .rpc_vers = &nfs_version3, + .rpc_ops = &nfs_v3_clientops, + .sops = &nfs_sops, +#ifdef CONFIG_NFS_V3_ACL + .xattr = nfs3_xattr_handlers, +#endif +}; + +static int __init init_nfs_v3(void) +{ + register_nfs_version(&nfs_v3); + return 0; +} + +static void __exit exit_nfs_v3(void) +{ + unregister_nfs_version(&nfs_v3); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v3); +module_exit(exit_nfs_v3); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 3917e2fa4e4..8f4cbe7f4aa 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -9,8 +9,6 @@ #include <linux/param.h> #include <linux/time.h> #include <linux/mm.h> -#include <linux/slab.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/in.h> @@ -39,18 +37,16 @@ #define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) #define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) #define NFS3_fattr_sz (21) -#define NFS3_wcc_attr_sz (6) +#define NFS3_cookieverf_sz (NFS3_COOKIEVERFSIZE>>2) +#define 
NFS3_wcc_attr_sz (6) #define NFS3_pre_op_attr_sz (1+NFS3_wcc_attr_sz) #define NFS3_post_op_attr_sz (1+NFS3_fattr_sz) -#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) -#define NFS3_fsstat_sz -#define NFS3_fsinfo_sz -#define NFS3_pathconf_sz -#define NFS3_entry_sz (NFS3_filename_sz+3) - -#define NFS3_sattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_wcc_data_sz (NFS3_pre_op_attr_sz+NFS3_post_op_attr_sz) #define NFS3_diropargs_sz (NFS3_fh_sz+NFS3_filename_sz) -#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) + +#define NFS3_getattrargs_sz (NFS3_fh_sz) +#define NFS3_setattrargs_sz (NFS3_fh_sz+NFS3_sattr_sz+3) +#define NFS3_lookupargs_sz (NFS3_fh_sz+NFS3_filename_sz) #define NFS3_accessargs_sz (NFS3_fh_sz+1) #define NFS3_readlinkargs_sz (NFS3_fh_sz) #define NFS3_readargs_sz (NFS3_fh_sz+3) @@ -59,14 +55,16 @@ #define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) #define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) #define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) +#define NFS3_removeargs_sz (NFS3_fh_sz+NFS3_filename_sz) #define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) #define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) -#define NFS3_readdirargs_sz (NFS3_fh_sz+2) +#define NFS3_readdirargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+3) +#define NFS3_readdirplusargs_sz (NFS3_fh_sz+NFS3_cookieverf_sz+4) #define NFS3_commitargs_sz (NFS3_fh_sz+3) -#define NFS3_attrstat_sz (1+NFS3_fattr_sz) -#define NFS3_wccstat_sz (1+NFS3_wcc_data_sz) -#define NFS3_removeres_sz (NFS3_wccstat_sz) +#define NFS3_getattrres_sz (1+NFS3_fattr_sz) +#define NFS3_setattrres_sz (1+NFS3_wcc_data_sz) +#define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) #define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) @@ -82,1089 +80,2446 @@ #define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) #define ACL3_getaclargs_sz (NFS3_fh_sz+1) 
-#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) -#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) +static int nfs3_stat_to_errno(enum nfs_stat); + /* * Map file type to S_IFMT bits */ -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFBAD } +static const umode_t nfs_type2fmt[] = { + [NF3BAD] = 0, + [NF3REG] = S_IFREG, + [NF3DIR] = S_IFDIR, + [NF3BLK] = S_IFBLK, + [NF3CHR] = S_IFCHR, + [NF3LNK] = S_IFLNK, + [NF3SOCK] = S_IFSOCK, + [NF3FIFO] = S_IFIFO, }; /* - * Common NFS XDR functions as inlines + * While encoding arguments, set up the reply buffer in advance to + * receive reply data directly into the page cache. */ -static inline __be32 * -xdr_encode_fhandle(__be32 *p, const struct nfs_fh *fh) +static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, + unsigned int base, unsigned int len, + unsigned int bufsize) { - return xdr_encode_array(p, fh->data, fh->size); + struct rpc_auth *auth = req->rq_cred->cr_auth; + unsigned int replen; + + replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); } -static inline __be32 * -xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh) +/* + * Handle decode buffer overflows out-of-line. + */ +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) { - if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { - memcpy(fh->data, p, fh->size); - return p + XDR_QUADLEN(fh->size); - } - return NULL; + dprintk("NFS: %s prematurely hit the end of our receive buffer. 
" + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); } + /* - * Encode/decode time. + * Encode/decode NFSv3 basic data types + * + * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: + * "NFS Version 3 Protocol Specification". + * + * Not all basic data types have their own encoding and decoding + * functions. For run-time efficiency, some data types are encoded + * or decoded inline. */ -static inline __be32 * -xdr_encode_time3(__be32 *p, struct timespec *timep) + +static void encode_uint32(struct xdr_stream *xdr, u32 value) { - *p++ = htonl(timep->tv_sec); - *p++ = htonl(timep->tv_nsec); - return p; + __be32 *p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(value); } -static inline __be32 * -xdr_decode_time3(__be32 *p, struct timespec *timep) +static int decode_uint32(struct xdr_stream *xdr, u32 *value) { - timep->tv_sec = ntohl(*p++); - timep->tv_nsec = ntohl(*p++); - return p; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *value = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static __be32 * -xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr) +static int decode_uint64(struct xdr_stream *xdr, u64 *value) { - unsigned int type, major, minor; - int fmode; + __be32 *p; - type = ntohl(*p++); - if (type >= NF3BAD) - type = NF3BAD; - fmode = nfs_type2fmt[type].mode; - fattr->type = nfs_type2fmt[type].nfs2type; - fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode; - fattr->nlink = ntohl(*p++); - fattr->uid = ntohl(*p++); - fattr->gid = ntohl(*p++); - p = xdr_decode_hyper(p, &fattr->size); - p = xdr_decode_hyper(p, &fattr->du.nfs3.used); - - /* Turn remote device info into Linux-specific dev_t */ - major = ntohl(*p++); - minor = ntohl(*p++); - fattr->rdev = MKDEV(major, minor); - if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor) - fattr->rdev = 0; + p = xdr_inline_decode(xdr, 8); + if (unlikely(p == NULL)) + goto 
out_overflow; + xdr_decode_hyper(p, value); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} - p = xdr_decode_hyper(p, &fattr->fsid.major); - fattr->fsid.minor = 0; - p = xdr_decode_hyper(p, &fattr->fileid); - p = xdr_decode_time3(p, &fattr->atime); - p = xdr_decode_time3(p, &fattr->mtime); - p = xdr_decode_time3(p, &fattr->ctime); +/* + * fileid3 + * + * typedef uint64 fileid3; + */ +static __be32 *xdr_decode_fileid3(__be32 *p, u64 *fileid) +{ + return xdr_decode_hyper(p, fileid); +} - /* Update the mode bits */ - fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); - return p; +static int decode_fileid3(struct xdr_stream *xdr, u64 *fileid) +{ + return decode_uint64(xdr, fileid); } -static inline __be32 * -xdr_encode_sattr(__be32 *p, struct iattr *attr) +/* + * filename3 + * + * typedef string filename3<>; + */ +static void encode_filename3(struct xdr_stream *xdr, + const char *name, u32 length) { - if (attr->ia_valid & ATTR_MODE) { - *p++ = xdr_one; - *p++ = htonl(attr->ia_mode & S_IALLUGO); - } else { - *p++ = xdr_zero; - } - if (attr->ia_valid & ATTR_UID) { - *p++ = xdr_one; - *p++ = htonl(attr->ia_uid); - } else { - *p++ = xdr_zero; - } - if (attr->ia_valid & ATTR_GID) { - *p++ = xdr_one; - *p++ = htonl(attr->ia_gid); - } else { - *p++ = xdr_zero; - } - if (attr->ia_valid & ATTR_SIZE) { - *p++ = xdr_one; - p = xdr_encode_hyper(p, (__u64) attr->ia_size); - } else { - *p++ = xdr_zero; - } - if (attr->ia_valid & ATTR_ATIME_SET) { - *p++ = xdr_two; - p = xdr_encode_time3(p, &attr->ia_atime); - } else if (attr->ia_valid & ATTR_ATIME) { - *p++ = xdr_one; - } else { - *p++ = xdr_zero; - } - if (attr->ia_valid & ATTR_MTIME_SET) { - *p++ = xdr_two; - p = xdr_encode_time3(p, &attr->ia_mtime); - } else if (attr->ia_valid & ATTR_MTIME) { - *p++ = xdr_one; - } else { - *p++ = xdr_zero; - } - return p; + __be32 *p; + + WARN_ON_ONCE(length > NFS3_MAXNAMLEN); + p = xdr_reserve_space(xdr, 4 + length); + xdr_encode_opaque(p, name, length); } 
-static inline __be32 * -xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr) +static int decode_inline_filename3(struct xdr_stream *xdr, + const char **name, u32 *length) { - p = xdr_decode_hyper(p, &fattr->pre_size); - p = xdr_decode_time3(p, &fattr->pre_mtime); - p = xdr_decode_time3(p, &fattr->pre_ctime); - fattr->valid |= NFS_ATTR_WCC; - return p; + __be32 *p; + u32 count; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (count > NFS3_MAXNAMLEN) + goto out_nametoolong; + p = xdr_inline_decode(xdr, count); + if (unlikely(p == NULL)) + goto out_overflow; + *name = (const char *)p; + *length = count; + return 0; + +out_nametoolong: + dprintk("NFS: returned filename too long: %u\n", count); + return -ENAMETOOLONG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline __be32 * -xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr) +/* + * nfspath3 + * + * typedef string nfspath3<>; + */ +static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, + const u32 length) { - if (*p++) - p = xdr_decode_fattr(p, fattr); - return p; + encode_uint32(xdr, length); + xdr_write_pages(xdr, pages, 0, length); } -static inline __be32 * -xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr) +static int decode_nfspath3(struct xdr_stream *xdr) { - if (*p++) - return xdr_decode_wcc_attr(p, fattr); - return p; + u32 recvd, count; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + count = be32_to_cpup(p); + if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) + goto out_nametoolong; + recvd = xdr_read_pages(xdr, count); + if (unlikely(count > recvd)) + goto out_cheating; + xdr_terminate_string(xdr->buf, count); + return 0; + +out_nametoolong: + dprintk("NFS: returned pathname too long: %u\n", count); + return -ENAMETOOLONG; +out_cheating: + dprintk("NFS: server cheating in pathname result: " + "count 
%u > recvd %u\n", count, recvd); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } +/* + * cookie3 + * + * typedef uint64 cookie3 + */ +static __be32 *xdr_encode_cookie3(__be32 *p, u64 cookie) +{ + return xdr_encode_hyper(p, cookie); +} -static inline __be32 * -xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr) +static int decode_cookie3(struct xdr_stream *xdr, u64 *cookie) { - p = xdr_decode_pre_op_attr(p, fattr); - return xdr_decode_post_op_attr(p, fattr); + return decode_uint64(xdr, cookie); } /* - * NFS encode functions + * cookieverf3 + * + * typedef opaque cookieverf3[NFS3_COOKIEVERFSIZE]; */ +static __be32 *xdr_encode_cookieverf3(__be32 *p, const __be32 *verifier) +{ + memcpy(p, verifier, NFS3_COOKIEVERFSIZE); + return p + XDR_QUADLEN(NFS3_COOKIEVERFSIZE); +} + +static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier, p, NFS3_COOKIEVERFSIZE); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} /* - * Encode file handle argument + * createverf3 + * + * typedef opaque createverf3[NFS3_CREATEVERFSIZE]; */ -static int -nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh) +static void encode_createverf3(struct xdr_stream *xdr, const __be32 *verifier) { - p = xdr_encode_fhandle(p, fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_CREATEVERFSIZE); + memcpy(p, verifier, NFS3_CREATEVERFSIZE); +} + +static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier *verifier) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); + if (unlikely(p == NULL)) + goto out_overflow; + memcpy(verifier->data, p, NFS3_WRITEVERFSIZE); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode SETATTR arguments + * size3 + * + * 
typedef uint64 size3; */ -static int -nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args) -{ - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_sattr(p, args->sattr); - *p++ = htonl(args->guard); - if (args->guard) - p = xdr_encode_time3(p, &args->guardtime); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; +static __be32 *xdr_decode_size3(__be32 *p, u64 *size) +{ + return xdr_decode_hyper(p, size); } /* - * Encode directory ops argument + * nfsstat3 + * + * enum nfsstat3 { + * NFS3_OK = 0, + * ... + * } */ -static int -nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args) +#define NFS3_OK NFS_OK + +static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + *status = be32_to_cpup(p); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode REMOVE argument + * ftype3 + * + * enum ftype3 { + * NF3REG = 1, + * NF3DIR = 2, + * NF3BLK = 3, + * NF3CHR = 4, + * NF3LNK = 5, + * NF3SOCK = 6, + * NF3FIFO = 7 + * }; */ -static int -nfs3_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void encode_ftype3(struct xdr_stream *xdr, const u32 type) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name.name, args->name.len); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + encode_uint32(xdr, type); +} + +static __be32 *xdr_decode_ftype3(__be32 *p, umode_t *mode) +{ + u32 type; + + type = be32_to_cpup(p++); + if (type > NF3FIFO) + type = NF3NON; + *mode = nfs_type2fmt[type]; + return p; } /* - * Encode access() argument + * specdata3 + * + * struct specdata3 { + * uint32 specdata1; + * uint32 specdata2; + * }; */ -static int 
-nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args) +static void encode_specdata3(struct xdr_stream *xdr, const dev_t rdev) { - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->access); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + __be32 *p; + + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(MAJOR(rdev)); + *p = cpu_to_be32(MINOR(rdev)); +} + +static __be32 *xdr_decode_specdata3(__be32 *p, dev_t *rdev) +{ + unsigned int major, minor; + + major = be32_to_cpup(p++); + minor = be32_to_cpup(p++); + *rdev = MKDEV(major, minor); + if (MAJOR(*rdev) != major || MINOR(*rdev) != minor) + *rdev = 0; + return p; } /* - * Arguments to a READ call. Since we read data directly into the page - * cache, we also set up the reply iovec here so that iov[1] points - * exactly to the page we want to fetch. + * nfs_fh3 + * + * struct nfs_fh3 { + * opaque data<NFS3_FHSIZE>; + * }; */ -static int -nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - unsigned int replen; - u32 count = args->count; + __be32 *p; - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + WARN_ON_ONCE(fh->size > NFS3_FHSIZE); + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); +} - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, - args->pages, args->pgbase, count); - req->rq_rcv_buf.flags |= XDRBUF_READ; +static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + u32 length; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + length = be32_to_cpup(p++); + if (unlikely(length > NFS3_FHSIZE)) + 
goto out_toobig; + p = xdr_inline_decode(xdr, length); + if (unlikely(p == NULL)) + goto out_overflow; + fh->size = length; + memcpy(fh->data, p, length); return 0; +out_toobig: + dprintk("NFS: file handle size (%u) too big\n", length); + return -E2BIG; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static void zero_nfs_fh3(struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(*fh)); } /* - * Write arguments. Splice the buffer to be written into the iovec. + * nfstime3 + * + * struct nfstime3 { + * uint32 seconds; + * uint32 nseconds; + * }; */ -static int -nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) { - struct xdr_buf *sndbuf = &req->rq_snd_buf; - u32 count = args->count; + *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32(timep->tv_nsec); + return p; +} - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(count); - *p++ = htonl(args->stable); - *p++ = htonl(count); - sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); - - /* Copy the page array */ - xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); - sndbuf->flags |= XDRBUF_WRITE; - return 0; +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) +{ + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p++); + return p; } /* - * Encode CREATE arguments + * sattr3 + * + * enum time_how { + * DONT_CHANGE = 0, + * SET_TO_SERVER_TIME = 1, + * SET_TO_CLIENT_TIME = 2 + * }; + * + * union set_mode3 switch (bool set_it) { + * case TRUE: + * mode3 mode; + * default: + * void; + * }; + * + * union set_uid3 switch (bool set_it) { + * case TRUE: + * uid3 uid; + * default: + * void; + * }; + * + * union set_gid3 switch (bool set_it) { + * case TRUE: + * gid3 gid; + * default: + * void; + * }; + * + * union set_size3 switch (bool set_it) { + * case TRUE: + * size3 size; + * default: + * void; + * }; + * + * union 
set_atime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 atime; + * default: + * void; + * }; + * + * union set_mtime switch (time_how set_it) { + * case SET_TO_CLIENT_TIME: + * nfstime3 mtime; + * default: + * void; + * }; + * + * struct sattr3 { + * set_mode3 mode; + * set_uid3 uid; + * set_gid3 gid; + * set_size3 size; + * set_atime atime; + * set_mtime mtime; + * }; */ -static int -nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args) +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); + u32 nbytes; + __be32 *p; + + /* + * In order to make only a single xdr_reserve_space() call, + * pre-compute the total number of bytes to be reserved. + * Six boolean values, one for each set_foo field, are always + * present in the encoded result, so start there. + */ + nbytes = 6 * 4; + if (attr->ia_valid & ATTR_MODE) + nbytes += 4; + if (attr->ia_valid & ATTR_UID) + nbytes += 4; + if (attr->ia_valid & ATTR_GID) + nbytes += 4; + if (attr->ia_valid & ATTR_SIZE) + nbytes += 8; + if (attr->ia_valid & ATTR_ATIME_SET) + nbytes += 8; + if (attr->ia_valid & ATTR_MTIME_SET) + nbytes += 8; + p = xdr_reserve_space(xdr, nbytes); - *p++ = htonl(args->createmode); - if (args->createmode == NFS3_CREATE_EXCLUSIVE) { - *p++ = args->verifier[0]; - *p++ = args->verifier[1]; + if (attr->ia_valid & ATTR_MODE) { + *p++ = xdr_one; + *p++ = cpu_to_be32(attr->ia_mode & S_IALLUGO); } else - p = xdr_encode_sattr(p, args->sattr); + *p++ = xdr_zero; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + if (attr->ia_valid & ATTR_UID) { + *p++ = xdr_one; + *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_GID) { + *p++ = xdr_one; + *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_SIZE) { + *p++ 
= xdr_one; + p = xdr_encode_hyper(p, (u64)attr->ia_size); + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_ATIME_SET) { + *p++ = xdr_two; + p = xdr_encode_nfstime3(p, &attr->ia_atime); + } else if (attr->ia_valid & ATTR_ATIME) { + *p++ = xdr_one; + } else + *p++ = xdr_zero; + + if (attr->ia_valid & ATTR_MTIME_SET) { + *p++ = xdr_two; + xdr_encode_nfstime3(p, &attr->ia_mtime); + } else if (attr->ia_valid & ATTR_MTIME) { + *p = xdr_one; + } else + *p = xdr_zero; } /* - * Encode MKDIR arguments + * fattr3 + * + * struct fattr3 { + * ftype3 type; + * mode3 mode; + * uint32 nlink; + * uid3 uid; + * gid3 gid; + * size3 size; + * size3 used; + * specdata3 rdev; + * uint64 fsid; + * fileid3 fileid; + * nfstime3 atime; + * nfstime3 mtime; + * nfstime3 ctime; + * }; */ -static int -nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args) +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) { - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + umode_t fmode; + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + p = xdr_decode_ftype3(p, &fmode); + + fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; + fattr->nlink = be32_to_cpup(p++); + fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + if (!uid_valid(fattr->uid)) + goto out_uid; + fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + if (!gid_valid(fattr->gid)) + goto out_gid; + + p = xdr_decode_size3(p, &fattr->size); + p = xdr_decode_size3(p, &fattr->du.nfs3.used); + p = xdr_decode_specdata3(p, &fattr->rdev); + + p = xdr_decode_hyper(p, &fattr->fsid.major); + fattr->fsid.minor = 0; + + p = xdr_decode_fileid3(p, &fattr->fileid); + p = xdr_decode_nfstime3(p, &fattr->atime); + p = xdr_decode_nfstime3(p, &fattr->mtime); + xdr_decode_nfstime3(p, &fattr->ctime); + 
fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); + + fattr->valid |= NFS_ATTR_FATTR_V3; return 0; +out_uid: + dprintk("NFS: returned invalid uid\n"); + return -EINVAL; +out_gid: + dprintk("NFS: returned invalid gid\n"); + return -EINVAL; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode SYMLINK arguments + * post_op_attr + * + * union post_op_attr switch (bool attributes_follow) { + * case TRUE: + * fattr3 attributes; + * case FALSE: + * void; + * }; */ -static int -nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args) +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) { - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_sattr(p, args->sattr); - *p++ = htonl(args->pathlen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + __be32 *p; - /* Copy the page */ - xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_fattr3(xdr, fattr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode MKNOD arguments + * wcc_attr + * struct wcc_attr { + * size3 size; + * nfstime3 mtime; + * nfstime3 ctime; + * }; */ -static int -nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args) -{ - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_array(p, args->name, args->len); - *p++ = htonl(args->type); - p = xdr_encode_sattr(p, args->sattr); - if (args->type == NF3CHR || args->type == NF3BLK) { - *p++ = htonl(MAJOR(args->rdev)); - *p++ = htonl(MINOR(args->rdev)); - } +static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); + if (unlikely(p == NULL)) + goto out_overflow; + + fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | 
NFS_ATTR_FATTR_PRECHANGE + | NFS_ATTR_FATTR_PREMTIME + | NFS_ATTR_FATTR_PRECTIME; + + p = xdr_decode_size3(p, &fattr->pre_size); + p = xdr_decode_nfstime3(p, &fattr->pre_mtime); + xdr_decode_nfstime3(p, &fattr->pre_ctime); + fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode RENAME arguments + * pre_op_attr + * union pre_op_attr switch (bool attributes_follow) { + * case TRUE: + * wcc_attr attributes; + * case FALSE: + * void; + * }; + * + * wcc_data + * + * struct wcc_data { + * pre_op_attr before; + * post_op_attr after; + * }; */ -static int -nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args) -{ - p = xdr_encode_fhandle(p, args->fromfh); - p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_wcc_attr(xdr, fattr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) +{ + int error; + + error = decode_pre_op_attr(xdr, fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, fattr); +out: + return error; } /* - * Encode LINK arguments + * post_op_fh3 + * + * union post_op_fh3 switch (bool handle_follows) { + * case TRUE: + * nfs_fh3 handle; + * case FALSE: + * void; + * }; */ -static int -nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) +static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) { - p = xdr_encode_fhandle(p, args->fromfh); - 
p = xdr_encode_fhandle(p, args->tofh); - p = xdr_encode_array(p, args->toname, args->tolen); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) + return decode_nfs_fh3(xdr, fh); + zero_nfs_fh3(fh); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* - * Encode arguments to readdir call + * diropargs3 + * + * struct diropargs3 { + * nfs_fh3 dir; + * filename3 name; + * }; */ -static int -nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) +static void encode_diropargs3(struct xdr_stream *xdr, const struct nfs_fh *fh, + const char *name, u32 length) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - unsigned int replen; - u32 count = args->count; - - p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->cookie); - *p++ = args->verf[0]; - *p++ = args->verf[1]; - if (args->plus) { - /* readdirplus: need dircount + buffer size. - * We just make sure we make dircount big enough */ - *p++ = htonl(count >> 3); - } - *p++ = htonl(count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readdirres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, count); - return 0; + encode_nfs_fh3(xdr, fh); + encode_filename3(xdr, name, length); } + /* - * Decode the result of a readdir call. - * We just check for syntactical correctness. + * NFSv3 XDR encode functions + * + * NFSv3 argument types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". 
*/ -static int -nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res) -{ - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - struct page **page; - size_t hdrlen; - u32 len, recvd, pglen; - int status, nr; - __be32 *entry, *end, *kaddr; - - status = ntohl(*p++); - /* Decode post_op_attrs */ - p = xdr_decode_post_op_attr(p, res->dir_attr); - if (status) - return -nfs_stat_to_errno(status); - /* Decode verifier cookie */ - if (res->verf) { - res->verf[0] = *p++; - res->verf[1] = *p++; - } else { - p += 2; - } - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READDIR reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READDIR header is short. iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } +/* + * 3.3.1 GETATTR3args + * + * struct GETATTR3args { + * nfs_fh3 object; + * }; + */ +static void nfs3_xdr_enc_getattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_fh *fh) +{ + encode_nfs_fh3(xdr, fh); +} - pglen = rcvbuf->page_len; - recvd = rcvbuf->len - hdrlen; - if (pglen > recvd) - pglen = recvd; - page = rcvbuf->pages; - kaddr = p = kmap_atomic(*page, KM_USER0); - end = (__be32 *)((char *)p + pglen); - entry = p; - for (nr = 0; *p++; nr++) { - if (p + 3 > end) - goto short_pkt; - p += 2; /* inode # */ - len = ntohl(*p++); /* string length */ - p += XDR_QUADLEN(len) + 2; /* name + cookie */ - if (len > NFS3_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)!\n", - len); - goto err_unmap; - } - - if (res->plus) { - /* post_op_attr */ - if (p + 2 > end) - goto short_pkt; - if (*p++) { - p += 21; - if (p + 1 > end) - goto short_pkt; - } - /* post_op_fh3 */ - if (*p++) { - if (p + 1 > end) - goto short_pkt; - len = ntohl(*p++); - if (len > NFS3_FHSIZE) { - dprintk("NFS: giant filehandle in " - "readdir (len 
0x%x)!\n", len); - goto err_unmap; - } - p += XDR_QUADLEN(len); - } - } +/* + * 3.3.2 SETATTR3args + * + * union sattrguard3 switch (bool check) { + * case TRUE: + * nfstime3 obj_ctime; + * case FALSE: + * void; + * }; + * + * struct SETATTR3args { + * nfs_fh3 object; + * sattr3 new_attributes; + * sattrguard3 guard; + * }; + */ +static void encode_sattrguard3(struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) +{ + __be32 *p; - if (p + 2 > end) - goto short_pkt; - entry = p; - } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; - out: - kunmap_atomic(kaddr, KM_USER0); - return nr; - short_pkt: - entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; + if (args->guard) { + p = xdr_reserve_space(xdr, 4 + 8); + *p++ = xdr_one; + xdr_encode_nfstime3(p, &args->guardtime); + } else { + p = xdr_reserve_space(xdr, 4); + *p = xdr_zero; } - goto out; -err_unmap: - nr = -errno_NFSERR_IO; - goto out; } -__be32 * -nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_sattrargs *args) { - struct nfs_entry old = *entry; + encode_nfs_fh3(xdr, args->fh); + encode_sattr3(xdr, args->sattr); + encode_sattrguard3(xdr, args); +} - if (!*p++) { - if (!*p) - return ERR_PTR(-EAGAIN); - entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); - } +/* + * 3.3.3 LOOKUP3args + * + * struct LOOKUP3args { + * diropargs3 what; + * }; + */ +static void nfs3_xdr_enc_lookup3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_diropargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); +} - p = xdr_decode_hyper(p, &entry->ino); - entry->len = ntohl(*p++); - entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); - entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); +/* + * 3.3.4 ACCESS3args + * + * struct ACCESS3args { + * 
nfs_fh3 object; + * uint32 access; + * }; + */ +static void encode_access3args(struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->access); +} - if (plus) { - entry->fattr->valid = 0; - p = xdr_decode_post_op_attr(p, entry->fattr); - /* In fact, a post_op_fh3: */ - if (*p++) { - p = xdr_decode_fhandle(p, entry->fh); - /* Ugh -- server reply was truncated */ - if (p == NULL) { - dprintk("NFS: FH truncated\n"); - *entry = old; - return ERR_PTR(-EAGAIN); - } - } else - memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); - } +static void nfs3_xdr_enc_access3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_accessargs *args) +{ + encode_access3args(xdr, args); +} - entry->eof = !p[0] && p[1]; - return p; +/* + * 3.3.5 READLINK3args + * + * struct READLINK3args { + * nfs_fh3 symlink; + * }; + */ +static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readlinkargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + prepare_reply_buffer(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); } /* - * Encode COMMIT arguments + * 3.3.6 READ3args + * + * struct READ3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; */ -static int -nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void encode_read3args(struct xdr_stream *xdr, + const struct nfs_pgio_args *args) { - p = xdr_encode_fhandle(p, args->fh); + __be32 *p; + + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + 4); p = xdr_encode_hyper(p, args->offset); - *p++ = htonl(args->count); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); - return 0; + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + encode_read3args(xdr, args); + prepare_reply_buffer(req, args->pages, args->pgbase, + 
args->count, NFS3_readres_sz); + req->rq_rcv_buf.flags |= XDRBUF_READ; } -#ifdef CONFIG_NFS_V3_ACL /* - * Encode GETACL arguments + * 3.3.7 WRITE3args + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * stable_how stable; + * opaque data<>; + * }; */ -static int -nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, - struct nfs3_getaclargs *args) +static void encode_write3args(struct xdr_stream *xdr, + const struct nfs_pgio_args *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - unsigned int replen; + __be32 *p; - p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->mask); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + encode_nfs_fh3(xdr, args->fh); - if (args->mask & (NFS_ACL | NFS_DFACL)) { - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + - ACL3_getaclres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, - NFSACL_MAXPAGES << PAGE_SHIFT); - } - return 0; + p = xdr_reserve_space(xdr, 8 + 4 + 4 + 4); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->count); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); +} + +static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_pgio_args *args) +{ + encode_write3args(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* - * Encode SETACL arguments + * 3.3.8 CREATE3args + * + * enum createmode3 { + * UNCHECKED = 0, + * GUARDED = 1, + * EXCLUSIVE = 2 + * }; + * + * union createhow3 switch (createmode3 mode) { + * case UNCHECKED: + * case GUARDED: + * sattr3 obj_attributes; + * case EXCLUSIVE: + * createverf3 verf; + * }; + * + * struct CREATE3args { + * diropargs3 where; + * createhow3 how; + * }; */ -static int -nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, - struct 
nfs3_setaclargs *args) -{ - struct xdr_buf *buf = &req->rq_snd_buf; - unsigned int base, len_in_head, len = nfsacl_size( - (args->mask & NFS_ACL) ? args->acl_access : NULL, - (args->mask & NFS_DFACL) ? args->acl_default : NULL); - int count, err; - - p = xdr_encode_fhandle(p, NFS_FH(args->inode)); - *p++ = htonl(args->mask); - base = (char *)p - (char *)buf->head->iov_base; - /* put as much of the acls into head as possible. */ - len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); - len -= len_in_head; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); - - for (count = 0; (count << PAGE_SHIFT) < len; count++) { - args->pages[count] = alloc_page(GFP_KERNEL); - if (!args->pages[count]) { - while (count) - __free_page(args->pages[--count]); - return -ENOMEM; - } +static void encode_createhow3(struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_uint32(xdr, args->createmode); + switch (args->createmode) { + case NFS3_CREATE_UNCHECKED: + case NFS3_CREATE_GUARDED: + encode_sattr3(xdr, args->sattr); + break; + case NFS3_CREATE_EXCLUSIVE: + encode_createverf3(xdr, args->verifier); + break; + default: + BUG(); } - xdr_encode_pages(buf, args->pages, 0, len); +} - err = nfsacl_encode(buf, base, args->inode, - (args->mask & NFS_ACL) ? - args->acl_access : NULL, 1, 0); - if (err > 0) - err = nfsacl_encode(buf, base + err, args->inode, - (args->mask & NFS_DFACL) ? - args->acl_default : NULL, 1, - NFS_ACL_DEFAULT); - return (err > 0) ? 
0 : err; +static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_createargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_createhow3(xdr, args); } -#endif /* CONFIG_NFS_V3_ACL */ /* - * NFS XDR decode functions + * 3.3.9 MKDIR3args + * + * struct MKDIR3args { + * diropargs3 where; + * sattr3 attributes; + * }; */ +static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mkdirargs *args) +{ + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_sattr3(xdr, args->sattr); +} /* - * Decode attrstat reply. + * 3.3.10 SYMLINK3args + * + * struct symlinkdata3 { + * sattr3 symlink_attributes; + * nfspath3 symlink_data; + * }; + * + * struct SYMLINK3args { + * diropargs3 where; + * symlinkdata3 symlink; + * }; */ -static int -nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static void encode_symlinkdata3(struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) { - int status; + encode_sattr3(xdr, args->sattr); + encode_nfspath3(xdr, args->pages, args->pathlen); +} - if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); - xdr_decode_fattr(p, fattr); - return 0; +static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_symlinkargs *args) +{ + encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); + encode_symlinkdata3(xdr, args); } /* - * Decode status+wcc_data reply - * SATTR, REMOVE, RMDIR + * 3.3.11 MKNOD3args + * + * struct devicedata3 { + * sattr3 dev_attributes; + * specdata3 spec; + * }; + * + * union mknoddata3 switch (ftype3 type) { + * case NF3CHR: + * case NF3BLK: + * devicedata3 device; + * case NF3SOCK: + * case NF3FIFO: + * sattr3 pipe_attributes; + * default: + * void; + * }; + * + * struct MKNOD3args { + * diropargs3 where; + * mknoddata3 what; + * }; */ -static int -nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 
*p, struct nfs_fattr *fattr) +static void encode_devicedata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) { - int status; + encode_sattr3(xdr, args->sattr); + encode_specdata3(xdr, args->rdev); +} - if ((status = ntohl(*p++))) - status = -nfs_stat_to_errno(status); - xdr_decode_wcc_data(p, fattr); - return status; +static void encode_mknoddata3(struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) +{ + encode_ftype3(xdr, args->type); + switch (args->type) { + case NF3CHR: + case NF3BLK: + encode_devicedata3(xdr, args); + break; + case NF3SOCK: + case NF3FIFO: + encode_sattr3(xdr, args->sattr); + break; + case NF3REG: + case NF3DIR: + break; + default: + BUG(); + } } -static int -nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) +static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_mknodargs *args) { - return nfs3_xdr_wccstat(req, p, &res->dir_attr); + encode_diropargs3(xdr, args->fh, args->name, args->len); + encode_mknoddata3(xdr, args); } /* - * Decode LOOKUP reply + * 3.3.12 REMOVE3args + * + * struct REMOVE3args { + * diropargs3 object; + * }; */ -static int -nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) +static void nfs3_xdr_enc_remove3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_removeargs *args) { - int status; - - if ((status = ntohl(*p++))) { - status = -nfs_stat_to_errno(status); - } else { - if (!(p = xdr_decode_fhandle(p, res->fh))) - return -errno_NFSERR_IO; - p = xdr_decode_post_op_attr(p, res->fattr); - } - xdr_decode_post_op_attr(p, res->dir_attr); - return status; + encode_diropargs3(xdr, args->fh, args->name.name, args->name.len); } /* - * Decode ACCESS reply + * 3.3.14 RENAME3args + * + * struct RENAME3args { + * diropargs3 from; + * diropargs3 to; + * }; */ -static int -nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) +static void 
nfs3_xdr_enc_rename3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - int status = ntohl(*p++); + const struct qstr *old = args->old_name; + const struct qstr *new = args->new_name; - p = xdr_decode_post_op_attr(p, res->fattr); - if (status) - return -nfs_stat_to_errno(status); - res->access = ntohl(*p++); - return 0; + encode_diropargs3(xdr, args->old_dir, old->name, old->len); + encode_diropargs3(xdr, args->new_dir, new->name, new->len); } -static int -nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) +/* + * 3.3.15 LINK3args + * + * struct LINK3args { + * nfs_fh3 file; + * diropargs3 link; + * }; + */ +static void nfs3_xdr_enc_link3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_linkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - unsigned int replen; + encode_nfs_fh3(xdr, args->fromfh); + encode_diropargs3(xdr, args->tofh, args->toname, args->tolen); +} - p = xdr_encode_fhandle(p, args->fh); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); +/* + * 3.3.16 READDIR3args + * + * struct READDIR3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 count; + * }; + */ +static void encode_readdir3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + __be32 *p; - /* Inline the page array */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readlinkres_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); - return 0; + encode_nfs_fh3(xdr, args->fh); + + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); + *p = cpu_to_be32(args->count); +} + +static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdir3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + 
args->count, NFS3_readdirres_sz); } /* - * Decode READLINK reply + * 3.3.17 READDIRPLUS3args + * + * struct READDIRPLUS3args { + * nfs_fh3 dir; + * cookie3 cookie; + * cookieverf3 cookieverf; + * count3 dircount; + * count3 maxcount; + * }; */ -static int -nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static void encode_readdirplus3args(struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; - u32 len, recvd; - char *kaddr; - int status; + __be32 *p; - status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, fattr); + encode_nfs_fh3(xdr, args->fh); - if (status != 0) - return -nfs_stat_to_errno(status); + p = xdr_reserve_space(xdr, 8 + NFS3_COOKIEVERFSIZE + 4 + 4); + p = xdr_encode_cookie3(p, args->cookie); + p = xdr_encode_cookieverf3(p, args->verf); - /* Convert length of symlink */ - len = ntohl(*p++); - if (len >= rcvbuf->page_len) { - dprintk("nfs: server returned giant symlink!\n"); - return -ENAMETOOLONG; - } + /* + * readdirplus: need dircount + buffer size. + * We just make sure we make dircount big enough + */ + *p++ = cpu_to_be32(args->count >> 3); - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READLINK reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READLINK header is short. 
" - "iovec will be shifted.\n"); - xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); - } - recvd = req->rq_rcv_buf.len - hdrlen; - if (recvd < len) { - dprintk("NFS: server cheating in readlink reply: " - "count %u > recvd %u\n", len, recvd); - return -EIO; - } + *p = cpu_to_be32(args->count); +} - /* NULL terminate the string we got */ - kaddr = (char*)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); - return 0; +static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_readdirargs *args) +{ + encode_readdirplus3args(xdr, args); + prepare_reply_buffer(req, args->pages, 0, + args->count, NFS3_readdirres_sz); } /* - * Decode READ reply + * 3.3.21 COMMIT3args + * + * struct COMMIT3args { + * nfs_fh3 file; + * offset3 offset; + * count3 count; + * }; */ -static int -nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) +static void encode_commit3args(struct xdr_stream *xdr, + const struct nfs_commitargs *args) { - struct kvec *iov = req->rq_rcv_buf.head; - size_t hdrlen; - u32 count, ocount, recvd; - int status; + __be32 *p; - status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, res->fattr); + encode_nfs_fh3(xdr, args->fh); - if (status != 0) - return -nfs_stat_to_errno(status); + p = xdr_reserve_space(xdr, 8 + 4); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); +} - /* Decode reply count and EOF flag. NFSv3 is somewhat redundant - * in that it puts the count both in the res struct and in the - * opaque data count. 
*/ - count = ntohl(*p++); - res->eof = ntohl(*p++); - ocount = ntohl(*p++); +static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs_commitargs *args) +{ + encode_commit3args(xdr, args); +} - if (ocount != count) { - dprintk("NFS: READ count doesn't match RPC opaque count.\n"); - return -errno_NFSERR_IO; - } +#ifdef CONFIG_NFS_V3_ACL - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - if (iov->iov_len < hdrlen) { - dprintk("NFS: READ reply header overflowed:" - "length %Zu > %Zu\n", hdrlen, iov->iov_len); - return -errno_NFSERR_IO; - } else if (iov->iov_len != hdrlen) { - dprintk("NFS: READ header is short. iovec will be shifted.\n"); - xdr_shift_buf(&req->rq_rcv_buf, iov->iov_len - hdrlen); - } +static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_getaclargs *args) +{ + encode_nfs_fh3(xdr, args->fh); + encode_uint32(xdr, args->mask); + if (args->mask & (NFS_ACL | NFS_DFACL)) + prepare_reply_buffer(req, args->pages, 0, + NFSACL_MAXPAGES << PAGE_SHIFT, + ACL3_getaclres_sz); +} - recvd = req->rq_rcv_buf.len - hdrlen; - if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - count = recvd; - res->eof = 0; - } +static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs3_setaclargs *args) +{ + unsigned int base; + int error; - if (count < res->count) - res->count = count; + encode_nfs_fh3(xdr, NFS_FH(args->inode)); + encode_uint32(xdr, args->mask); - return count; + base = req->rq_slen; + if (args->npages != 0) + xdr_write_pages(xdr, args->pages, 0, args->len); + else + xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); + + error = nfsacl_encode(xdr->buf, base, args->inode, + (args->mask & NFS_ACL) ? + args->acl_access : NULL, 1, 0); + /* FIXME: this is just broken */ + BUG_ON(error < 0); + error = nfsacl_encode(xdr->buf, base + error, args->inode, + (args->mask & NFS_DFACL) ? 
+ args->acl_default : NULL, 1, + NFS_ACL_DEFAULT); + BUG_ON(error < 0); } +#endif /* CONFIG_NFS_V3_ACL */ + /* - * Decode WRITE response + * NFSv3 XDR decode functions + * + * NFSv3 result types are defined in section 3.3 of RFC 1813: + * "NFS Version 3 Protocol Specification". */ -static int -nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) -{ - int status; - status = ntohl(*p++); - p = xdr_decode_wcc_data(p, res->fattr); +/* + * 3.3.1 GETATTR3res + * + * struct GETATTR3resok { + * fattr3 obj_attributes; + * }; + * + * union GETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * GETATTR3resok resok; + * default: + * void; + * }; + */ +static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_fattr3(xdr, result); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); +} - if (status != 0) - return -nfs_stat_to_errno(status); +/* + * 3.3.2 SETATTR3res + * + * struct SETATTR3resok { + * wcc_data obj_wcc; + * }; + * + * struct SETATTR3resfail { + * wcc_data obj_wcc; + * }; + * + * union SETATTR3res switch (nfsstat3 status) { + * case NFS3_OK: + * SETATTR3resok resok; + * default: + * SETATTR3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} - res->count = ntohl(*p++); - res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); - res->verf->verifier[0] = *p++; - res->verf->verifier[1] = *p++; 
+/* + * 3.3.3 LOOKUP3res + * + * struct LOOKUP3resok { + * nfs_fh3 object; + * post_op_attr obj_attributes; + * post_op_attr dir_attributes; + * }; + * + * struct LOOKUP3resfail { + * post_op_attr dir_attributes; + * }; + * + * union LOOKUP3res switch (nfsstat3 status) { + * case NFS3_OK: + * LOOKUP3resok resok; + * default: + * LOOKUP3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfs_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->dir_attr); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs3_stat_to_errno(status); +} - return res->count; +/* + * 3.3.4 ACCESS3res + * + * struct ACCESS3resok { + * post_op_attr obj_attributes; + * uint32 access; + * }; + * + * struct ACCESS3resfail { + * post_op_attr obj_attributes; + * }; + * + * union ACCESS3res switch (nfsstat3 status) { + * case NFS3_OK: + * ACCESS3resok resok; + * default: + * ACCESS3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_accessres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_uint32(xdr, &result->access); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); } /* - * Decode a CREATE response + * 3.3.5 READLINK3res + * + * struct READLINK3resok { + * post_op_attr symlink_attributes; + * 
nfspath3 data; + * }; + * + * struct READLINK3resfail { + * post_op_attr symlink_attributes; + * }; + * + * union READLINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * READLINK3resok resok; + * default: + * READLINK3resfail resfail; + * }; */ -static int -nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) -{ - int status; - - status = ntohl(*p++); - if (status == 0) { - if (*p++) { - if (!(p = xdr_decode_fhandle(p, res->fh))) - return -errno_NFSERR_IO; - p = xdr_decode_post_op_attr(p, res->fattr); - } else { - memset(res->fh, 0, sizeof(*res->fh)); - /* Do decode post_op_attr but set it to NULL */ - p = xdr_decode_post_op_attr(p, res->fattr); - res->fattr->valid = 0; - } - } else { - status = -nfs_stat_to_errno(status); - } - p = xdr_decode_wcc_data(p, res->dir_attr); - return status; +static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_nfspath3(xdr); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); } /* - * Decode RENAME reply + * 3.3.6 READ3res + * + * struct READ3resok { + * post_op_attr file_attributes; + * count3 count; + * bool eof; + * opaque data<>; + * }; + * + * struct READ3resfail { + * post_op_attr file_attributes; + * }; + * + * union READ3res switch (nfsstat3 status) { + * case NFS3_OK: + * READ3resok resok; + * default: + * READ3resfail resfail; + * }; */ -static int -nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) +static int decode_read3resok(struct xdr_stream *xdr, + struct nfs_pgio_res *result) { - int status; + u32 eof, count, ocount, recvd; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4 + 4); + if (unlikely(p == NULL)) + goto 
out_overflow; + count = be32_to_cpup(p++); + eof = be32_to_cpup(p++); + ocount = be32_to_cpup(p++); + if (unlikely(ocount != count)) + goto out_mismatch; + recvd = xdr_read_pages(xdr, count); + if (unlikely(count > recvd)) + goto out_cheating; +out: + result->eof = eof; + result->count = count; + return count; +out_mismatch: + dprintk("NFS: READ count doesn't match length of opaque: " + "count %u != ocount %u\n", count, ocount); + return -EIO; +out_cheating: + dprintk("NFS: server cheating in read result: " + "count %u > recvd %u\n", count, recvd); + count = recvd; + eof = 0; + goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} - if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); - p = xdr_decode_wcc_data(p, res->fromattr); - p = xdr_decode_wcc_data(p, res->toattr); - return status; +static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_read3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); } /* - * Decode LINK reply + * 3.3.7 WRITE3res + * + * enum stable_how { + * UNSTABLE = 0, + * DATA_SYNC = 1, + * FILE_SYNC = 2 + * }; + * + * struct WRITE3resok { + * wcc_data file_wcc; + * count3 count; + * stable_how committed; + * writeverf3 verf; + * }; + * + * struct WRITE3resfail { + * wcc_data file_wcc; + * }; + * + * union WRITE3res switch (nfsstat3 status) { + * case NFS3_OK: + * WRITE3resok resok; + * default: + * WRITE3resfail resfail; + * }; */ -static int -nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) +static int decode_write3resok(struct xdr_stream *xdr, + struct nfs_pgio_res *result) { - int status; + __be32 *p; + + p = 
xdr_inline_decode(xdr, 4 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + result->count = be32_to_cpup(p++); + result->verf->committed = be32_to_cpup(p++); + if (unlikely(result->verf->committed > NFS_FILE_SYNC)) + goto out_badvalue; + if (decode_writeverf3(xdr, &result->verf->verifier)) + goto out_eio; + return result->count; +out_badvalue: + dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); +out_eio: + return -EIO; +} - if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); - p = xdr_decode_post_op_attr(p, res->fattr); - p = xdr_decode_wcc_data(p, res->dir_attr); - return status; +static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_res *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_write3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); } /* - * Decode FSSTAT reply + * 3.3.8 CREATE3res + * + * struct CREATE3resok { + * post_op_fh3 obj; + * post_op_attr obj_attributes; + * wcc_data dir_wcc; + * }; + * + * struct CREATE3resfail { + * wcc_data dir_wcc; + * }; + * + * union CREATE3res switch (nfsstat3 status) { + * case NFS3_OK: + * CREATE3resok resok; + * default: + * CREATE3resfail resfail; + * }; */ -static int -nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) +static int decode_create3resok(struct xdr_stream *xdr, + struct nfs3_diropres *result) { - int status; - - status = ntohl(*p++); + int error; + + error = decode_post_op_fh3(xdr, result->fh); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + /* The server isn't required to return a file handle. 
+ * If it didn't, force the client to perform a LOOKUP + * to determine the correct file handle and attribute + * values for the new object. */ + if (result->fh->size == 0) + result->fattr->valid = 0; + error = decode_wcc_data(xdr, result->dir_attr); +out: + return error; +} - p = xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return -nfs_stat_to_errno(status); +static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_diropres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_create3resok(xdr, result); +out: + return error; +out_default: + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs3_stat_to_errno(status); +} - p = xdr_decode_hyper(p, &res->tbytes); - p = xdr_decode_hyper(p, &res->fbytes); - p = xdr_decode_hyper(p, &res->abytes); - p = xdr_decode_hyper(p, &res->tfiles); - p = xdr_decode_hyper(p, &res->ffiles); - p = xdr_decode_hyper(p, &res->afiles); +/* + * 3.3.12 REMOVE3res + * + * struct REMOVE3resok { + * wcc_data dir_wcc; + * }; + * + * struct REMOVE3resfail { + * wcc_data dir_wcc; + * }; + * + * union REMOVE3res switch (nfsstat3 status) { + * case NFS3_OK: + * REMOVE3resok resok; + * default: + * REMOVE3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_removeres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} - /* ignore invarsec */ - return 0; +/* + * 3.3.14 RENAME3res + * + * struct RENAME3resok { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * 
struct RENAME3resfail { + * wcc_data fromdir_wcc; + * wcc_data todir_wcc; + * }; + * + * union RENAME3res switch (nfsstat3 status) { + * case NFS3_OK: + * RENAME3resok resok; + * default: + * RENAME3resfail resfail; + * }; + */ +static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_renameres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->old_fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->new_fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs3_stat_to_errno(status); } /* - * Decode FSINFO reply + * 3.3.15 LINK3res + * + * struct LINK3resok { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * struct LINK3resfail { + * post_op_attr file_attributes; + * wcc_data linkdir_wcc; + * }; + * + * union LINK3res switch (nfsstat3 status) { + * case NFS3_OK: + * LINK3resok resok; + * default: + * LINK3resfail resfail; + * }; */ -static int -nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) +static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs3_linkres *result) { - int status; + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} - status = ntohl(*p++); +/** + * nfs3_decode_dirent - Decode a single NFSv3 directory entry stored in + * the local page cache + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating 
whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + * + * 3.3.16 entry3 + * + * struct entry3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * fhandle3 filehandle; + * post_op_attr3 attributes; + * entry3 *nextentry; + * }; + * + * 3.3.17 entryplus3 + * struct entryplus3 { + * fileid3 fileid; + * filename3 name; + * cookie3 cookie; + * post_op_attr name_attributes; + * post_op_fh3 name_handle; + * entryplus3 *nextentry; + * }; + */ +int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + struct nfs_entry old = *entry; + __be32 *p; + int error; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; + entry->eof = 1; + return -EBADCOOKIE; + } - p = xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return -nfs_stat_to_errno(status); + error = decode_fileid3(xdr, &entry->ino); + if (unlikely(error)) + return error; - res->rtmax = ntohl(*p++); - res->rtpref = ntohl(*p++); - res->rtmult = ntohl(*p++); - res->wtmax = ntohl(*p++); - res->wtpref = ntohl(*p++); - res->wtmult = ntohl(*p++); - res->dtpref = ntohl(*p++); - p = xdr_decode_hyper(p, &res->maxfilesize); + error = decode_inline_filename3(xdr, &entry->name, &entry->len); + if (unlikely(error)) + return error; + + entry->prev_cookie = entry->cookie; + error = decode_cookie3(xdr, &entry->cookie); + if (unlikely(error)) + return error; + + entry->d_type = DT_UNKNOWN; + + if (plus) { + entry->fattr->valid = 0; + error = decode_post_op_attr(xdr, entry->fattr); + if (unlikely(error)) + return error; + if (entry->fattr->valid & 
NFS_ATTR_FATTR_V3) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + /* In fact, a post_op_fh3: */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(p == NULL)) + goto out_overflow; + if (*p != xdr_zero) { + error = decode_nfs_fh3(xdr, entry->fh); + if (unlikely(error)) { + if (error == -E2BIG) + goto out_truncated; + return error; + } + } else + zero_nfs_fh3(entry->fh); + } - /* ignore time_delta and properties */ - res->lease_time = 0; return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; +out_truncated: + dprintk("NFS: directory entry contains invalid file handle\n"); + *entry = old; + return -EAGAIN; } /* - * Decode PATHCONF reply + * 3.3.16 READDIR3res + * + * struct dirlist3 { + * entry3 *entries; + * bool eof; + * }; + * + * struct READDIR3resok { + * post_op_attr dir_attributes; + * cookieverf3 cookieverf; + * dirlist3 reply; + * }; + * + * struct READDIR3resfail { + * post_op_attr dir_attributes; + * }; + * + * union READDIR3res switch (nfsstat3 status) { + * case NFS3_OK: + * READDIR3resok resok; + * default: + * READDIR3resfail resfail; + * }; + * + * Read the directory contents into the page cache, but otherwise + * don't touch them. The actual decoding is done by nfs3_decode_entry() + * during subsequent nfs_readdir() calls. */ -static int -nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) +static int decode_dirlist3(struct xdr_stream *xdr) { - int status; + return xdr_read_pages(xdr, xdr->buf->page_len); +} - status = ntohl(*p++); +static int decode_readdir3resok(struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + int error; + + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + /* XXX: do we need to check if result->verf != NULL ? 
*/ + error = decode_cookieverf3(xdr, result->verf); + if (unlikely(error)) + goto out; + error = decode_dirlist3(xdr); +out: + return error; +} - p = xdr_decode_post_op_attr(p, res->fattr); - if (status != 0) - return -nfs_stat_to_errno(status); - res->max_link = ntohl(*p++); - res->max_namelen = ntohl(*p++); +static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_readdirres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_readdir3resok(xdr, result); +out: + return error; +out_default: + error = decode_post_op_attr(xdr, result->dir_attr); + if (unlikely(error)) + goto out; + return nfs3_stat_to_errno(status); +} - /* ignore remaining fields */ +/* + * 3.3.18 FSSTAT3res + * + * struct FSSTAT3resok { + * post_op_attr obj_attributes; + * size3 tbytes; + * size3 fbytes; + * size3 abytes; + * size3 tfiles; + * size3 ffiles; + * size3 afiles; + * uint32 invarsec; + * }; + * + * struct FSSTAT3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSSTAT3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSSTAT3resok resok; + * default: + * FSSTAT3resfail resfail; + * }; + */ +static int decode_fsstat3resok(struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 8 * 6 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + p = xdr_decode_size3(p, &result->tbytes); + p = xdr_decode_size3(p, &result->fbytes); + p = xdr_decode_size3(p, &result->abytes); + p = xdr_decode_size3(p, &result->tfiles); + p = xdr_decode_size3(p, &result->ffiles); + xdr_decode_size3(p, &result->afiles); + /* ignore invarsec */ return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsstat *result) +{ + enum nfs_stat status; + int error; + + error 
= decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsstat3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); } /* - * Decode COMMIT reply + * 3.3.19 FSINFO3res + * + * struct FSINFO3resok { + * post_op_attr obj_attributes; + * uint32 rtmax; + * uint32 rtpref; + * uint32 rtmult; + * uint32 wtmax; + * uint32 wtpref; + * uint32 wtmult; + * uint32 dtpref; + * size3 maxfilesize; + * nfstime3 time_delta; + * uint32 properties; + * }; + * + * struct FSINFO3resfail { + * post_op_attr obj_attributes; + * }; + * + * union FSINFO3res switch (nfsstat3 status) { + * case NFS3_OK: + * FSINFO3resok resok; + * default: + * FSINFO3resfail resfail; + * }; */ -static int -nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) +static int decode_fsinfo3resok(struct xdr_stream *xdr, + struct nfs_fsinfo *result) { - int status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); + if (unlikely(p == NULL)) + goto out_overflow; + result->rtmax = be32_to_cpup(p++); + result->rtpref = be32_to_cpup(p++); + result->rtmult = be32_to_cpup(p++); + result->wtmax = be32_to_cpup(p++); + result->wtpref = be32_to_cpup(p++); + result->wtmult = be32_to_cpup(p++); + result->dtpref = be32_to_cpup(p++); + p = xdr_decode_size3(p, &result->maxfilesize); + xdr_decode_nfstime3(p, &result->time_delta); + + /* ignore properties */ + result->lease_time = 0; + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} - status = ntohl(*p++); - p = xdr_decode_wcc_data(p, res->fattr); - if (status != 0) - return -nfs_stat_to_errno(status); +static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fsinfo *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + 
goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_fsinfo3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +/* + * 3.3.20 PATHCONF3res + * + * struct PATHCONF3resok { + * post_op_attr obj_attributes; + * uint32 linkmax; + * uint32 name_max; + * bool no_trunc; + * bool chown_restricted; + * bool case_insensitive; + * bool case_preserving; + * }; + * + * struct PATHCONF3resfail { + * post_op_attr obj_attributes; + * }; + * + * union PATHCONF3res switch (nfsstat3 status) { + * case NFS3_OK: + * PATHCONF3resok resok; + * default: + * PATHCONF3resfail resfail; + * }; + */ +static int decode_pathconf3resok(struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + __be32 *p; - res->verf->verifier[0] = *p++; - res->verf->verifier[1] = *p++; + p = xdr_inline_decode(xdr, 4 * 6); + if (unlikely(p == NULL)) + goto out_overflow; + result->max_link = be32_to_cpup(p++); + result->max_namelen = be32_to_cpup(p); + /* ignore remaining fields */ return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_pathconf *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_pathconf3resok(xdr, result); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); } -#ifdef CONFIG_NFS_V3_ACL /* - * Decode GETACL reply + * 3.3.21 COMMIT3res + * + * struct COMMIT3resok { + * wcc_data file_wcc; + * writeverf3 verf; + * }; + * + * struct COMMIT3resfail { + * wcc_data file_wcc; + * }; + * + * union COMMIT3res switch (nfsstat3 status) { + * case NFS3_OK: + * COMMIT3resok resok; + * default: + * 
COMMIT3resfail resfail; + * }; */ -static int -nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, - struct nfs3_getaclres *res) +static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_commitres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + error = decode_wcc_data(xdr, result->fattr); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_status; + error = decode_writeverf3(xdr, &result->verf->verifier); +out: + return error; +out_status: + return nfs3_stat_to_errno(status); +} + +#ifdef CONFIG_NFS_V3_ACL + +static inline int decode_getacl3resok(struct xdr_stream *xdr, + struct nfs3_getaclres *result) { - struct xdr_buf *buf = &req->rq_rcv_buf; - int status = ntohl(*p++); struct posix_acl **acl; unsigned int *aclcnt; - int err, base; - - if (status != 0) - return -nfs_stat_to_errno(status); - p = xdr_decode_post_op_attr(p, res->fattr); - res->mask = ntohl(*p++); - if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) - return -EINVAL; - base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; + size_t hdrlen; + int error; + + error = decode_post_op_attr(xdr, result->fattr); + if (unlikely(error)) + goto out; + error = decode_uint32(xdr, &result->mask); + if (unlikely(error)) + goto out; + error = -EINVAL; + if (result->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + goto out; + + hdrlen = xdr_stream_pos(xdr); + + acl = NULL; + if (result->mask & NFS_ACL) + acl = &result->acl_access; + aclcnt = NULL; + if (result->mask & NFS_ACLCNT) + aclcnt = &result->acl_access_count; + error = nfsacl_decode(xdr->buf, hdrlen, aclcnt, acl); + if (unlikely(error <= 0)) + goto out; + + acl = NULL; + if (result->mask & NFS_DFACL) + acl = &result->acl_default; + aclcnt = NULL; + if (result->mask & NFS_DFACLCNT) + aclcnt = &result->acl_default_count; + error = nfsacl_decode(xdr->buf, hdrlen + error, aclcnt, acl); + if 
(unlikely(error <= 0)) + return error; + error = 0; +out: + return error; +} - acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; - aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; - err = nfsacl_decode(buf, base, aclcnt, acl); +static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs3_getaclres *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_getacl3resok(xdr, result); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); +} - acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; - aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; - if (err > 0) - err = nfsacl_decode(buf, base + err, aclcnt, acl); - return (err > 0) ? 0 : err; +static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_fattr *result) +{ + enum nfs_stat status; + int error; + + error = decode_nfsstat3(xdr, &status); + if (unlikely(error)) + goto out; + if (status != NFS3_OK) + goto out_default; + error = decode_post_op_attr(xdr, result); +out: + return error; +out_default: + return nfs3_stat_to_errno(status); } +#endif /* CONFIG_NFS_V3_ACL */ + + /* - * Decode setacl reply. + * We need to translate between nfs status return values and + * the local errno values which may not be the same. 
*/ -static int -nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) +static const struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS_OK, 0 }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, +#ifdef EWFLUSH + { NFSERR_WFLUSH, -EWFLUSH }, +#endif + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } +}; + +/** + * nfs3_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is used jointly by NFSv2 and NFSv3. 
+ */ +static int nfs3_stat_to_errno(enum nfs_stat status) { - int status = ntohl(*p++); + int i; - if (status) - return -nfs_stat_to_errno(status); - xdr_decode_post_op_attr(p, fattr); - return 0; + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == (int)status) + return nfs_errtbl[i].errno; + } + dprintk("NFS: Unrecognized nfs status value: %u\n", status); + return nfs_errtbl[i].errno; } -#endif /* CONFIG_NFS_V3_ACL */ + #define PROC(proc, argtype, restype, timer) \ [NFS3PROC_##proc] = { \ .p_proc = NFS3PROC_##proc, \ - .p_encode = (kxdrproc_t) nfs3_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs3_xdr_##restype, \ - .p_arglen = NFS3_##argtype##_sz, \ - .p_replen = NFS3_##restype##_sz, \ + .p_encode = (kxdreproc_t)nfs3_xdr_enc_##argtype##3args, \ + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_##restype##3res, \ + .p_arglen = NFS3_##argtype##args_sz, \ + .p_replen = NFS3_##restype##res_sz, \ .p_timer = timer, \ .p_statidx = NFS3PROC_##proc, \ .p_name = #proc, \ } struct rpc_procinfo nfs3_procedures[] = { - PROC(GETATTR, fhandle, attrstat, 1), - PROC(SETATTR, sattrargs, wccstat, 0), - PROC(LOOKUP, diropargs, lookupres, 2), - PROC(ACCESS, accessargs, accessres, 1), - PROC(READLINK, readlinkargs, readlinkres, 3), - PROC(READ, readargs, readres, 3), - PROC(WRITE, writeargs, writeres, 4), - PROC(CREATE, createargs, createres, 0), - PROC(MKDIR, mkdirargs, createres, 0), - PROC(SYMLINK, symlinkargs, createres, 0), - PROC(MKNOD, mknodargs, createres, 0), - PROC(REMOVE, removeargs, removeres, 0), - PROC(RMDIR, diropargs, wccstat, 0), - PROC(RENAME, renameargs, renameres, 0), - PROC(LINK, linkargs, linkres, 0), - PROC(READDIR, readdirargs, readdirres, 3), - PROC(READDIRPLUS, readdirargs, readdirres, 3), - PROC(FSSTAT, fhandle, fsstatres, 0), - PROC(FSINFO, fhandle, fsinfores, 0), - PROC(PATHCONF, fhandle, pathconfres, 0), - PROC(COMMIT, commitargs, commitres, 5), + PROC(GETATTR, getattr, getattr, 1), + PROC(SETATTR, setattr, setattr, 0), + PROC(LOOKUP, lookup, 
lookup, 2), + PROC(ACCESS, access, access, 1), + PROC(READLINK, readlink, readlink, 3), + PROC(READ, read, read, 3), + PROC(WRITE, write, write, 4), + PROC(CREATE, create, create, 0), + PROC(MKDIR, mkdir, create, 0), + PROC(SYMLINK, symlink, create, 0), + PROC(MKNOD, mknod, create, 0), + PROC(REMOVE, remove, remove, 0), + PROC(RMDIR, lookup, setattr, 0), + PROC(RENAME, rename, rename, 0), + PROC(LINK, link, link, 0), + PROC(READDIR, readdir, readdir, 3), + PROC(READDIRPLUS, readdirplus, readdir, 3), + PROC(FSSTAT, getattr, fsstat, 0), + PROC(FSINFO, getattr, fsinfo, 0), + PROC(PATHCONF, getattr, pathconf, 0), + PROC(COMMIT, commit, commit, 5), }; -struct rpc_version nfs_version3 = { +const struct rpc_version nfs_version3 = { .number = 3, .nrprocs = ARRAY_SIZE(nfs3_procedures), .procs = nfs3_procedures @@ -1174,8 +2529,8 @@ struct rpc_version nfs_version3 = { static struct rpc_procinfo nfs3_acl_procedures[] = { [ACLPROC3_GETACL] = { .p_proc = ACLPROC3_GETACL, - .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, - .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_getacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_getacl3res, .p_arglen = ACL3_getaclargs_sz, .p_replen = ACL3_getaclres_sz, .p_timer = 1, @@ -1183,8 +2538,8 @@ static struct rpc_procinfo nfs3_acl_procedures[] = { }, [ACLPROC3_SETACL] = { .p_proc = ACLPROC3_SETACL, - .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, - .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, + .p_encode = (kxdreproc_t)nfs3_xdr_enc_setacl3args, + .p_decode = (kxdrdproc_t)nfs3_xdr_dec_setacl3res, .p_arglen = ACL3_setaclargs_sz, .p_replen = ACL3_setaclres_sz, .p_timer = 0, @@ -1192,7 +2547,7 @@ static struct rpc_procinfo nfs3_acl_procedures[] = { }, }; -struct rpc_version nfsacl_version3 = { +const struct rpc_version nfsacl_version3 = { .number = 3, .nrprocs = sizeof(nfs3_acl_procedures)/ sizeof(nfs3_acl_procedures[0]), diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index bd1b9d663fb..ba2affa5194 100644 --- 
a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -9,59 +9,75 @@ #ifndef __LINUX_FS_NFS_NFS4_FS_H #define __LINUX_FS_NFS_NFS4_FS_H -#ifdef CONFIG_NFS_V4 +#if defined(CONFIG_NFS_V4_2) +#define NFS4_MAX_MINOR_VERSION 2 +#elif defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MINOR_VERSION 1 +#else +#define NFS4_MAX_MINOR_VERSION 0 +#endif -struct idmap; +#if IS_ENABLED(CONFIG_NFS_V4) -/* - * In a seqid-mutating op, this macro controls which error return - * values trigger incrementation of the seqid. - * - * from rfc 3010: - * The client MUST monotonically increment the sequence number for the - * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE - * operations. This is true even in the event that the previous - * operation that used the sequence number received an error. The only - * exception to this rule is if the previous operation received one of - * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, - * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, - * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. - * - */ -#define seqid_mutating_err(err) \ -(((err) != NFSERR_STALE_CLIENTID) && \ - ((err) != NFSERR_STALE_STATEID) && \ - ((err) != NFSERR_BAD_STATEID) && \ - ((err) != NFSERR_BAD_SEQID) && \ - ((err) != NFSERR_BAD_XDR) && \ - ((err) != NFSERR_RESOURCE) && \ - ((err) != NFSERR_NOFILEHANDLE)) +#define NFS4_MAX_LOOP_ON_RECOVER (10) + +#include <linux/seqlock.h> + +struct idmap; enum nfs4_client_state { - NFS4CLNT_STATE_RECOVER = 0, + NFS4CLNT_MANAGER_RUNNING = 0, + NFS4CLNT_CHECK_LEASE, NFS4CLNT_LEASE_EXPIRED, + NFS4CLNT_RECLAIM_REBOOT, + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, + NFS4CLNT_LEASE_CONFIRM, + NFS4CLNT_SERVER_SCOPE_MISMATCH, + NFS4CLNT_PURGE_STATE, + NFS4CLNT_BIND_CONN_TO_SESSION, + NFS4CLNT_MOVED, + NFS4CLNT_LEASE_MOVED, }; -/* - * struct rpc_sequence ensures that RPC calls are sent in the exact - * order that they appear on the list. 
- */ -struct rpc_sequence { - struct rpc_wait_queue wait; /* RPC call delay queue */ - spinlock_t lock; /* Protects the list */ - struct list_head list; /* Defines sequence of RPC calls */ +#define NFS4_RENEW_TIMEOUT 0x01 +#define NFS4_RENEW_DELEGATION_CB 0x02 + +struct nfs4_minor_version_ops { + u32 minor_version; + unsigned init_caps; + + int (*init_client)(struct nfs_client *); + void (*shutdown_client)(struct nfs_client *); + bool (*match_stateid)(const nfs4_stateid *, + const nfs4_stateid *); + int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); + int (*free_lock_state)(struct nfs_server *, + struct nfs4_lock_state *); + const struct rpc_call_ops *call_sync_ops; + const struct nfs4_state_recovery_ops *reboot_recovery_ops; + const struct nfs4_state_recovery_ops *nograce_recovery_ops; + const struct nfs4_state_maintenance_ops *state_renewal_ops; + const struct nfs4_mig_recovery_ops *mig_recovery_ops; }; #define NFS_SEQID_CONFIRMED 1 struct nfs_seqid_counter { - struct rpc_sequence *sequence; + ktime_t create_time; + int owner_id; int flags; u32 counter; + spinlock_t lock; /* Protects the list */ + struct list_head list; /* Defines sequence of RPC calls */ + struct rpc_wait_queue wait; /* RPC call delay queue */ }; struct nfs_seqid { struct nfs_seqid_counter *sequence; struct list_head list; + struct rpc_task *task; }; static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) @@ -70,32 +86,37 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status seqid->flags |= NFS_SEQID_CONFIRMED; } -struct nfs_unique_id { - struct rb_node rb_node; - __u64 id; -}; - /* * NFS4 state_owners and lock_owners are simply labels for ordered * sequences of RPC calls. Their sole purpose is to provide once-only * semantics by allowing the server to identify replayed requests. 
*/ struct nfs4_state_owner { - struct nfs_unique_id so_owner_id; - struct nfs_client *so_client; struct nfs_server *so_server; - struct rb_node so_client_node; + struct list_head so_lru; + unsigned long so_expires; + struct rb_node so_server_node; struct rpc_cred *so_cred; /* Associated cred */ spinlock_t so_lock; atomic_t so_count; + unsigned long so_flags; struct list_head so_states; - struct list_head so_delegations; struct nfs_seqid_counter so_seqid; - struct rpc_sequence so_sequence; + seqcount_t so_reclaim_seqcount; + struct mutex so_delegreturn_mutex; +}; + +enum { + NFS_OWNER_RECLAIM_REBOOT, + NFS_OWNER_RECLAIM_NOGRACE }; +#define NFS_LOCK_NEW 0 +#define NFS_LOCK_RECLAIM 1 +#define NFS_LOCK_EXPIRED 2 + /* * struct nfs4_state maintains the client-side state for a given * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). @@ -108,26 +129,41 @@ struct nfs4_state_owner { * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) */ +struct nfs4_lock_owner { + unsigned int lo_type; +#define NFS4_ANY_LOCK_TYPE (0U) +#define NFS4_FLOCK_LOCK_TYPE (1U << 0) +#define NFS4_POSIX_LOCK_TYPE (1U << 1) + union { + fl_owner_t posix_owner; + pid_t flock_owner; + } lo_u; +}; + struct nfs4_lock_state { struct list_head ls_locks; /* Other lock stateids */ struct nfs4_state * ls_state; /* Pointer to open state */ - fl_owner_t ls_owner; /* POSIX lock owner */ -#define NFS_LOCK_INITIALIZED 1 - int ls_flags; +#define NFS_LOCK_INITIALIZED 0 +#define NFS_LOCK_LOST 1 + unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; - struct rpc_sequence ls_sequence; - struct nfs_unique_id ls_id; nfs4_stateid ls_stateid; atomic_t ls_count; + struct nfs4_lock_owner ls_owner; }; /* bits for nfs4_state->flags */ enum { LK_STATE_IN_USE, NFS_DELEGATED_STATE, /* Current stateid is delegation */ + NFS_OPEN_STATE, /* OPEN stateid is set */ NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ NFS_O_RDWR_STATE, /* 
OPEN stateid has read/write state */ + NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ + NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ + NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ + NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ }; struct nfs4_state { @@ -149,7 +185,7 @@ struct nfs4_state { unsigned int n_rdonly; /* Number of read-only references */ unsigned int n_wronly; /* Number of write-only references */ unsigned int n_rdwr; /* Number of read/write references */ - int state; /* State on the server (R,W, or RW) */ + fmode_t state; /* State on the server (R,W, or RW) */ atomic_t count; }; @@ -157,43 +193,202 @@ struct nfs4_state { struct nfs4_exception { long timeout; int retry; + struct nfs4_state *state; + struct inode *inode; }; struct nfs4_state_recovery_ops { + int owner_flag_bit; + int state_flag_bit; int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); int (*recover_lock)(struct nfs4_state *, struct file_lock *); + int (*establish_clid)(struct nfs_client *, struct rpc_cred *); + int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *); + int (*detect_trunking)(struct nfs_client *, struct nfs_client **, + struct rpc_cred *); +}; + +struct nfs4_state_maintenance_ops { + int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned); + struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); + int (*renew_lease)(struct nfs_client *, struct rpc_cred *); }; -extern struct dentry_operations nfs4_dentry_operations; -extern const struct inode_operations nfs4_dir_inode_operations; +struct nfs4_mig_recovery_ops { + int (*get_locations)(struct inode *, struct nfs4_fs_locations *, + struct page *, struct rpc_cred *); + int (*fsid_present)(struct inode *, struct rpc_cred *); +}; + +extern const struct dentry_operations nfs4_dentry_operations; + +/* dir.c */ +int nfs_atomic_open(struct inode *, struct dentry *, struct file *, + unsigned, umode_t, 
int *); -/* inode.c */ -extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t); -extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int); -extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); +/* super.c */ +extern struct file_system_type nfs4_fs_type; +/* nfs4namespace.c */ +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); +struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations); /* nfs4proc.c */ -extern int nfs4_map_errors(int err); -extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); -extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); -extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); -extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); -extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); -extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); +extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); +extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); +extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); +extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred); +extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); +extern int nfs4_destroy_clientid(struct nfs_client *clp); +extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred 
*); +extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); -extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, - struct nfs4_fs_locations *fs_locations, struct page *page); +extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, + struct nfs4_fs_locations *, struct page *); +extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, + struct page *page, struct rpc_cred *); +extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); +extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, + struct nfs_fh *, struct nfs_fattr *); +extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); +extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode); + +#if defined(CONFIG_NFS_V4_1) +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return server->nfs_client->cl_session; +} + +extern int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + struct rpc_task *task); +extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); +extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); +extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, + bool sync); + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == + EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool 
+is_ds_client(struct nfs_client *clp) +{ + return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; +} + +static inline bool +_nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, + struct rpc_clnt **clntp, struct rpc_message *msg) +{ + struct rpc_cred *newcred = NULL; + rpc_authflavor_t flavor; + + if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { + spin_lock(&clp->cl_lock); + if (clp->cl_machine_cred != NULL) + /* don't call get_rpccred on the machine cred - + * a reference will be held for life of clp */ + newcred = clp->cl_machine_cred; + spin_unlock(&clp->cl_lock); + msg->rpc_cred = newcred; + + flavor = clp->cl_rpcclient->cl_auth->au_flavor; + WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I && + flavor != RPC_AUTH_GSS_KRB5P); + *clntp = clp->cl_rpcclient; + + return true; + } + return false; +} + +/* + * Function responsible for determining if an rpc_message should use the + * machine cred under SP4_MACH_CRED and if so switching the credential and + * authflavor (using the nfs_client's rpc_clnt which will be krb5i/p). + * Should be called before rpc_call_sync/rpc_call_async. + */ +static inline void +nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, + struct rpc_clnt **clntp, struct rpc_message *msg) +{ + _nfs4_state_protect(clp, sp4_mode, clntp, msg); +} + +/* + * Special wrapper to nfs4_state_protect for write. + * If WRITE can use machine cred but COMMIT cannot, make sure all writes + * that use machine cred use NFS_FILE_SYNC. 
+ */ +static inline void +nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, + struct rpc_message *msg, struct nfs_pgio_data *wdata) +{ + if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && + !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) + wdata->args.stable = NFS_FILE_SYNC; +} +#else /* CONFIG_NFS_v4_1 */ +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return NULL; +} + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return false; +} + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return false; +} -extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; -extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; +static inline void +nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, + struct rpc_clnt **clntp, struct rpc_message *msg) +{ +} + +static inline void +nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, + struct rpc_message *msg, struct nfs_pgio_data *wdata) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; -extern const u32 nfs4_fattr_bitmap[2]; -extern const u32 nfs4_statfs_bitmap[2]; -extern const u32 nfs4_pathconf_bitmap[2]; -extern const u32 nfs4_fsinfo_bitmap[2]; -extern const u32 nfs4_fs_locations_bitmap[2]; +extern const u32 nfs4_fattr_bitmap[3]; +extern const u32 nfs4_statfs_bitmap[3]; +extern const u32 nfs4_pathconf_bitmap[3]; +extern const u32 nfs4_fsinfo_bitmap[3]; +extern const u32 nfs4_fs_locations_bitmap[3]; + +void nfs40_shutdown_client(struct nfs_client *); +void nfs41_shutdown_client(struct nfs_client *); +int nfs40_init_client(struct nfs_client *); +int nfs41_init_client(struct nfs_client *); +void nfs4_free_client(struct nfs_client *); + +struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs_client *); 
@@ -202,42 +397,131 @@ extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); /* nfs4state.c */ -struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); +int nfs4_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **); +int nfs40_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **, struct rpc_cred *); +#if defined(CONFIG_NFS_V4_1) +int nfs41_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **, struct rpc_cred *); +extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); +extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); +extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp); + +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) +{ +} +#endif /* CONFIG_NFS_V4_1 */ -extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); +extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t); extern void nfs4_put_state_owner(struct nfs4_state_owner *); -extern void nfs4_drop_state_owner(struct nfs4_state_owner *); +extern void nfs4_purge_state_owners(struct nfs_server *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); -extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t); -extern void nfs4_close_sync(struct path *, struct nfs4_state *, mode_t); -extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t); -extern void nfs4_schedule_state_recovery(struct nfs_client *); +extern void nfs4_close_state(struct nfs4_state *, fmode_t); +extern void 
nfs4_close_sync(struct nfs4_state *, fmode_t); +extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); +extern void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); +extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); +extern int nfs4_wait_clnt_recover(struct nfs_client *clp); +extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); +extern void nfs4_schedule_state_manager(struct nfs_client *); +extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); +extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_migration_recovery(const struct nfs_server *); +extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_server_scope(struct nfs_client *, + struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); +extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, + fmode_t, const struct nfs_lockowner *); -extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); +extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); +extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); +extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); + extern const 
nfs4_stateid zero_stateid; +/* nfs4super.c */ +struct nfs_mount_info; +extern struct nfs_subversion nfs_v4; +struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); +extern bool nfs4_disable_idmapping; +extern unsigned short max_session_slots; +extern unsigned short send_implementation_id; +extern bool recover_lost_locks; + +#define NFS4_CLIENT_ID_UNIQ_LEN (64) +extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; + +/* nfs4sysctl.c */ +#ifdef CONFIG_SYSCTL +int nfs4_register_sysctl(void); +void nfs4_unregister_sysctl(void); +#else +static inline int nfs4_register_sysctl(void) +{ + return 0; +} + +static inline void nfs4_unregister_sysctl(void) +{ +} +#endif + /* nfs4xdr.c */ -extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; struct nfs4_mount_data; /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; +extern struct svc_version nfs4_callback_version4; + +static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return memcmp(dst, src, sizeof(*dst)) == 0; +} + +static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0; +} + +static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ + return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; +} + +static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) +{ + return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; +} #else -#define nfs4_close_state(a, b, c) do { } while (0) -#define nfs4_close_sync(a, b, c) do { } while (0) +#define nfs4_close_state(a, b) do { } while (0) +#define nfs4_close_sync(a, b) do { } while (0) +#define 
nfs4_state_protect(a, b, c, d) do { } while (0) +#define nfs4_state_protect_write(a, b, c, d) do { } while (0) #endif /* CONFIG_NFS_V4 */ #endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c new file mode 100644 index 00000000000..aa9ef487604 --- /dev/null +++ b/fs/nfs/nfs4client.c @@ -0,0 +1,1221 @@ +/* + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ +#include <linux/module.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs_mount.h> +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/auth.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/bc_xprt.h> +#include <linux/sunrpc/rpc_pipe_fs.h> +#include "internal.h" +#include "callback.h" +#include "delegation.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +/* + * Get a unique NFSv4.0 callback identifier which will be used + * by the V4.0 callback service to lookup the nfs_client struct + */ +static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) +{ + int ret = 0; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + if (clp->rpc_ops->version != 4 || minorversion != 0) + return ret; + idr_preload(GFP_KERNEL); + spin_lock(&nn->nfs_client_lock); + ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); + if (ret >= 0) + clp->cl_cb_ident = ret; + spin_unlock(&nn->nfs_client_lock); + idr_preload_end(); + return ret < 0 ? 
ret : 0; +} + +#ifdef CONFIG_NFS_V4_1 +/** + * Per auth flavor data server rpc clients + */ +struct nfs4_ds_server { + struct list_head list; /* ds_clp->cl_ds_clients */ + struct rpc_clnt *rpc_clnt; +}; + +/** + * Common lookup case for DS I/O + */ +static struct nfs4_ds_server * +nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor) +{ + struct nfs4_ds_server *dss; + + rcu_read_lock(); + list_for_each_entry_rcu(dss, &ds_clp->cl_ds_clients, list) { + if (dss->rpc_clnt->cl_auth->au_flavor != flavor) + continue; + goto out; + } + dss = NULL; +out: + rcu_read_unlock(); + return dss; +} + +static struct nfs4_ds_server * +nfs4_add_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor, + struct nfs4_ds_server *new) +{ + struct nfs4_ds_server *dss; + + spin_lock(&ds_clp->cl_lock); + list_for_each_entry(dss, &ds_clp->cl_ds_clients, list) { + if (dss->rpc_clnt->cl_auth->au_flavor != flavor) + continue; + goto out; + } + if (new) + list_add_rcu(&new->list, &ds_clp->cl_ds_clients); + dss = new; +out: + spin_unlock(&ds_clp->cl_lock); /* need some lock to protect list */ + return dss; +} + +static struct nfs4_ds_server * +nfs4_alloc_ds_server(struct nfs_client *ds_clp, rpc_authflavor_t flavor) +{ + struct nfs4_ds_server *dss; + + dss = kmalloc(sizeof(*dss), GFP_NOFS); + if (dss == NULL) + return ERR_PTR(-ENOMEM); + + dss->rpc_clnt = rpc_clone_client_set_auth(ds_clp->cl_rpcclient, flavor); + if (IS_ERR(dss->rpc_clnt)) { + int err = PTR_ERR(dss->rpc_clnt); + kfree (dss); + return ERR_PTR(err); + } + INIT_LIST_HEAD(&dss->list); + + return dss; +} + +static void +nfs4_free_ds_server(struct nfs4_ds_server *dss) +{ + rpc_release_client(dss->rpc_clnt); + kfree(dss); +} + +/** +* Find or create a DS rpc client with th MDS server rpc client auth flavor +* in the nfs_client cl_ds_clients list. 
+*/ +struct rpc_clnt * +nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode) +{ + struct nfs4_ds_server *dss, *new; + rpc_authflavor_t flavor = NFS_SERVER(inode)->client->cl_auth->au_flavor; + + dss = nfs4_find_ds_client(ds_clp, flavor); + if (dss != NULL) + goto out; + new = nfs4_alloc_ds_server(ds_clp, flavor); + if (IS_ERR(new)) + return ERR_CAST(new); + dss = nfs4_add_ds_client(ds_clp, flavor, new); + if (dss != new) + nfs4_free_ds_server(new); +out: + return dss->rpc_clnt; +} +EXPORT_SYMBOL_GPL(nfs4_find_or_create_ds_client); + +static void +nfs4_shutdown_ds_clients(struct nfs_client *clp) +{ + struct nfs4_ds_server *dss; + LIST_HEAD(shutdown_list); + + while (!list_empty(&clp->cl_ds_clients)) { + dss = list_entry(clp->cl_ds_clients.next, + struct nfs4_ds_server, list); + list_del(&dss->list); + rpc_shutdown_client(dss->rpc_clnt); + kfree (dss); + } +} + +void nfs41_shutdown_client(struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) { + nfs4_shutdown_ds_clients(clp); + nfs4_destroy_session(clp->cl_session); + nfs4_destroy_clientid(clp); + } + +} +#endif /* CONFIG_NFS_V4_1 */ + +void nfs40_shutdown_client(struct nfs_client *clp) +{ + if (clp->cl_slot_tbl) { + nfs4_shutdown_slot_table(clp->cl_slot_tbl); + kfree(clp->cl_slot_tbl); + } +} + +struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) +{ + int err; + struct nfs_client *clp = nfs_alloc_client(cl_init); + if (IS_ERR(clp)) + return clp; + + err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); + if (err) + goto error; + + if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) { + err = -EINVAL; + goto error; + } + + spin_lock_init(&clp->cl_lock); + INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); + INIT_LIST_HEAD(&clp->cl_ds_clients); + rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + 
clp->cl_mig_gen = 1; + return clp; + +error: + nfs_free_client(clp); + return ERR_PTR(err); +} + +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + clp->cl_mvops->shutdown_client(clp); + nfs4_destroy_callback(clp); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); + kfree(clp->cl_serverowner); + kfree(clp->cl_serverscope); + kfree(clp->cl_implid); +} + +void nfs4_free_client(struct nfs_client *clp) +{ + nfs4_shutdown_client(clp); + nfs_free_client(clp); +} + +/* + * Initialize the NFS4 callback service + */ +static int nfs4_init_callback(struct nfs_client *clp) +{ + int error; + + if (clp->rpc_ops->version == 4) { + struct rpc_xprt *xprt; + + xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); + + if (nfs4_has_session(clp)) { + error = xprt_setup_backchannel(xprt, + NFS41_BC_MIN_CALLBACKS); + if (error < 0) + return error; + } + + error = nfs_callback_up(clp->cl_mvops->minor_version, xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", + __func__, error); + return error; + } + __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); + } + return 0; +} + +/** + * nfs40_init_client - nfs_client initialization tasks for NFSv4.0 + * @clp - nfs_client to initialize + * + * Returns zero on success, or a negative errno if some error occurred. 
+ */ +int nfs40_init_client(struct nfs_client *clp) +{ + struct nfs4_slot_table *tbl; + int ret; + + tbl = kzalloc(sizeof(*tbl), GFP_NOFS); + if (tbl == NULL) + return -ENOMEM; + + ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE, + "NFSv4.0 transport Slot table"); + if (ret) { + kfree(tbl); + return ret; + } + + clp->cl_slot_tbl = tbl; + return 0; +} + +#if defined(CONFIG_NFS_V4_1) + +/** + * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+ + * @clp - nfs_client to initialize + * + * Returns zero on success, or a negative errno if some error occurred. + */ +int nfs41_init_client(struct nfs_client *clp) +{ + struct nfs4_session *session = NULL; + + /* + * Create the session and mark it expired. + * When a SEQUENCE operation encounters the expired session + * it will do session recovery to initialize it. + */ + session = nfs4_alloc_session(clp); + if (!session) + return -ENOMEM; + + clp->cl_session = session; + + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING); + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Initialize the minor version specific parts of an NFS4 client record + */ +static int nfs4_init_client_minor_version(struct nfs_client *clp) +{ + int ret; + + ret = clp->cl_mvops->init_client(clp); + if (ret) + return ret; + return nfs4_init_callback(clp); +} + +/** + * nfs4_init_client - Initialise an NFS4 client record + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: callback IP address in presentation format + * @authflavor: authentication flavor for underlying RPC transport + * + * Returns pointer to an NFS client, or an ERR_PTR value. 
+ */ +struct nfs_client *nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr) +{ + char buf[INET6_ADDRSTRLEN + 1]; + struct nfs_client *old; + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is initialised already */ + dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); + return clp; + } + + /* Check NFS protocol revision and initialize RPC op vector */ + clp->rpc_ops = &nfs_v4_clientops; + + if (clp->cl_minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); + __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + if (error == -EINVAL) + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + if (error < 0) + goto error; + + /* If no clientaddr= option was specified, find a usable cb address */ + if (ip_addr == NULL) { + struct sockaddr_storage cb_addr; + struct sockaddr *sap = (struct sockaddr *)&cb_addr; + + error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); + if (error < 0) + goto error; + error = rpc_ntop(sap, buf, sizeof(buf)); + if (error < 0) + goto error; + ip_addr = (const char *)buf; + } + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + error = nfs_idmap_new(clp); + if (error < 0) { + dprintk("%s: failed to create idmapper. 
Error = %d\n", + __func__, error); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + + error = nfs4_init_client_minor_version(clp); + if (error < 0) + goto error; + + if (!nfs4_has_session(clp)) + nfs_mark_client_ready(clp, NFS_CS_READY); + + error = nfs4_discover_server_trunking(clp, &old); + if (error < 0) + goto error; + + if (clp != old) + clp->cl_preserve_clid = true; + nfs_put_client(clp); + return old; + +error: + nfs_mark_client_ready(clp, error); + nfs_put_client(clp); + dprintk("<-- nfs4_init_client() = xerror %d\n", error); + return ERR_PTR(error); +} + +/* + * SETCLIENTID just did a callback update with the callback ident in + * "drop," but server trunking discovery claims "drop" and "keep" are + * actually the same server. Swap the callback IDs so that "keep" + * will continue to use the callback ident the server now knows about, + * and so that "keep"'s original callback ident is destroyed when + * "drop" is freed. + */ +static void nfs4_swap_callback_idents(struct nfs_client *keep, + struct nfs_client *drop) +{ + struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id); + unsigned int save = keep->cl_cb_ident; + + if (keep->cl_cb_ident == drop->cl_cb_ident) + return; + + dprintk("%s: keeping callback ident %u and dropping ident %u\n", + __func__, keep->cl_cb_ident, drop->cl_cb_ident); + + spin_lock(&nn->nfs_client_lock); + + idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident); + keep->cl_cb_ident = drop->cl_cb_ident; + + idr_replace(&nn->cb_ident_idr, drop, save); + drop->cl_cb_ident = save; + + spin_unlock(&nn->nfs_client_lock); +} + +/** + * nfs40_walk_client_list - Find server that recognizes a client ID + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." 
+ * + * NB: nfs40_walk_client_list() relies on the new nfs_client being + * the last nfs_client on the list. + */ +int nfs40_walk_client_list(struct nfs_client *new, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); + struct nfs_client *pos, *prev = NULL; + struct nfs4_setclientid_res clid = { + .clientid = new->cl_clientid, + .confirm = new->cl_confirm, + }; + int status = -NFS4ERR_STALE_CLIENTID; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos" */ + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + prev = pos; + + status = nfs_wait_client_init_complete(pos); + if (status < 0) + goto out; + status = -NFS4ERR_STALE_CLIENTID; + spin_lock(&nn->nfs_client_lock); + } + if (pos->cl_cons_state != NFS_CS_READY) + continue; + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + + if (pos->cl_clientid != new->cl_clientid) + continue; + + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + prev = pos; + + status = nfs4_proc_setclientid_confirm(pos, &clid, cred); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + break; + case 0: + nfs4_swap_callback_idents(pos, new); + + prev = NULL; + *result = pos; + dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", + __func__, pos, atomic_read(&pos->cl_count)); + goto out; + case -ERESTARTSYS: + case -ETIMEDOUT: + /* The callback path may have been inadvertently + * changed. Schedule recovery! + */ + nfs4_schedule_path_down_recovery(pos); + default: + goto out; + } + + spin_lock(&nn->nfs_client_lock); + } + spin_unlock(&nn->nfs_client_lock); + + /* No match found. 
The server lost our clientid */ +out: + if (prev) + nfs_put_client(prev); + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Returns true if the client IDs match + */ +static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +{ + if (a->cl_clientid != b->cl_clientid) { + dprintk("NFS: --> %s client ID %llx does not match %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return false; + } + dprintk("NFS: --> %s client ID %llx matches %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return true; +} + +/* + * Returns true if the server owners match + */ +static bool +nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b) +{ + struct nfs41_server_owner *o1 = a->cl_serverowner; + struct nfs41_server_owner *o2 = b->cl_serverowner; + + if (o1->minor_id != o2->minor_id) { + dprintk("NFS: --> %s server owner minor IDs do not match\n", + __func__); + return false; + } + + if (o1->major_id_sz != o2->major_id_sz) + goto out_major_mismatch; + if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) + goto out_major_mismatch; + + dprintk("NFS: --> %s server owners match\n", __func__); + return true; + +out_major_mismatch: + dprintk("NFS: --> %s server owner major IDs do not match\n", + __func__); + return false; +} + +/** + * nfs41_walk_client_list - Find nfs_client that matches a client/server owner + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." + * + * NB: nfs41_walk_client_list() relies on the new nfs_client being + * the last nfs_client on the list. 
+ */ +int nfs41_walk_client_list(struct nfs_client *new, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); + struct nfs_client *pos, *prev = NULL; + int status = -NFS4ERR_STALE_CLIENTID; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos", especially the client + * ID and serverowner fields. Wait for CREATE_SESSION + * to finish. */ + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + prev = pos; + + status = nfs_wait_client_init_complete(pos); + if (status == 0) { + nfs4_schedule_lease_recovery(pos); + status = nfs4_wait_clnt_recover(pos); + } + spin_lock(&nn->nfs_client_lock); + if (status < 0) + break; + status = -NFS4ERR_STALE_CLIENTID; + } + if (pos->cl_cons_state != NFS_CS_READY) + continue; + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + + if (!nfs4_match_clientids(pos, new)) + continue; + + if (!nfs4_match_serverowners(pos, new)) + continue; + + atomic_inc(&pos->cl_count); + *result = pos; + status = 0; + dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", + __func__, pos, atomic_read(&pos->cl_count)); + break; + } + + /* No matching nfs_client found. 
*/ + spin_unlock(&nn->nfs_client_lock); + dprintk("NFS: <-- %s status = %d\n", __func__, status); + if (prev) + nfs_put_client(prev); + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + +static void nfs4_destroy_server(struct nfs_server *server) +{ + nfs_server_return_all_delegations(server); + unset_pnfs_layoutdriver(server); + nfs4_purge_state_owners(server); +} + +/* + * NFSv4.0 callback thread helper + * + * Find a client by callback identifier + */ +struct nfs_client * +nfs4_find_client_ident(struct net *net, int cb_ident) +{ + struct nfs_client *clp; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + spin_lock(&nn->nfs_client_lock); + clp = idr_find(&nn->cb_ident_idr, cb_ident); + if (clp) + atomic_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + return clp; +} + +#if defined(CONFIG_NFS_V4_1) +/* Common match routine for v4.0 and v4.1 callback services */ +static bool nfs4_cb_match_client(const struct sockaddr *addr, + struct nfs_client *clp, u32 minorversion) +{ + struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; + + /* Don't match clients that failed to initialise */ + if (!(clp->cl_cons_state == NFS_CS_READY || + clp->cl_cons_state == NFS_CS_SESSION_INITING)) + return false; + + smp_rmb(); + + /* Match the version and minorversion */ + if (clp->rpc_ops->version != 4 || + clp->cl_minorversion != minorversion) + return false; + + /* Match only the IP address, not the port number */ + if (!nfs_sockaddr_match_ipaddr(addr, clap)) + return false; + + return true; +} + +/* + * NFSv4.1 callback thread helper + * For CB_COMPOUND calls, find a client by IP address, protocol version, + * minorversion, and sessionID + * + * Returns NULL if no such client + */ +struct nfs_client * +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, + struct nfs4_sessionid *sid, u32 minorversion) +{ + struct nfs_client *clp; + struct nfs_net *nn = net_generic(net, nfs_net_id); + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry(clp, 
&nn->nfs_client_list, cl_share_link) { + if (nfs4_cb_match_client(addr, clp, minorversion) == false) + continue; + + if (!nfs4_has_session(clp)) + continue; + + /* Match sessionid*/ + if (memcmp(clp->cl_session->sess_id.data, + sid->data, NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + atomic_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + return clp; + } + spin_unlock(&nn->nfs_client_lock); + return NULL; +} + +#else /* CONFIG_NFS_V4_1 */ + +struct nfs_client * +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, + struct nfs4_sessionid *sid, u32 minorversion) +{ + return NULL; +} +#endif /* CONFIG_NFS_V4_1 */ + +/* + * Set up an NFS4 client + */ +static int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, + const char *ip_addr, + rpc_authflavor_t authflavour, + int proto, const struct rpc_timeout *timeparms, + u32 minorversion, struct net *net) +{ + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = addr, + .addrlen = addrlen, + .nfs_mod = &nfs_v4, + .proto = proto, + .minorversion = minorversion, + .net = net, + }; + struct nfs_client *clp; + int error; + + dprintk("--> nfs4_set_client()\n"); + + if (server->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (server->options & NFS_OPTION_MIGRATION) + set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); + if (IS_ERR(clp)) { + error = PTR_ERR(clp); + goto error; + } + + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. 
+ */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); + + server->nfs_client = clp; + dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); + return 0; +error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; +} + +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .nfs_mod = &nfs_v4, + .proto = ds_proto, + .minorversion = mds_clp->cl_minorversion, + .net = mds_clp->cl_net, + }; + struct rpc_timeout ds_timeout; + struct nfs_client *clp; + + /* + * Set an authflavor equual to the MDS value. Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + mds_clp->cl_rpcclient->cl_auth->au_flavor); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL_GPL(nfs4_set_ds_client); + +/* + * Session has been established, and the client marked ready. + * Set the mount rsize and wsize with negotiated fore channel + * attributes which will be bound checked in nfs_server_set_fsinfo. 
+ */ +static void nfs4_session_set_rwsize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_1 + struct nfs4_session *sess; + u32 server_resp_sz; + u32 server_rqst_sz; + + if (!nfs4_has_session(server->nfs_client)) + return; + sess = server->nfs_client->cl_session; + server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; + server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; + + if (server->rsize > server_resp_sz) + server->rsize = server_resp_sz; + if (server->wsize > server_rqst_sz) + server->wsize = server_rqst_sz; +#endif /* CONFIG_NFS_V4_1 */ +} + +static int nfs4_server_common_setup(struct nfs_server *server, + struct nfs_fh *mntfh, bool auth_probe) +{ + struct nfs_fattr *fattr; + int error; + + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* We must ensure the session is initialised first */ + error = nfs4_init_session(server->nfs_client); + if (error < 0) + goto out; + + /* Set the basic capabilities */ + server->caps |= server->nfs_client->cl_mvops->init_caps; + if (server->flags & NFS_MOUNT_NORDIRPLUS) + server->caps &= ~NFS_CAP_READDIRPLUS; + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. 
+ */ + if (nfs4_disable_idmapping && + server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + + /* Probe the root fh to retrieve its FSID and filehandle */ + error = nfs4_get_rootfh(server, mntfh, auth_probe); + if (error < 0) + goto out; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); + + nfs4_session_set_rwsize(server); + + error = nfs_probe_fsinfo(server, mntfh, fattr); + if (error < 0) + goto out; + + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) + server->namelen = NFS4_MAXNAMLEN; + + nfs_server_insert_lists(server); + server->mount_time = jiffies; + server->destroy = nfs4_destroy_server; +out: + nfs_free_fattr(fattr); + return error; +} + +/* + * Create a version 4 volume record + */ +static int nfs4_init_server(struct nfs_server *server, + struct nfs_parsed_mount_data *data) +{ + struct rpc_timeout timeparms; + int error; + + dprintk("--> nfs4_init_server()\n"); + + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->options = data->options; + server->auth_info = data->auth_info; + + /* Use the first specified auth flavor. 
If this flavor isn't + * allowed by the server, use the SECINFO path to try the + * other specified flavors */ + if (data->auth_info.flavor_len >= 1) + data->selected_flavor = data->auth_info.flavors[0]; + else + data->selected_flavor = RPC_AUTH_UNIX; + + /* Get a client record */ + error = nfs4_set_client(server, + data->nfs_server.hostname, + (const struct sockaddr *)&data->nfs_server.address, + data->nfs_server.addrlen, + data->client_address, + data->selected_flavor, + data->nfs_server.protocol, + &timeparms, + data->minorversion, + data->net); + if (error < 0) + goto error; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + server->port = data->nfs_server.port; + + error = nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); + +error: + /* Done */ + dprintk("<-- nfs4_init_server() = %d\n", error); + return error; +} + +/* + * Create a version 4 volume record + * - keyed on server and FSID + */ +/*struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, + struct nfs_fh *mntfh)*/ +struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server; + bool auth_probe; + int error; + + dprintk("--> nfs4_create_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + auth_probe = mount_info->parsed->auth_info.flavor_len < 1; + + /* set up the general RPC client */ + error = nfs4_init_server(server, mount_info->parsed); + if (error < 0) + goto error; + + error = nfs4_server_common_setup(server, mount_info->mntfh, auth_probe); + if (error < 0) + goto error; + + dprintk("<-- nfs4_create_server() = %p\n", server); + return server; + +error: + 
nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 referral server record + */ +struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, + struct nfs_fh *mntfh) +{ + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; + bool auth_probe; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + + /* Initialise the client representation from the parent server */ + nfs_server_copy_userdata(server, parent_server); + + /* Get a client representation. + * Note: NFSv4 always uses TCP, */ + error = nfs4_set_client(server, data->hostname, + data->addr, + data->addrlen, + parent_client->cl_ipaddr, + data->authflavor, + rpc_protocol(parent_server->client), + parent_server->client->cl_timeout, + parent_client->cl_mvops->minor_version, + parent_client->cl_net); + if (error < 0) + goto error; + + error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); + if (error < 0) + goto error; + + auth_probe = parent_server->auth_info.flavor_len < 1; + + error = nfs4_server_common_setup(server, mntfh, auth_probe); + if (error < 0) + goto error; + + dprintk("<-- nfs_create_referral_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Grab the destination's particulars, including lease expiry time. + * + * Returns zero if probe succeeded and retrieved FSID matches the FSID + * we have cached. 
+ */ +static int nfs_probe_destination(struct nfs_server *server) +{ + struct inode *inode = server->super->s_root->d_inode; + struct nfs_fattr *fattr; + int error; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* Sanity: the probe won't work if the destination server + * does not recognize the migrated FH. */ + error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr); + + nfs_free_fattr(fattr); + return error; +} + +/** + * nfs4_update_server - Move an nfs_server to a different nfs_client + * + * @server: represents FSID to be moved + * @hostname: new end-point's hostname + * @sap: new end-point's socket address + * @salen: size of "sap" + * @net: net namespace + * + * The nfs_server must be quiescent before this function is invoked. + * Either its session is drained (NFSv4.1+), or its transport is + * plugged and drained (NFSv4.0). + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_update_server(struct nfs_server *server, const char *hostname, + struct sockaddr *sap, size_t salen, struct net *net) +{ + struct nfs_client *clp = server->nfs_client; + struct rpc_clnt *clnt = server->client; + struct xprt_create xargs = { + .ident = clp->cl_proto, + .net = net, + .dstaddr = sap, + .addrlen = salen, + .servername = hostname, + }; + char buf[INET6_ADDRSTRLEN + 1]; + struct sockaddr_storage address; + struct sockaddr *localaddr = (struct sockaddr *)&address; + int error; + + dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + hostname); + + error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); + if (error != 0) { + dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", + __func__, error); + goto out; + } + + error = rpc_localaddr(clnt, localaddr, sizeof(address)); + if (error != 0) { + dprintk("<-- %s(): rpc_localaddr returned %d\n", + __func__, error); + goto out; + } + + error = -EAFNOSUPPORT; + if 
(rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { + dprintk("<-- %s(): rpc_ntop returned %d\n", + __func__, error); + goto out; + } + + nfs_server_remove_lists(server); + error = nfs4_set_client(server, hostname, sap, salen, buf, + clp->cl_rpcclient->cl_auth->au_flavor, + clp->cl_proto, clnt->cl_timeout, + clp->cl_minorversion, net); + nfs_put_client(clp); + if (error != 0) { + nfs_server_insert_lists(server); + dprintk("<-- %s(): nfs4_set_client returned %d\n", + __func__, error); + goto out; + } + + if (server->nfs_client->cl_hostname == NULL) + server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + nfs_server_insert_lists(server); + + error = nfs_probe_destination(server); + if (error < 0) + goto out; + + dprintk("<-- %s() succeeded\n", __func__); + +out: + return error; +} diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c new file mode 100644 index 00000000000..a816f0627a6 --- /dev/null +++ b/fs/nfs/nfs4file.c @@ -0,0 +1,135 @@ +/* + * linux/fs/nfs/file.c + * + * Copyright (C) 1992 Rick Sladkey + */ +#include <linux/nfs_fs.h> +#include "internal.h" +#include "fscache.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_FILE + +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + struct nfs_open_context *ctx; + struct dentry *dentry = filp->f_path.dentry; + struct dentry *parent = NULL; + struct inode *dir; + unsigned openflags = filp->f_flags; + struct iattr attr; + int opened = 0; + int err; + + /* + * If no cached dentry exists or if it's negative, NFSv4 handled the + * opens in ->lookup() or ->create(). + * + * We only get this far for a cached positive dentry. We skipped + * revalidation, so handle it here by dropping the dentry and returning + * -EOPENSTALE. The VFS will retry the lookup/create/open. 
+ */ + + dprintk("NFS: open file(%pd2)\n", dentry); + + if ((openflags & O_ACCMODE) == 3) + openflags--; + + /* We can't create new files here */ + openflags &= ~(O_CREAT|O_EXCL); + + parent = dget_parent(dentry); + dir = parent->d_inode; + + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + err = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + + attr.ia_valid = ATTR_OPEN; + if (openflags & O_TRUNC) { + attr.ia_valid |= ATTR_SIZE; + attr.ia_size = 0; + nfs_wb_all(inode); + } + + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + switch (err) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + if (inode != dentry->d_inode) + goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_file_set_open_context(filp, ctx); + nfs_fscache_open_file(inode, filp); + err = 0; + +out_put_ctx: + put_nfs_open_context(ctx); +out: + dput(parent); + return err; + +out_drop: + d_drop(dentry); + err = -EOPENSTALE; + goto out_put_ctx; +} + +static int +nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + int ret; + struct inode *inode = file_inode(file); + + do { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret != 0) + break; + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); + if (!ret) + ret = pnfs_layoutcommit_inode(inode, true); + mutex_unlock(&inode->i_mutex); + /* + * If nfs_file_fsync_commit detected a server reboot, then + * resend all dirty pages that might have been covered by + * the NFS_CONTEXT_RESEND_WRITES flag + */ + start = 0; + end = LLONG_MAX; + } while (ret == -EAGAIN); + + return ret; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = nfs_file_read, + .write_iter = 
nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs4_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = iter_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c new file mode 100644 index 00000000000..c0b3a16b4a0 --- /dev/null +++ b/fs/nfs/nfs4getroot.c @@ -0,0 +1,50 @@ +/* +* Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. +* Written by David Howells (dhowells@redhat.com) +*/ + +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) +{ + struct nfs_fsinfo fsinfo; + int ret = -ENOMEM; + + dprintk("--> nfs4_get_rootfh()\n"); + + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) + goto out; + + /* Start by getting the root filehandle from the server */ + ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe); + if (ret < 0) { + dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); + goto out; + } + + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) + || !S_ISDIR(fsinfo.fattr->mode)) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot encountered non-directory\n"); + ret = -ENOTDIR; + goto out; + } + + if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_rootfh:" + " getroot obtained referral\n"); + ret = -EREMOTE; + goto out; + } + + memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +out: + nfs_free_fattr(fsinfo.fattr); + dprintk("<-- nfs4_get_rootfh() = %d\n", ret); + return ret; +} diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 5f9ba41ed5b..3d83cb1fdc7 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -11,17 +11,23 @@ #include <linux/mount.h> #include <linux/namei.h> #include 
<linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/slab.h> #include <linux/string.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/vfs.h> #include <linux/inet.h> #include "internal.h" #include "nfs4_fs.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS /* - * Check if fs_root is valid + * Convert the NFSv4 pathname components into a standard posix path. + * + * Note that the resulting string will be placed at the end of the buffer */ static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, char *buffer, ssize_t buflen) @@ -48,35 +54,56 @@ Elong: } /* - * Determine the mount path as a string + * return the path component of "<server>:<path>" + * nfspath - the "<server>:<path>" string + * end - one past the last char that could contain "<server>:" + * returns NULL on failure */ -static char *nfs4_path(const struct vfsmount *mnt_parent, - const struct dentry *dentry, - char *buffer, ssize_t buflen) +static char *nfs_path_component(const char *nfspath, const char *end) { - const char *srvpath; - - srvpath = strchr(mnt_parent->mnt_devname, ':'); - if (srvpath) - srvpath++; - else - srvpath = mnt_parent->mnt_devname; + char *p; + + if (*nfspath == '[') { + /* parse [] escaped IPv6 addrs */ + p = strchr(nfspath, ']'); + if (p != NULL && ++p < end && *p == ':') + return p + 1; + } else { + /* otherwise split on first colon */ + p = strchr(nfspath, ':'); + if (p != NULL && p < end) + return p + 1; + } + return NULL; +} - return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen); +/* + * Determine the mount path as a string + */ +static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) +{ + char *limit; + char *path = nfs_path(&limit, dentry, buffer, buflen, + NFS_PATH_CANONICAL); + if (!IS_ERR(path)) { + char *path_component = nfs_path_component(path, limit); + if (path_component) + return path_component; + } + return path; } /* * Check that fs_locations::fs_root 
[RFC3530 6.3] is a prefix for what we * believe to be the server path to this dentry */ -static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, - const struct dentry *dentry, +static int nfs4_validate_fspath(struct dentry *dentry, const struct nfs4_fs_locations *locations, char *page, char *page2) { const char *path, *fs_path; - path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); + path = nfs4_path(dentry, page, PAGE_SIZE); if (IS_ERR(path)) return PTR_ERR(path); @@ -86,56 +113,202 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, if (strncmp(path, fs_path, strlen(fs_path)) != 0) { dprintk("%s: path %s does not begin with fsroot %s\n", - __FUNCTION__, path, fs_path); + __func__, path, fs_path); return -ENOENT; } return 0; } -/* - * Check if the string represents a "valid" IPv4 address +static size_t nfs_parse_server_name(char *string, size_t len, + struct sockaddr *sa, size_t salen, struct net *net) +{ + ssize_t ret; + + ret = rpc_pton(net, string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(net, string, len, sa, salen); + if (ret < 0) + ret = 0; + } + return ret; +} + +/** + * nfs_find_best_sec - Find a security mechanism supported locally + * @server: NFS server struct + * @flavors: List of security tuples returned by SECINFO procedure + * + * Return an rpc client that uses the first security mechanism in + * "flavors" that is locally supported. The "flavors" array + * is searched in the order returned from the server, per RFC 3530 + * recommendation and each flavor is checked for membership in the + * sec= mount option list if it exists. + * + * Return -EPERM if no matching flavor is found in the array. + * + * Please call rpc_shutdown_client() when you are done with this rpc client. 
+ * */ -static inline int valid_ipaddr4(const char *buf) +static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, + struct nfs_server *server, + struct nfs4_secinfo_flavors *flavors) { - int rc, count, in[4]; - - rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); - if (rc != 4) - return -EINVAL; - for (count = 0; count < 4; count++) { - if (in[count] > 255) - return -EINVAL; + rpc_authflavor_t pflavor; + struct nfs4_secinfo4 *secinfo; + unsigned int i; + + for (i = 0; i < flavors->num_flavors; i++) { + secinfo = &flavors->flavors[i]; + + switch (secinfo->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + case RPC_AUTH_GSS: + pflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + &secinfo->flavor_info); + /* does the pseudoflavor match a sec= mount opt? */ + if (pflavor != RPC_AUTH_MAXFLAVOR && + nfs_auth_info_match(&server->auth_info, pflavor)) { + struct rpc_clnt *new; + struct rpc_cred *cred; + + /* Cloning creates an rpc_auth for the flavor */ + new = rpc_clone_client_set_auth(clnt, pflavor); + if (IS_ERR(new)) + continue; + /** + * Check that the user actually can use the + * flavor. This is mostly for RPC_AUTH_GSS + * where cr_init obtains a gss context + */ + cred = rpcauth_lookupcred(new->cl_auth, 0); + if (IS_ERR(cred)) { + rpc_shutdown_client(new); + continue; + } + put_rpccred(cred); + return new; + } + } } - return 0; + return ERR_PTR(-EPERM); +} + +/** + * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup, + * return an rpc_clnt that uses the best available security flavor with + * respect to the secinfo flavor list and the sec= mount options. + * + * @clnt: RPC client to clone + * @inode: directory inode + * @name: lookup name + * + * Please call rpc_shutdown_client() when you are done with this rpc client. 
+ */ +struct rpc_clnt * +nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, + struct qstr *name) +{ + struct page *page; + struct nfs4_secinfo_flavors *flavors; + struct rpc_clnt *new; + int err; + + page = alloc_page(GFP_KERNEL); + if (!page) + return ERR_PTR(-ENOMEM); + + flavors = page_address(page); + + err = nfs4_proc_secinfo(inode, name, flavors); + if (err < 0) { + new = ERR_PTR(err); + goto out; + } + + new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors); + +out: + put_page(page); + return new; +} + +static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client); + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + unsigned int maxbuflen; + unsigned int s; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return ERR_CAST(mnt_path); + mountdata->mnt_path = mnt_path; + maxbuflen = mnt_path - 1 - page2; + + mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); + if (mountdata->addr == NULL) + return ERR_PTR(-ENOMEM); + + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + + if (buf->len <= 0 || buf->len >= maxbuflen) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + mountdata->addr, addr_bufsize, net); + if (mountdata->addrlen == 0) + continue; + + rpc_set_port(mountdata->addr, NFS_PORT); + + memcpy(page2, buf->data, buf->len); + page2[buf->len] = '\0'; + mountdata->hostname = page2; + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; + } + kfree(mountdata->addr); + return mnt; } /** * 
nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @mnt_parent - mountpoint of parent directory * @dentry - parent directory * @locations - array of NFSv4 server location information * */ -static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, - const struct dentry *dentry, +static struct vfsmount *nfs_follow_referral(struct dentry *dentry, const struct nfs4_fs_locations *locations) { struct vfsmount *mnt = ERR_PTR(-ENOENT); struct nfs_clone_mount mountdata = { - .sb = mnt_parent->mnt_sb, + .sb = dentry->d_sb, .dentry = dentry, - .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, + .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor, }; char *page = NULL, *page2 = NULL; - unsigned int s; int loc, error; if (locations == NULL || locations->nlocations <= 0) goto out; - dprintk("%s: referral at %s/%s\n", __FUNCTION__, - dentry->d_parent->d_name.name, dentry->d_name.name); + dprintk("%s: referral at %pd2\n", __func__, dentry); page = (char *) __get_free_page(GFP_USER); if (!page) @@ -146,75 +319,37 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, goto out; /* Ensure fs path is a prefix of current dentry path */ - error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); + error = nfs4_validate_fspath(dentry, locations, page, page2); if (error < 0) { mnt = ERR_PTR(error); goto out; } - loc = 0; - while (loc < locations->nlocations && IS_ERR(mnt)) { + for (loc = 0; loc < locations->nlocations; loc++) { const struct nfs4_fs_location *location = &locations->locations[loc]; - char *mnt_path; if (location == NULL || location->nservers <= 0 || - location->rootpath.ncomponents == 0) { - loc++; - continue; - } - - mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); - if (IS_ERR(mnt_path)) { - loc++; + location->rootpath.ncomponents == 0) continue; - } - mountdata.mnt_path = mnt_path; - - s = 0; - while (s < 
location->nservers) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = htons(NFS_PORT), - }; - - if (location->servers[s].len <= 0 || - valid_ipaddr4(location->servers[s].data) < 0) { - s++; - continue; - } - - mountdata.hostname = location->servers[s].data; - addr.sin_addr.s_addr = in_aton(mountdata.hostname), - mountdata.addr = (struct sockaddr *)&addr; - mountdata.addrlen = sizeof(addr); - snprintf(page, PAGE_SIZE, "%s:%s", - mountdata.hostname, - mountdata.mnt_path); - - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata); - if (!IS_ERR(mnt)) { - break; - } - s++; - } - loc++; + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; } out: free_page((unsigned long) page); free_page((unsigned long) page2); - dprintk("%s: done\n", __FUNCTION__); + dprintk("%s: done\n", __func__); return mnt; } /* * nfs_do_refmount - handle crossing a referral on server * @dentry - dentry of referral - * @nd - nameidata info * */ -struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry) { struct vfsmount *mnt = ERR_PTR(-ENOMEM); struct dentry *parent; @@ -223,7 +358,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr int err; /* BUG_ON(IS_ROOT(dentry)); */ - dprintk("%s: enter\n", __FUNCTION__); + dprintk("%s: enter\n", __func__); page = alloc_page(GFP_KERNEL); if (page == NULL) @@ -237,21 +372,152 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr mnt = ERR_PTR(-ENOENT); parent = dget_parent(dentry); - dprintk("%s: getting locations for %s/%s\n", - __FUNCTION__, parent->d_name.name, dentry->d_name.name); + dprintk("%s: getting locations for %pd2\n", + __func__, dentry); - err = nfs4_proc_fs_locations(parent->d_inode, &dentry->d_name, fs_locations, page); + err = nfs4_proc_fs_locations(client, parent->d_inode, &dentry->d_name, 
fs_locations, page); dput(parent); if (err != 0 || fs_locations->nlocations <= 0 || fs_locations->fs_path.ncomponents <= 0) goto out_free; - mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); + mnt = nfs_follow_referral(dentry, fs_locations); out_free: __free_page(page); kfree(fs_locations); out: - dprintk("%s: done\n", __FUNCTION__); + dprintk("%s: done\n", __func__); return mnt; } + +struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; + struct dentry *parent = dget_parent(dentry); + struct inode *dir = parent->d_inode; + struct qstr *name = &dentry->d_name; + struct rpc_clnt *client; + struct vfsmount *mnt; + + /* Look it up again to get its attributes and sec flavor */ + client = nfs4_proc_lookup_mountpoint(dir, name, fh, fattr); + dput(parent); + if (IS_ERR(client)) + return ERR_CAST(client); + + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + mnt = nfs_do_refmount(client, dentry); + goto out; + } + + if (client->cl_auth->au_flavor != flavor) + flavor = client->cl_auth->au_flavor; + mnt = nfs_do_submount(dentry, fh, fattr, flavor); +out: + rpc_shutdown_client(client); + return mnt; +} + +/* + * Try one location from the fs_locations array. + * + * Returns zero on success, or a negative errno value. 
+ */ +static int nfs4_try_replacing_one_location(struct nfs_server *server, + char *page, char *page2, + const struct nfs4_fs_location *location) +{ + const size_t addr_bufsize = sizeof(struct sockaddr_storage); + struct net *net = rpc_net_ns(server->client); + struct sockaddr *sap; + unsigned int s; + size_t salen; + int error; + + sap = kmalloc(addr_bufsize, GFP_KERNEL); + if (sap == NULL) + return -ENOMEM; + + error = -ENOENT; + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + char *hostname; + + if (buf->len <= 0 || buf->len > PAGE_SIZE) + continue; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL) + continue; + + salen = nfs_parse_server_name(buf->data, buf->len, + sap, addr_bufsize, net); + if (salen == 0) + continue; + rpc_set_port(sap, NFS_PORT); + + error = -ENOMEM; + hostname = kstrndup(buf->data, buf->len, GFP_KERNEL); + if (hostname == NULL) + break; + + error = nfs4_update_server(server, hostname, sap, salen, net); + kfree(hostname); + if (error == 0) + break; + } + + kfree(sap); + return error; +} + +/** + * nfs4_replace_transport - set up transport to destination server + * + * @server: export being migrated + * @locations: fs_locations array + * + * Returns zero on success, or a negative errno value. + * + * The client tries all the entries in the "locations" array, in the + * order returned by the server, until one works or the end of the + * array is reached. 
+ */ +int nfs4_replace_transport(struct nfs_server *server, + const struct nfs4_fs_locations *locations) +{ + char *page = NULL, *page2 = NULL; + int loc, error; + + error = -ENOENT; + if (locations == NULL || locations->nlocations <= 0) + goto out; + + error = -ENOMEM; + page = (char *) __get_free_page(GFP_USER); + if (!page) + goto out; + page2 = (char *) __get_free_page(GFP_USER); + if (!page2) + goto out; + + for (loc = 0; loc < locations->nlocations; loc++) { + const struct nfs4_fs_location *location = + &locations->locations[loc]; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) + continue; + + error = nfs4_try_replacing_one_location(server, page, + page2, location); + if (error == 0) + break; + } + +out: + free_page((unsigned long)page); + free_page((unsigned long)page2); + return error; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7ce07862c2f..4bf3d97cc5a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -36,22 +36,37 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/string.h> +#include <linux/ratelimit.h> +#include <linux/printk.h> +#include <linux/slab.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> -#include <linux/smp_lock.h> +#include <linux/nfs_mount.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/module.h> +#include <linux/nfs_idmap.h> +#include <linux/xattr.h> +#include <linux/utsname.h> +#include <linux/freezer.h> #include "nfs4_fs.h" #include "delegation.h" +#include "internal.h" #include "iostat.h" +#include "callback.h" +#include "pnfs.h" +#include "netns.h" +#include "nfs4session.h" +#include "fscache.h" + +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -60,28 +75,105 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); +static int 
_nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); -static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); -static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp); -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); +static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel); +#ifdef CONFIG_NFS_V4_1 +static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, + struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, + struct rpc_cred *); +#endif + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *label) +{ + int err; + + if (label == NULL) + return NULL; + + if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) + return NULL; + + err = security_dentry_init_security(dentry, sattr->ia_mode, + &dentry->d_name, (void **)&label->label, &label->len); + if (err == 0) + return label; + + return NULL; +} +static 
inline void +nfs4_label_release_security(struct nfs4_label *label) +{ + if (label) + security_release_secctx(label->label, label->len); +} +static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ + if (label) + return server->attr_bitmask; + + return server->attr_bitmask_nl; +} +#else +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *l) +{ return NULL; } +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ return; } +static inline u32 * +nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ return server->attr_bitmask; } +#endif /* Prevent leaks of NFSv4 errors into userland */ -int nfs4_map_errors(int err) +static int nfs4_map_errors(int err) { - if (err < -1000) { + if (err >= -1000) + return err; + switch (err) { + case -NFS4ERR_RESOURCE: + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: + return -EREMOTEIO; + case -NFS4ERR_WRONGSEC: + case -NFS4ERR_WRONG_CRED: + return -EPERM; + case -NFS4ERR_BADOWNER: + case -NFS4ERR_BADNAME: + return -EINVAL; + case -NFS4ERR_SHARE_DENIED: + return -EACCES; + case -NFS4ERR_MINOR_VERS_MISMATCH: + return -EPROTONOSUPPORT; + case -NFS4ERR_ACCESS: + return -EACCES; + case -NFS4ERR_FILE_OPEN: + return -EBUSY; + default: dprintk("%s could not handle NFSv4 error %d\n", - __FUNCTION__, -err); - return -EIO; + __func__, -err); + break; } - return err; + return -EIO; } /* * This is our standard bitmap for GETATTR requests. 
*/ -const u32 nfs4_fattr_bitmap[2] = { +const u32 nfs4_fattr_bitmap[3] = { FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE @@ -95,10 +187,37 @@ const u32 nfs4_fattr_bitmap[2] = { | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA - | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_TIME_MODIFY, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + FATTR4_WORD2_SECURITY_LABEL +#endif }; -const u32 nfs4_statfs_bitmap[2] = { +static const u32 nfs4_pnfs_open_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY, + FATTR4_WORD2_MDSTHRESHOLD +}; + +static const u32 nfs4_open_noattr_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_FILEID, +}; + +const u32 nfs4_statfs_bitmap[3] = { FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL, @@ -107,20 +226,22 @@ const u32 nfs4_statfs_bitmap[2] = { | FATTR4_WORD1_SPACE_TOTAL }; -const u32 nfs4_pathconf_bitmap[2] = { +const u32 nfs4_pathconf_bitmap[3] = { FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME, 0 }; -const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE +const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_LEASE_TIME, - 0 + FATTR4_WORD1_TIME_DELTA + | FATTR4_WORD1_FS_LAYOUT_TYPES, + FATTR4_WORD2_LAYOUT_BLKSIZE }; -const u32 nfs4_fs_locations_bitmap[2] = { +const u32 nfs4_fs_locations_bitmap[3] = { FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE @@ -136,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = { | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY - | FATTR4_WORD1_MOUNTED_ON_FILEID + | FATTR4_WORD1_MOUNTED_ON_FILEID, }; 
static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, @@ -144,7 +265,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent { __be32 *start, *p; - BUG_ON(readdir->count < 80); if (cookie > 2) { readdir->cookie = cookie; memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); @@ -163,7 +283,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent * when talking to the server, we always send cookie 0 * instead of 1 or 2. */ - start = p = kmap_atomic(*readdir->pages, KM_USER0); + start = p = kmap_atomic(*readdir->pages); if (cookie == 0) { *p++ = xdr_one; /* next */ @@ -191,27 +311,592 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; - kunmap_atomic(start, KM_USER0); + kunmap_atomic(start); } -static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) +{ + int res = 0; + + might_sleep(); + + if (*timeout <= 0) + *timeout = NFS4_POLL_RETRY_MIN; + if (*timeout > NFS4_POLL_RETRY_MAX) + *timeout = NFS4_POLL_RETRY_MAX; + freezable_schedule_timeout_killable_unsafe(*timeout); + if (fatal_signal_pending(current)) + res = -ERESTARTSYS; + *timeout <<= 1; + return res; +} + +/* This is the error handling routine for processes that are allowed + * to sleep. 
+ */ +static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) { struct nfs_client *clp = server->nfs_client; + struct nfs4_state *state = exception->state; + struct inode *inode = exception->inode; + int ret = errorcode; + + exception->retry = 0; + switch(errorcode) { + case 0: + return 0; + case -NFS4ERR_OPENMODE: + if (inode && nfs4_have_delegation(inode, FMODE_READ)) { + nfs4_inode_return_delegation(inode); + exception->retry = 1; + return 0; + } + if (state == NULL) + break; + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + goto wait_on_recovery; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) { + nfs_remove_bad_delegation(inode); + exception->retry = 1; + break; + } + if (state == NULL) + break; + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) { + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + } + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; + case -NFS4ERR_MOVED: + ret = nfs4_schedule_migration_recovery(server); + if (ret < 0) + break; + goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; +#if defined(CONFIG_NFS_V4_1) + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR: %d Reset session\n", __func__, + errorcode); + nfs4_schedule_session_recovery(clp->cl_session, errorcode); + goto wait_on_recovery; +#endif /* defined(CONFIG_NFS_V4_1) */ + case -NFS4ERR_FILE_OPEN: + if (exception->timeout > HZ) { + /* We have 
retried a decent amount, time to + * fail + */ + ret = -EBUSY; + break; + } + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + ret = nfs4_delay(server->client, &exception->timeout); + if (ret != 0) + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_OLD_STATEID: + exception->retry = 1; + break; + case -NFS4ERR_BADOWNER: + /* The following works around a Linux server bug! */ + case -NFS4ERR_BADNAME: + if (server->caps & NFS_CAP_UIDGID_NOMAP) { + server->caps &= ~NFS_CAP_UIDGID_NOMAP; + exception->retry = 1; + printk(KERN_WARNING "NFS: v4 server %s " + "does not accept raw " + "uid/gids. " + "Reenabling the idmapper.\n", + server->nfs_client->cl_hostname); + } + } + /* We failed to handle the error */ + return nfs4_map_errors(ret); +wait_on_recovery: + ret = nfs4_wait_clnt_recover(clp); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -EIO; + if (ret == 0) + exception->retry = 1; + return ret; +} + +/* + * Return 'true' if 'clp' is using an rpc_client that is integrity protected + * or 'false' otherwise. 
+ */ +static bool _nfs4_is_integrity_protected(struct nfs_client *clp) +{ + rpc_authflavor_t flavor = clp->cl_rpcclient->cl_auth->au_flavor; + + if (flavor == RPC_AUTH_GSS_KRB5I || + flavor == RPC_AUTH_GSS_KRB5P) + return true; + + return false; +} + +static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) +{ spin_lock(&clp->cl_lock); if (time_before(clp->cl_last_renewal,timestamp)) clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); } +static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +{ + do_renew_lease(server->nfs_client, timestamp); +} + +struct nfs4_call_sync_data { + const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; +}; + +static void nfs4_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) +{ + args->sa_slot = NULL; + args->sa_cache_this = cache_reply; + args->sa_privileged = 0; + + res->sr_slot = NULL; +} + +static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) +{ + args->sa_privileged = 1; +} + +static int nfs40_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl; + struct nfs4_slot *slot; + + /* slot already allocated? 
*/ + if (res->sr_slot != NULL) + goto out_start; + + spin_lock(&tbl->slot_tbl_lock); + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); + + args->sa_slot = slot; + res->sr_slot = slot; + +out_start: + rpc_call_start(task); + return 0; + +out_sleep: + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; +} + +static int nfs40_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + struct nfs4_slot *slot = res->sr_slot; + struct nfs4_slot_table *tbl; + + if (slot == NULL) + goto out; + + tbl = slot->table; + spin_lock(&tbl->slot_tbl_lock); + if (!nfs41_wake_and_assign_slot(tbl, slot)) + nfs4_free_slot(tbl, slot); + spin_unlock(&tbl->slot_tbl_lock); + + res->sr_slot = NULL; +out: + return 1; +} + +#if defined(CONFIG_NFS_V4_1) + +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +{ + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot = res->sr_slot; + bool send_new_highest_used_slotid = false; + + tbl = slot->table; + session = tbl->session; + + spin_lock(&tbl->slot_tbl_lock); + /* Be nice to the server: try to ensure that the last transmitted + * value for highest_user_slotid <= target_highest_slotid + */ + if (tbl->highest_used_slotid > tbl->target_highest_slotid) + send_new_highest_used_slotid = true; + + if (nfs41_wake_and_assign_slot(tbl, slot)) { + send_new_highest_used_slotid = false; + goto out_unlock; + } + nfs4_free_slot(tbl, slot); + + if (tbl->highest_used_slotid != NFS4_NO_SLOT) + send_new_highest_used_slotid = false; +out_unlock: + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slot = NULL; + if 
(send_new_highest_used_slotid) + nfs41_server_notify_highest_slotid_update(session->clp); +} + +int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) +{ + struct nfs4_session *session; + struct nfs4_slot *slot = res->sr_slot; + struct nfs_client *clp; + bool interrupted = false; + int ret = 1; + + if (slot == NULL) + goto out_noaction; + /* don't increment the sequence number if the task wasn't sent */ + if (!RPC_WAS_SENT(task)) + goto out; + + session = slot->table->session; + + if (slot->interrupted) { + slot->interrupted = 0; + interrupted = true; + } + + trace_nfs4_sequence_done(session, res); + /* Check the SEQUENCE operation status */ + switch (res->sr_status) { + case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + clp = session->clp; + do_renew_lease(clp, res->sr_timestamp); + /* Check sequence flags */ + if (res->sr_status_flags != 0) + nfs4_schedule_lease_recovery(clp); + nfs41_update_target_slotid(slot->table, slot, res); + break; + case 1: + /* + * sr_status remains 1 if an RPC level error occurred. + * The server may or may not have processed the sequence + * operation.. + * Mark the slot as having hosted an interrupted RPC call. + */ + slot->interrupted = 1; + goto out; + case -NFS4ERR_DELAY: + /* The server detected a resend of the RPC call and + * returned NFS4ERR_DELAY as per Section 2.10.6.2 + * of RFC5661. + */ + dprintk("%s: slot=%u seq=%u: Operation in progress\n", + __func__, + slot->slot_nr, + slot->seq_nr); + goto out_retry; + case -NFS4ERR_BADSLOT: + /* + * The slot id we used was probably retired. Try again + * using a different slot id. + */ + goto retry_nowait; + case -NFS4ERR_SEQ_MISORDERED: + /* + * Was the last operation on this sequence interrupted? + * If so, retry after bumping the sequence number. + */ + if (interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } + /* + * Could this slot have been previously retired? 
+ * If so, then the server may be expecting seq_nr = 1! + */ + if (slot->seq_nr != 1) { + slot->seq_nr = 1; + goto retry_nowait; + } + break; + case -NFS4ERR_SEQ_FALSE_RETRY: + ++slot->seq_nr; + goto retry_nowait; + default: + /* Just update the slot sequence no. */ + ++slot->seq_nr; + } +out: + /* The session may be reset by one of the error handlers. */ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); + nfs41_sequence_free_slot(res); +out_noaction: + return ret; +retry_nowait: + if (rpc_restart_call_prepare(task)) { + task->tk_status = 0; + ret = 0; + } + goto out; +out_retry: + if (!rpc_restart_call(task)) + goto out; + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return 0; +} +EXPORT_SYMBOL_GPL(nfs41_sequence_done); + +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_slot == NULL) + return 1; + if (!res->sr_slot->table->session) + return nfs40_sequence_done(task, res); + return nfs41_sequence_done(task, res); +} + +int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + struct nfs4_slot *slot; + struct nfs4_slot_table *tbl; + + dprintk("--> %s\n", __func__); + /* slot already allocated? 
*/ + if (res->sr_slot != NULL) + goto out_success; + + tbl = &session->fc_slot_table; + + task->tk_timeout = 0; + + spin_lock(&tbl->slot_tbl_lock); + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && + !args->sa_privileged) { + /* The state manager will wait until the slot table is empty */ + dprintk("%s session is draining\n", __func__); + goto out_sleep; + } + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* If out of memory, try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + dprintk("<-- %s: no free slots\n", __func__); + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); + + args->sa_slot = slot; + + dprintk("<-- %s slotid=%u seqid=%u\n", __func__, + slot->slot_nr, slot->seq_nr); + + res->sr_slot = slot; + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. + */ + res->sr_status = 1; + trace_nfs4_setup_sequence(session, args); +out_success: + rpc_call_start(task); + return 0; +out_sleep: + /* Privileged tasks are queued with top priority */ + if (args->sa_privileged) + rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, + NULL, RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(nfs41_setup_sequence); + +static int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + + if (!session) + return nfs40_setup_sequence(server, args, res, task); + + dprintk("--> %s clp %p session %p sr_slot %u\n", + __func__, session->clp, session, res->sr_slot ? 
+ res->sr_slot->slot_nr : NFS4_NO_SLOT); + + ret = nfs41_setup_sequence(session, args, res, task); + + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + struct nfs4_session *session = nfs4_get_session(data->seq_server); + + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + + nfs41_setup_sequence(session, data->seq_args, data->seq_res, task); +} + +static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + + nfs41_sequence_done(task, data->seq_res); +} + +static const struct rpc_call_ops nfs41_call_sync_ops = { + .rpc_call_prepare = nfs41_call_sync_prepare, + .rpc_call_done = nfs41_call_sync_done, +}; + +#else /* !CONFIG_NFS_V4_1 */ + +static int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + return nfs40_setup_sequence(server, args, res, task); +} + +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + return nfs40_sequence_done(task, res); +} + +#endif /* !CONFIG_NFS_V4_1 */ + +static void nfs40_call_sync_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + nfs4_setup_sequence(data->seq_server, + data->seq_args, data->seq_res, task); +} + +static void nfs40_call_sync_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_call_sync_data *data = calldata; + nfs4_sequence_done(task, data->seq_res); +} + +static const struct rpc_call_ops nfs40_call_sync_ops = { + .rpc_call_prepare = nfs40_call_sync_prepare, + .rpc_call_done = nfs40_call_sync_done, +}; + +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res) +{ + int ret; + 
struct rpc_task *task; + struct nfs_client *clp = server->nfs_client; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = args, + .seq_res = res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = msg, + .callback_ops = clp->cl_mvops->call_sync_ops, + .callback_data = &data + }; + + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else { + ret = task->tk_status; + rpc_put_task(task); + } + return ret; +} + +static +int nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + nfs4_init_sequence(args, res, cache_reply); + return nfs4_call_sync_sequence(clnt, server, msg, args, res); +} + static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { struct nfs_inode *nfsi = NFS_I(dir); spin_lock(&dir->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; - if (!cinfo->atomic || cinfo->before != nfsi->change_attr) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); - nfsi->change_attr = cinfo->after; + dir->i_version = cinfo->after; + nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); } @@ -221,64 +906,134 @@ struct nfs4_opendata { struct nfs_openres o_res; struct nfs_open_confirmargs c_arg; struct nfs_open_confirmres c_res; + struct nfs4_string owner_name; + struct nfs4_string group_name; struct nfs_fattr f_attr; - struct nfs_fattr dir_attr; - struct path path; + struct nfs4_label *f_label; struct dentry *dir; + struct dentry *dentry; struct nfs4_state_owner *owner; struct nfs4_state *state; struct iattr attrs; unsigned long timestamp; unsigned int rpc_done : 1; + unsigned int file_created : 1; + unsigned int is_recover : 1; int rpc_status; int cancelled; }; +static bool 
nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + if (err != -EINVAL) + return false; + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + return false; + server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1; + exception->retry = 1; + return true; +} + +static enum open_claim_type4 +nfs4_map_atomic_open_claim(struct nfs_server *server, + enum open_claim_type4 claim) +{ + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + return claim; + switch (claim) { + default: + return claim; + case NFS4_OPEN_CLAIM_FH: + return NFS4_OPEN_CLAIM_NULL; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + return NFS4_OPEN_CLAIM_DELEGATE_CUR; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + return NFS4_OPEN_CLAIM_DELEGATE_PREV; + } +} static void nfs4_init_opendata_res(struct nfs4_opendata *p) { p->o_res.f_attr = &p->f_attr; - p->o_res.dir_attr = &p->dir_attr; + p->o_res.f_label = p->f_label; + p->o_res.seqid = p->o_arg.seqid; + p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; + p->o_res.access_request = p->o_arg.access; nfs_fattr_init(&p->f_attr); - nfs_fattr_init(&p->dir_attr); + nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); } -static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, - struct nfs4_state_owner *sp, int flags, - const struct iattr *attrs) +static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, + struct nfs4_state_owner *sp, fmode_t fmode, int flags, + const struct iattr *attrs, + struct nfs4_label *label, + enum open_claim_type4 claim, + gfp_t gfp_mask) { - struct dentry *parent = dget_parent(path->dentry); + struct dentry *parent = dget_parent(dentry); struct inode *dir = parent->d_inode; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *p; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), gfp_mask); if (p == NULL) goto err; - p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + + p->f_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->f_label)) + 
goto err_free_p; + + p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); if (p->o_arg.seqid == NULL) - goto err_free; - p->path.mnt = mntget(path->mnt); - p->path.dentry = dget(path->dentry); + goto err_free_label; + nfs_sb_active(dentry->d_sb); + p->dentry = dget(dentry); p->dir = parent; p->owner = sp; atomic_inc(&sp->so_count); - p->o_arg.fh = NFS_FH(dir); - p->o_arg.open_flags = flags, + p->o_arg.open_flags = flags; + p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS + * will return permission denied for all bits until close */ + if (!(flags & O_EXCL)) { + /* ask server to check for all possible rights as results + * are cached */ + p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; + } p->o_arg.clientid = server->nfs_client->cl_clientid; - p->o_arg.id = sp->so_owner_id.id; - p->o_arg.name = &p->path.dentry->d_name; + p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); + p->o_arg.id.uniquifier = sp->so_seqid.owner_id; + p->o_arg.name = &dentry->d_name; p->o_arg.server = server; - p->o_arg.bitmask = server->attr_bitmask; - p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; - if (flags & O_EXCL) { - u32 *s = (u32 *) p->o_arg.u.verifier.data; - s[0] = jiffies; - s[1] = current->pid; - } else if (flags & O_CREAT) { + p->o_arg.bitmask = nfs4_bitmask(server, label); + p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; + p->o_arg.label = label; + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); + switch (p->o_arg.claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + p->o_arg.fh = NFS_FH(dir); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + p->o_arg.fh = NFS_FH(dentry->d_inode); + } + if (attrs != NULL && attrs->ia_valid != 0) { + __u32 verf[2]; + p->o_arg.u.attrs = &p->attrs; 
memcpy(&p->attrs, attrs, sizeof(p->attrs)); + + verf[0] = jiffies; + verf[1] = current->pid; + memcpy(p->o_arg.u.verifier.data, verf, + sizeof(p->o_arg.u.verifier.data)); } p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; @@ -286,7 +1041,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, nfs4_init_opendata_res(p); kref_init(&p->kref); return p; -err_free: + +err_free_label: + nfs4_label_free(p->f_label); +err_free_p: kfree(p); err: dput(parent); @@ -297,14 +1055,20 @@ static void nfs4_opendata_free(struct kref *kref) { struct nfs4_opendata *p = container_of(kref, struct nfs4_opendata, kref); + struct super_block *sb = p->dentry->d_sb; nfs_free_seqid(p->o_arg.seqid); if (p->state != NULL) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); + + nfs4_label_free(p->f_label); + dput(p->dir); - dput(p->path.dentry); - mntput(p->path.mnt); + dput(p->dentry); + nfs_sb_deactive(sb); + nfs_fattr_free_names(&p->f_attr); + kfree(p->f_attr.mdsthreshold); kfree(p); } @@ -322,34 +1086,46 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) return ret; } -static int can_open_cached(struct nfs4_state *state, int mode) +static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) { int ret = 0; - switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) { + + if (open_mode & (O_EXCL|O_TRUNC)) + goto out; + switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: - ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 + && state->n_rdonly != 0; break; case FMODE_WRITE: - ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 + && state->n_wronly != 0; break; case FMODE_READ|FMODE_WRITE: - ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; + ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 + && state->n_rdwr != 0; } +out: return ret; } -static int can_open_delegated(struct 
nfs_delegation *delegation, mode_t open_flags) +static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) { - if ((delegation->type & open_flags) != open_flags) + if (delegation == NULL) + return 0; + if ((delegation->type & fmode) != fmode) return 0; - if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) return 0; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return 0; + nfs_mark_delegation_referenced(delegation); return 1; } -static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) +static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) { - switch (open_flags) { + switch (fmode) { case FMODE_WRITE: state->n_wronly++; break; @@ -359,15 +1135,75 @@ static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) case FMODE_READ|FMODE_WRITE: state->n_rdwr++; } - nfs4_state_set_mode_locked(state, state->state | open_flags); + nfs4_state_set_mode_locked(state, state->state | fmode); +} + +static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) +{ + struct nfs_client *clp = state->owner->so_server->nfs_client; + bool need_recover = false; + + if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly) + need_recover = true; + if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly) + need_recover = true; + if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr) + need_recover = true; + if (need_recover) + nfs4_state_mark_reclaim_nograce(clp, state); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) +static bool nfs_need_update_open_stateid(struct nfs4_state *state, + nfs4_stateid *stateid) { + if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) + return true; + if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs_test_and_clear_all_open_stateid(state); + 
return true; + } + if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + return true; + return false; +} + +static void nfs_clear_open_stateid_locked(struct nfs4_state *state, + nfs4_stateid *stateid, fmode_t fmode) +{ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_WRITE: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_READ: + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case 0: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } + if (stateid == NULL) + return; + if (!nfs_need_update_open_stateid(state, stateid)) + return; if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) - memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); - memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); - switch (open_flags) { + nfs4_stateid_copy(&state->stateid, stateid); + nfs4_stateid_copy(&state->open_stateid, stateid); +} + +static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + write_seqlock(&state->seqlock); + nfs_clear_open_stateid_locked(state, stateid, fmode); + write_sequnlock(&state->seqlock); + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); break; @@ -377,47 +1213,87 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * case FMODE_READ|FMODE_WRITE: set_bit(NFS_O_RDWR_STATE, &state->flags); } + if (!nfs_need_update_open_stateid(state, stateid)) + return; + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + nfs4_stateid_copy(&state->stateid, stateid); + 
nfs4_stateid_copy(&state->open_stateid, stateid); } -static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) -{ - write_seqlock(&state->seqlock); - nfs_set_open_stateid_locked(state, stateid, open_flags); - write_sequnlock(&state->seqlock); -} - -static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags) +static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) { - open_flags &= (FMODE_READ|FMODE_WRITE); /* * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update */ write_seqlock(&state->seqlock); if (deleg_stateid != NULL) { - memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); + nfs4_stateid_copy(&state->stateid, deleg_stateid); set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, open_flags); + nfs_set_open_stateid_locked(state, open_stateid, fmode); write_sequnlock(&state->seqlock); spin_lock(&state->owner->so_lock); - update_open_stateflags(state, open_flags); + update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } -static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags) +static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *deleg_cur; + int ret = 0; + + fmode &= (FMODE_READ|FMODE_WRITE); + + rcu_read_lock(); + deleg_cur = rcu_dereference(nfsi->delegation); + if (deleg_cur == NULL) + goto no_delegation; + + spin_lock(&deleg_cur->lock); + if (rcu_dereference(nfsi->delegation) != deleg_cur || + test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || + (deleg_cur->type & fmode) != fmode) + goto no_delegation_unlock; + + if (delegation == NULL) + 
delegation = &deleg_cur->stateid; + else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation)) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + ret = 1; +no_delegation_unlock: + spin_unlock(&deleg_cur->lock); +no_delegation: + rcu_read_unlock(); + + if (!ret && open_stateid != NULL) { + __update_open_stateid(state, open_stateid, NULL, fmode); + ret = 1; + } + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); + + return ret; +} + + +static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode) { struct nfs_delegation *delegation; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation == NULL || (delegation->type & open_flags) == open_flags) { + if (delegation == NULL || (delegation->type & fmode) == fmode) { rcu_read_unlock(); return; } rcu_read_unlock(); - nfs_inode_return_delegation(inode); + nfs4_inode_return_delegation(inode); } static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) @@ -425,49 +1301,42 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; - int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL); + int open_mode = opendata->o_arg.open_flags; + fmode_t fmode = opendata->o_arg.fmode; nfs4_stateid stateid; int ret = -EAGAIN; - rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); for (;;) { - if (can_open_cached(state, open_mode)) { + if (can_open_cached(state, fmode, open_mode)) { spin_lock(&state->owner->so_lock); - if (can_open_cached(state, open_mode)) { - update_open_stateflags(state, open_mode); + if (can_open_cached(state, fmode, open_mode)) { + update_open_stateflags(state, fmode); 
spin_unlock(&state->owner->so_lock); - rcu_read_unlock(); goto out_return_state; } spin_unlock(&state->owner->so_lock); } - if (delegation == NULL) - break; - if (!can_open_delegated(delegation, open_mode)) + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (!can_open_delegated(delegation, fmode)) { + rcu_read_unlock(); break; + } /* Save the delegation */ - memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); + nfs4_stateid_copy(&stateid, &delegation->stateid); rcu_read_unlock(); - lock_kernel(); - ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); - unlock_kernel(); - if (ret != 0) - goto out; + nfs_release_seqid(opendata->o_arg.seqid); + if (!opendata->is_recover) { + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) + goto out; + } ret = -EAGAIN; - rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); - /* If no delegation, try a cached open */ - if (delegation == NULL) - continue; - /* Is the delegation still valid? 
*/ - if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0) - continue; - rcu_read_unlock(); - update_open_stateid(state, NULL, &stateid, open_mode); - goto out_return_state; + + /* Try to update the stateid using the delegation */ + if (update_open_stateid(state, NULL, &stateid, fmode)) + goto out_return_state; } - rcu_read_unlock(); out: return ERR_PTR(ret); out_return_state: @@ -475,12 +1344,75 @@ out_return_state: return state; } -static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +static void +nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) +{ + struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client; + struct nfs_delegation *delegation; + int delegation_flags = 0; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation) + delegation_flags = delegation->flags; + rcu_read_unlock(); + if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { + pr_err_ratelimited("NFS: Broken NFSv4 server %s is " + "returning a delegation for " + "OPEN(CLAIM_DELEGATE_CUR)\n", + clp->cl_hostname); + } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) + nfs_inode_set_delegation(state->inode, + data->owner->so_cred, + &data->o_res); + else + nfs_inode_reclaim_delegation(state->inode, + data->owner->so_cred, + &data->o_res); +} + +/* + * Check the inode attributes against the CLAIM_PREVIOUS returned attributes + * and update the nfs4_state. 
+ */ +static struct nfs4_state * +_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode = data->state->inode; + struct nfs4_state *state = data->state; + int ret; + + if (!data->rpc_done) { + if (data->rpc_status) { + ret = data->rpc_status; + goto err; + } + /* cached opens have already been processed */ + goto update; + } + + ret = nfs_refresh_inode(inode, &data->f_attr); + if (ret) + goto err; + + if (data->o_res.delegation_type != 0) + nfs4_opendata_check_deleg(data, state); +update: + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); + atomic_inc(&state->count); + + return state; +err: + return ERR_PTR(ret); + +} + +static struct nfs4_state * +_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) { struct inode *inode; struct nfs4_state *state = NULL; - struct nfs_delegation *delegation; - nfs4_stateid *deleg_stateid = NULL; int ret; if (!data->rpc_done) { @@ -491,7 +1423,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data ret = -EAGAIN; if (!(data->f_attr.valid & NFS_ATTR_FATTR)) goto err; - inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label); ret = PTR_ERR(inode); if (IS_ERR(inode)) goto err; @@ -499,31 +1431,13 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data state = nfs4_get_open_state(inode, data->owner); if (state == NULL) goto err_put_inode; - if (data->o_res.delegation_type != 0) { - int delegation_flags = 0; - - rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation) - delegation_flags = delegation->flags; - rcu_read_unlock(); - if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) - nfs_inode_set_delegation(state->inode, - data->owner->so_cred, - &data->o_res); - else - nfs_inode_reclaim_delegation(state->inode, - data->owner->so_cred, - &data->o_res); - } - rcu_read_lock(); - 
delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) - deleg_stateid = &delegation->stateid; - update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags); - rcu_read_unlock(); + if (data->o_res.delegation_type != 0) + nfs4_opendata_check_deleg(data, state); + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); iput(inode); out: + nfs_release_seqid(data->o_arg.seqid); return state; err_put_inode: iput(inode); @@ -531,6 +1445,14 @@ err: return ERR_PTR(ret); } +static struct nfs4_state * +nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) + return _nfs4_opendata_reclaim_to_nfs4_state(data); + return _nfs4_opendata_to_nfs4_state(data); +} + static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) { struct nfs_inode *nfsi = NFS_I(state->inode); @@ -548,11 +1470,13 @@ static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state * return ERR_PTR(-ENOENT); } -static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state) +static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, + struct nfs4_state *state, enum open_claim_type4 claim) { struct nfs4_opendata *opendata; - opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL); + opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, + NULL, NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -560,22 +1484,23 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context return opendata; } -static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res) +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res) { struct nfs4_state *newstate; int ret; - opendata->o_arg.open_flags 
= openflags; + opendata->o_arg.open_flags = 0; + opendata->o_arg.fmode = fmode; memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); - ret = _nfs4_proc_open(opendata); + ret = _nfs4_recover_proc_open(opendata); if (ret != 0) return ret; newstate = nfs4_opendata_to_nfs4_state(opendata); if (IS_ERR(newstate)) return PTR_ERR(newstate); - nfs4_close_state(&opendata->path, newstate, openflags); + nfs4_close_state(newstate, fmode); *res = newstate; return 0; } @@ -585,8 +1510,13 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * struct nfs4_state *newstate; int ret; + /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); /* memory barrier prior to reading state->n_* */ clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); @@ -614,10 +1544,10 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * * Check if we need to update the current stateid. 
*/ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && - memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { + !nfs4_stateid_match(&state->stateid, &state->open_stateid)) { write_seqlock(&state->seqlock); if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) - memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); write_sequnlock(&state->seqlock); } return 0; @@ -631,17 +1561,16 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state { struct nfs_delegation *delegation; struct nfs4_opendata *opendata; - int delegation_type = 0; + fmode_t delegation_type = 0; int status; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_PREVIOUS); if (IS_ERR(opendata)) return PTR_ERR(opendata); - opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; - opendata->o_arg.fh = NFS_FH(state->inode); rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); - if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0) + if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) delegation_type = delegation->type; rcu_read_unlock(); opendata->o_arg.u.delegation_type = delegation_type; @@ -657,6 +1586,9 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); + trace_nfs4_open_reclaim(ctx, 0, err); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -671,65 +1603,101 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) - return PTR_ERR(ctx); + return -EAGAIN; ret = nfs4_do_open_reclaim(ctx, state); 
put_nfs_open_context(ctx); return ret; } -static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) { + switch (err) { + default: + printk(KERN_ERR "NFS: %s: unhandled error " + "%d.\n", __func__, err); + case 0: + case -ENOENT: + case -ESTALE: + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + return -EAGAIN; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: + /* Don't recall a delegation if it was lost */ + nfs4_schedule_lease_recovery(server->nfs_client); + return -EAGAIN; + case -NFS4ERR_MOVED: + nfs4_schedule_migration_recovery(server); + return -EAGAIN; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(server->nfs_client); + return -EAGAIN; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + nfs_inode_find_state_and_recover(state->inode, + stateid); + nfs4_schedule_stateid_recovery(server, state); + return 0; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + set_bit(NFS_DELEGATED_STATE, &state->flags); + ssleep(1); + return -EAGAIN; + case -ENOMEM: + case -NFS4ERR_DENIED: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + return 0; + } + return err; +} + +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +{ + struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_opendata *opendata; - int ret; + int err; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = 
nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_DELEG_CUR_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); - opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; - memcpy(opendata->o_arg.u.delegation.data, stateid->data, - sizeof(opendata->o_arg.u.delegation.data)); - ret = nfs4_open_recover(opendata, state); + nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); + err = nfs4_open_recover(opendata, state); nfs4_opendata_put(opendata); - return ret; + return nfs4_handle_delegation_recall_error(server, state, stateid, err); } -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { - struct nfs4_exception exception = { }; - struct nfs_server *server = NFS_SERVER(state->inode); - int err; - do { - err = _nfs4_open_delegation_recall(ctx, state, stateid); - switch (err) { - case 0: - return err; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - /* Don't recall a delegation if it was lost */ - nfs4_schedule_state_recovery(server->nfs_client); - return err; - } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + struct nfs4_opendata *data = calldata; + + nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, + &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; + nfs40_sequence_done(task, &data->c_res.seq_res); + data->rpc_status = task->tk_status; - if (RPC_ASSASSINATED(task)) - return; if (data->rpc_status == 0) { - memcpy(data->o_res.stateid.data, data->c_res.stateid.data, - sizeof(data->o_res.stateid.data)); + nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid); nfs_confirm_seqid(&data->owner->so_seqid, 0); renew_lease(data->o_res.server, data->timestamp); data->rpc_done = 1; } - 
nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); } static void nfs4_open_confirm_release(void *calldata) @@ -745,12 +1713,13 @@ static void nfs4_open_confirm_release(void *calldata) goto out_free; state = nfs4_opendata_to_nfs4_state(data); if (!IS_ERR(state)) - nfs4_close_state(&data->path, state, data->o_arg.open_flags); + nfs4_close_state(state, data->o_arg.fmode); out_free: nfs4_opendata_put(data); } static const struct rpc_call_ops nfs4_open_confirm_ops = { + .rpc_call_prepare = nfs4_open_confirm_prepare, .rpc_call_done = nfs4_open_confirm_done, .rpc_release = nfs4_open_confirm_release, }; @@ -773,10 +1742,12 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) .rpc_message = &msg, .callback_ops = &nfs4_open_confirm_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status; + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; @@ -798,9 +1769,10 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; struct nfs4_state_owner *sp = data->owner; + struct nfs_client *clp = sp->so_server->nfs_client; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) - return; + goto out_wait; /* * Check if we still need to send an OPEN call, or if we can use * a delegation instead. 
@@ -808,30 +1780,50 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (data->state != NULL) { struct nfs_delegation *delegation; - if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL))) + if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); - if (delegation != NULL && - (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) { - rcu_read_unlock(); - goto out_no_action; - } + if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && + data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && + can_open_delegated(delegation, data->o_arg.fmode)) + goto unlock_no_action; rcu_read_unlock(); } - /* Update sequence id. */ - data->o_arg.id = sp->so_owner_id.id; - data->o_arg.clientid = sp->so_client->cl_clientid; - if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + /* Update client id. */ + data->o_arg.clientid = clp->cl_clientid; + switch (data->o_arg.claim) { + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; + case NFS4_OPEN_CLAIM_FH: task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; - rpc_call_start(task); + if (nfs4_setup_sequence(data->o_arg.server, + &data->o_arg.seq_args, + &data->o_res.seq_res, + task) != 0) + nfs_release_seqid(data->o_arg.seqid); + + /* Set the create mode (note dependency on the session type) */ + data->o_arg.createmode = NFS4_CREATE_UNCHECKED; + if (data->o_arg.open_flags & O_EXCL) { + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE; + if (nfs4_has_persistent_session(clp)) + data->o_arg.createmode = NFS4_CREATE_GUARDED; + else if (clp->cl_mvops->minor_version > 0) + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1; + } return; +unlock_no_action: + 
rcu_read_unlock(); out_no_action: task->tk_action = NULL; - +out_wait: + nfs4_sequence_done(task, &data->o_res.seq_res); } static void nfs4_open_done(struct rpc_task *task, void *calldata) @@ -839,10 +1831,13 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) struct nfs4_opendata *data = calldata; data->rpc_status = task->tk_status; - if (RPC_ASSASSINATED(task)) + + if (!nfs4_sequence_done(task, &data->o_res.seq_res)) return; + if (task->tk_status == 0) { - switch (data->o_res.f_attr->mode & S_IFMT) { + if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) { + switch (data->o_res.f_attr->mode & S_IFMT) { case S_IFREG: break; case S_IFLNK: @@ -853,12 +1848,12 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) break; default: data->rpc_status = -ENOTDIR; + } } renew_lease(data->o_res.server, data->timestamp); if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) nfs_confirm_seqid(&data->owner->so_seqid, 0); } - nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); data->rpc_done = 1; } @@ -878,7 +1873,7 @@ static void nfs4_open_release(void *calldata) goto out_free; state = nfs4_opendata_to_nfs4_state(data); if (!IS_ERR(state)) - nfs4_close_state(&data->path, state, data->o_arg.open_flags); + nfs4_close_state(state, data->o_arg.fmode); out_free: nfs4_opendata_put(data); } @@ -889,10 +1884,7 @@ static const struct rpc_call_ops nfs4_open_ops = { .rpc_release = nfs4_open_release, }; -/* - * Note: On error, nfs4_proc_open will free the struct nfs4_opendata - */ -static int _nfs4_proc_open(struct nfs4_opendata *data) +static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { struct inode *dir = data->dir->d_inode; struct nfs_server *server = NFS_SERVER(dir); @@ -910,59 +1902,138 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) .rpc_message = &msg, .callback_ops = &nfs4_open_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status; + 
nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; data->cancelled = 0; + data->is_recover = 0; + if (isrecover) { + nfs4_set_sequence_privileged(&o_arg->seq_args); + data->is_recover = 1; + } task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) { - data->cancelled = 1; - smp_wmb(); - } else - status = data->rpc_status; - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_put_task(task); + + return status; +} + +static int _nfs4_recover_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 1); if (status != 0 || !data->rpc_done) return status; - if (o_res->fh.size == 0) - _nfs4_proc_lookup(dir, o_arg->name, &o_res->fh, o_res->f_attr); + nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr); + + if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { + status = _nfs4_proc_open_confirm(data); + if (status != 0) + return status; + } + + return status; +} + +static int nfs4_opendata_access(struct rpc_cred *cred, + struct nfs4_opendata *opendata, + struct nfs4_state *state, fmode_t fmode, + int openflags) +{ + struct nfs_access_entry cache; + u32 mask; + + /* access call failed or for some reason the server doesn't + * support any access modes -- defer access call until later */ + if (opendata->o_res.access_supported == 0) + return 0; + + mask = 0; + /* don't check MAY_WRITE - a newly created file may not have + * write mode bits, but POSIX allows the creating process to write. + * use openflags to check for exec, because fmode won't + * always have FMODE_EXEC set when file open for exec. 
*/ + if (openflags & __FMODE_EXEC) { + /* ONLY check for exec rights */ + mask = MAY_EXEC; + } else if (fmode & FMODE_READ) + mask = MAY_READ; + + cache.cred = cred; + cache.jiffies = jiffies; + nfs_access_set_mask(&cache, opendata->o_res.access_result); + nfs_access_add_cache(state->inode, &cache); + + if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) + return 0; + + /* even though OPEN succeeded, access is denied. Close the file */ + nfs4_close_state(state, fmode); + return -EACCES; +} + +/* + * Note: On error, nfs4_proc_open will free the struct nfs4_opendata + */ +static int _nfs4_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + int status; + + status = nfs4_run_open_task(data, 0); + if (!data->rpc_done) + return status; + if (status != 0) { + if (status == -NFS4ERR_BADNAME && + !(o_arg->open_flags & O_CREAT)) + return -ENOENT; + return status; + } + + nfs_fattr_map_and_free_names(server, &data->f_attr); if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); - nfs_post_op_update_inode(dir, o_res->dir_attr); - } else - nfs_refresh_inode(dir, o_res->dir_attr); + if (o_arg->open_flags & O_EXCL) + data->file_created = 1; + else if (o_res->cinfo.before != o_res->cinfo.after) + data->file_created = 1; + } + if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) + server->caps &= ~NFS_CAP_POSIX_LOCK; if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { status = _nfs4_proc_open_confirm(data); if (status != 0) return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr); + nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); return 0; } static int nfs4_recover_expired_lease(struct nfs_server *server) { - struct nfs_client *clp = server->nfs_client; - int ret; - - for (;;) { - ret = 
nfs4_wait_clnt_recover(server->client, clp); - if (ret != 0) - return ret; - if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) - break; - nfs4_schedule_state_recovery(clp); - } - return 0; + return nfs4_client_recover_expired_lease(server->nfs_client); } /* @@ -975,20 +2046,18 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s struct nfs4_opendata *opendata; int ret; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); ret = nfs4_open_recover(opendata, state); - if (ret == -ESTALE) { - /* Invalidate the state owner so we don't ever use it again */ - nfs4_drop_state_owner(state->owner); - d_drop(ctx->path.dentry); - } + if (ret == -ESTALE) + d_drop(ctx->dentry); nfs4_opendata_put(opendata); return ret; } -static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) +static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_exception exception = { }; @@ -996,9 +2065,19 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4 do { err = _nfs4_open_expired(ctx, state); - if (err == -NFS4ERR_DELAY) + trace_nfs4_open_expired(ctx, 0, err); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: nfs4_handle_exception(server, err, &exception); + err = 0; + } } while (exception.retry); +out: return err; } @@ -1009,12 +2088,103 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) - return PTR_ERR(ctx); + return -EAGAIN; ret = nfs4_do_open_expired(ctx, state); put_nfs_open_context(ctx); return ret; } +#if defined(CONFIG_NFS_V4_1) +static void 
nfs41_clear_delegation_stateid(struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + nfs4_stateid *stateid = &state->stateid; + struct nfs_delegation *delegation; + struct rpc_cred *cred = NULL; + int status = -NFS4ERR_BAD_STATEID; + + /* If a state reset has been done, test_stateid is unneeded */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + return; + + /* Get the delegation credential for use by test/free_stateid */ + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation != NULL && + nfs4_stateid_match(&delegation->stateid, stateid)) { + cred = get_rpccred(delegation->cred); + rcu_read_unlock(); + status = nfs41_test_stateid(server, stateid, cred); + trace_nfs4_test_delegation_stateid(state, NULL, status); + } else + rcu_read_unlock(); + + if (status != NFS_OK) { + /* Free the stateid unless the server explicitly + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) + nfs41_free_stateid(server, stateid, cred); + nfs_remove_bad_delegation(state->inode); + + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); + write_sequnlock(&state->seqlock); + clear_bit(NFS_DELEGATED_STATE, &state->flags); + } + + if (cred != NULL) + put_rpccred(cred); +} + +/** + * nfs41_check_open_stateid - possibly free an open stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. 
+ */ +static int nfs41_check_open_stateid(struct nfs4_state *state) +{ + struct nfs_server *server = NFS_SERVER(state->inode); + nfs4_stateid *stateid = &state->open_stateid; + struct rpc_cred *cred = state->owner->so_cred; + int status; + + /* If a state reset has been done, test_stateid is unneeded */ + if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) && + (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) && + (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) + return -NFS4ERR_BAD_STATEID; + + status = nfs41_test_stateid(server, stateid, cred); + trace_nfs4_test_open_stateid(state, NULL, status); + if (status != NFS_OK) { + /* Free the stateid unless the server explicitly + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) + nfs41_free_stateid(server, stateid, cred); + + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } + return status; +} + +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + int status; + + nfs41_clear_delegation_stateid(state); + status = nfs41_check_open_stateid(state); + if (status != NFS_OK) + status = nfs4_open_expired(sp, state); + return status; +} +#endif + /* * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* * fields corresponding to attributes that were used to store the verifier. 
@@ -1031,74 +2201,180 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct sattr->ia_valid |= ATTR_MTIME; } +static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, + fmode_t fmode, + int flags, + struct nfs_open_context *ctx) +{ + struct nfs4_state_owner *sp = opendata->owner; + struct nfs_server *server = sp->so_server; + struct dentry *dentry; + struct nfs4_state *state; + unsigned int seq; + int ret; + + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + + ret = _nfs4_proc_open(opendata); + if (ret != 0) + goto out; + + state = nfs4_opendata_to_nfs4_state(opendata); + ret = PTR_ERR(state); + if (IS_ERR(state)) + goto out; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + dentry = opendata->dentry; + if (dentry->d_inode == NULL) { + /* FIXME: Is this d_drop() ever needed? */ + d_drop(dentry); + dentry = d_add_unique(dentry, igrab(state->inode)); + if (dentry == NULL) { + dentry = opendata->dentry; + } else if (dentry != ctx->dentry) { + dput(ctx->dentry); + ctx->dentry = dget(dentry); + } + nfs_set_verifier(dentry, + nfs_save_change_attribute(opendata->dir->d_inode)); + } + + ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); + if (ret != 0) + goto out; + + ctx->state = state; + if (dentry->d_inode == state->inode) { + nfs_inode_attach_open_context(ctx); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + nfs4_schedule_stateid_recovery(server, state); + } +out: + return ret; +} + /* * Returns a referenced nfs4_state */ -static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +static int _nfs4_do_open(struct inode *dir, + struct nfs_open_context *ctx, + int flags, + struct iattr *sattr, + struct nfs4_label *label, + int *opened) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); - struct nfs_client *clp 
= server->nfs_client; struct nfs4_opendata *opendata; + struct dentry *dentry = ctx->dentry; + struct rpc_cred *cred = ctx->cred; + struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; + fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); + enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; + struct nfs4_label *olabel = NULL; int status; /* Protect against reboot recovery conflicts */ status = -ENOMEM; - if (!(sp = nfs4_get_state_owner(server, cred))) { + sp = nfs4_get_state_owner(server, cred, GFP_KERNEL); + if (sp == NULL) { dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); goto out_err; } status = nfs4_recover_expired_lease(server); if (status != 0) goto err_put_state_owner; - if (path->dentry->d_inode != NULL) - nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE)); - down_read(&clp->cl_sem); + if (dentry->d_inode != NULL) + nfs4_return_incompatible_delegation(dentry->d_inode, fmode); status = -ENOMEM; - opendata = nfs4_opendata_alloc(path, sp, flags, sattr); + if (dentry->d_inode) + claim = NFS4_OPEN_CLAIM_FH; + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, + label, claim, GFP_KERNEL); if (opendata == NULL) - goto err_release_rwsem; + goto err_put_state_owner; - if (path->dentry->d_inode != NULL) - opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); + if (label) { + olabel = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(olabel)) { + status = PTR_ERR(olabel); + goto err_opendata_put; + } + } - status = _nfs4_proc_open(opendata); + if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { + if (!opendata->f_attr.mdsthreshold) { + opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); + if (!opendata->f_attr.mdsthreshold) + goto err_free_label; + } + opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; + } + if (dentry->d_inode != NULL) + opendata->state = nfs4_get_open_state(dentry->d_inode, sp); + + status = _nfs4_open_and_get_state(opendata, fmode, flags, 
ctx); if (status != 0) - goto err_opendata_put; + goto err_free_label; + state = ctx->state; - if (opendata->o_arg.open_flags & O_EXCL) + if ((opendata->o_arg.open_flags & O_EXCL) && + (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr); - state = nfs4_opendata_to_nfs4_state(opendata); - status = PTR_ERR(state); - if (IS_ERR(state)) - goto err_opendata_put; + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state, label, olabel); + if (status == 0) { + nfs_setattr_update_inode(state->inode, sattr); + nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + } + } + if (opendata->file_created) + *opened |= FILE_CREATED; + + if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { + *ctx_th = opendata->f_attr.mdsthreshold; + opendata->f_attr.mdsthreshold = NULL; + } + + nfs4_label_free(olabel); + nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); - up_read(&clp->cl_sem); - *res = state; return 0; +err_free_label: + nfs4_label_free(olabel); err_opendata_put: nfs4_opendata_put(opendata); -err_release_rwsem: - up_read(&clp->cl_sem); err_put_state_owner: nfs4_put_state_owner(sp); out_err: - *res = NULL; return status; } -static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred) +static struct nfs4_state *nfs4_do_open(struct inode *dir, + struct nfs_open_context *ctx, + int flags, + struct iattr *sattr, + struct nfs4_label *label, + int *opened) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; int status; do { - status = _nfs4_do_open(dir, path, flags, sattr, cred, &res); + status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); + res = ctx->state; + trace_nfs4_open_file(ctx, flags, status); if (status == 0) break; /* NOTE: 
BAD_SEQID means the server and client disagree about the @@ -1113,7 +2389,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int * the user though... */ if (status == -NFS4ERR_BAD_SEQID) { - printk(KERN_WARNING "NFS: v4 server %s " + pr_warn_ratelimited("NFS: v4 server %s " " returned a bad sequence-id error!\n", NFS_SERVER(dir)->nfs_client->cl_hostname); exception.retry = 1; @@ -1134,14 +2410,18 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int exception.retry = 1; continue; } - res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + if (nfs4_clear_cap_atomic_open_v1(server, status, &exception)) + continue; + res = ERR_PTR(nfs4_handle_exception(server, status, &exception)); } while (exception.retry); return res; } -static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, - struct iattr *sattr, struct nfs4_state *state) +static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_setattrargs arg = { @@ -1149,69 +2429,114 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, .iap = sattr, .server = server, .bitmask = server->attr_bitmask, + .label = ilabel, }; struct nfs_setattrres res = { .fattr = fattr, + .label = olabel, .server = server, }; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + .rpc_cred = cred, }; unsigned long timestamp = jiffies; + fmode_t fmode; + bool truncate; int status; + arg.bitmask = nfs4_bitmask(server, ilabel); + if (ilabel) + arg.bitmask = nfs4_bitmask(server, olabel); + nfs_fattr_init(fattr); - if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Servers 
should only apply open mode checks for file size changes */ + truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + fmode = truncate ? FMODE_WRITE : FMODE_READ; + + if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { /* Use that stateid */ - } else if (state != NULL) { - msg.rpc_cred = state->owner->so_cred; - nfs4_copy_stateid(&arg.stateid, state, current->files); + } else if (truncate && state != NULL) { + struct nfs_lockowner lockowner = { + .l_owner = current->files, + .l_pid = current->tgid, + }; + if (!nfs4_valid_open_stateid(state)) + return -EBADF; + if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, + &lockowner) == -EIO) + return -EBADF; } else - memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + nfs4_stateid_copy(&arg.stateid, &zero_stateid); - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); return status; } -static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr, - struct iattr *sattr, struct nfs4_state *state) +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .state = state, + .inode = inode, + }; int err; do { - err = nfs4_handle_exception(server, - _nfs4_do_setattr(inode, fattr, sattr, state), - &exception); + err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); + trace_nfs4_setattr(inode, err); + switch (err) { + case -NFS4ERR_OPENMODE: + if (!(sattr->ia_valid & ATTR_SIZE)) { + pr_warn_once("NFSv4: server %s is incorrectly " + "applying open mode checks to " + "a SETATTR that is not " + "changing file size.\n", + 
server->nfs_client->cl_hostname); + } + if (state && !(state->state & FMODE_WRITE)) { + err = -EBADF; + if (sattr->ia_valid & ATTR_OPEN) + err = -EACCES; + goto out; + } + } + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); +out: return err; } struct nfs4_closedata { - struct path path; struct inode *inode; struct nfs4_state *state; struct nfs_closeargs arg; struct nfs_closeres res; struct nfs_fattr fattr; unsigned long timestamp; + bool roc; + u32 roc_barrier; }; static void nfs4_free_closedata(void *data) { struct nfs4_closedata *calldata = data; struct nfs4_state_owner *sp = calldata->state->owner; + struct super_block *sb = calldata->state->inode->i_sb; + if (calldata->roc) + pnfs_roc_release(calldata->state->inode); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - dput(calldata->path.dentry); - mntput(calldata->path.mnt); + nfs_sb_deactive(sb); kfree(calldata); } @@ -1221,67 +2546,99 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); - if (RPC_ASSASSINATED(task)) + dprintk("%s: begin!\n", __func__); + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; + trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); /* hmm. we are done with the inode, and in the process of freeing * the state_owner. 
we keep this around to process errors */ - nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: - nfs_set_open_stateid(state, &calldata->res.stateid, 0); + if (calldata->roc) + pnfs_roc_set_barrier(state->inode, + calldata->roc_barrier); + nfs_clear_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); - break; + goto out_release; + case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: case -NFS4ERR_EXPIRED: - break; + if (calldata->arg.fmode == 0) + break; default: - if (nfs4_async_handle_error(task, server) == -EAGAIN) { - rpc_restart_call(task); - return; + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { + rpc_restart_call_prepare(task); + goto out_release; } } + nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); +out_release: + nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); + dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); } static void nfs4_close_prepare(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; - int clear_rd, clear_wr, clear_rdwr; + struct inode *inode = calldata->inode; + int call_close = 0; + dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; - clear_rd = clear_wr = clear_rdwr = 0; + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; + calldata->arg.fmode = FMODE_READ|FMODE_WRITE; spin_lock(&state->owner->so_lock); /* Calculate the change in open mode */ if (state->n_rdwr == 0) { if (state->n_rdonly == 0) { - clear_rd |= test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + 
calldata->arg.fmode &= ~FMODE_READ; } if (state->n_wronly == 0) { - clear_wr |= test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_rdwr |= test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); + call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + calldata->arg.fmode &= ~FMODE_WRITE; } } + if (!nfs4_valid_open_stateid(state)) + call_close = 0; spin_unlock(&state->owner->so_lock); - if (!clear_rd && !clear_wr && !clear_rdwr) { + + if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ - task->tk_action = NULL; - return; + goto out_no_action; } - nfs_fattr_init(calldata->res.fattr); - if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.open_flags = FMODE_READ; - } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.open_flags = FMODE_WRITE; + + if (calldata->arg.fmode == 0) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + if (calldata->roc && + pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { + nfs_release_seqid(calldata->arg.seqid); + goto out_wait; + } } + + nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; - rpc_call_start(task); + if (nfs4_setup_sequence(NFS_SERVER(inode), + &calldata->arg.seq_args, + &calldata->res.seq_res, + task) != 0) + nfs_release_seqid(calldata->arg.seqid); + dprintk("%s: done!\n", __func__); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_close_ops = { @@ -1301,7 +2658,7 @@ static const struct rpc_call_ops nfs4_close_ops = { * * NOTE: Caller must be holding the sp->so_owner semaphore! 
*/ -int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) +int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; @@ -1315,29 +2672,36 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_close_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status = -ENOMEM; - calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, + &task_setup_data.rpc_client, &msg); + + calldata = kzalloc(sizeof(*calldata), gfp_mask); if (calldata == NULL) goto out; + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1); calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); calldata->arg.stateid = &state->open_stateid; /* Serialization for the sequence id */ - calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); + calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); if (calldata->arg.seqid == NULL) goto out_free_calldata; - calldata->arg.bitmask = server->attr_bitmask; + calldata->arg.fmode = 0; + calldata->arg.bitmask = server->cache_consistency_bitmask; calldata->res.fattr = &calldata->fattr; + calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->path.mnt = mntget(path->mnt); - calldata->path.dentry = dget(path->dentry); + calldata->roc = pnfs_roc(state->inode); + nfs_sb_active(calldata->inode->i_sb); - msg.rpc_argp = &calldata->arg, - msg.rpc_resp = &calldata->res, + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -1355,141 +2719,112 @@ out: return status; } -static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state 
*state) -{ - struct file *filp; - int ret; - - /* If the open_intent is for execute, we have an extra check to make */ - if (nd->intent.open.flags & FMODE_EXEC) { - ret = nfs_may_open(state->inode, - state->owner->so_cred, - nd->intent.open.flags); - if (ret < 0) - goto out_close; - } - filp = lookup_instantiate_filp(nd, path->dentry, NULL); - if (!IS_ERR(filp)) { - struct nfs_open_context *ctx; - ctx = nfs_file_open_context(filp); - ctx->state = state; - return 0; - } - ret = PTR_ERR(filp); -out_close: - nfs4_close_sync(path, state, nd->intent.open.flags); - return ret; -} - -struct dentry * -nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct inode * +nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, + int open_flags, struct iattr *attr, int *opened) { - struct path path = { - .mnt = nd->path.mnt, - .dentry = dentry, - }; - struct dentry *parent; - struct iattr attr; - struct rpc_cred *cred; struct nfs4_state *state; - struct dentry *res; + struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; - if (nd->flags & LOOKUP_CREATE) { - attr.ia_mode = nd->intent.open.create_mode; - attr.ia_valid = ATTR_MODE; - if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current->fs->umask; - } else { - attr.ia_valid = 0; - BUG_ON(nd->intent.open.flags & O_CREAT); - } + label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); - if (IS_ERR(cred)) - return (struct dentry *)cred; - parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ - nfs_block_sillyrename(parent); - state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred); - put_rpccred(cred); - if (IS_ERR(state)) { - if (PTR_ERR(state) == -ENOENT) { - d_add(dentry, NULL); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - } - nfs_unblock_sillyrename(parent); - return (struct dentry *)state; - } - res = d_add_unique(dentry, igrab(state->inode)); - if (res != NULL) - path.dentry 
= res; - nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir)); - nfs_unblock_sillyrename(parent); - nfs4_intent_set_file(nd, &path, state); - return res; + state = nfs4_do_open(dir, ctx, open_flags, attr, label, opened); + + nfs4_label_release_security(label); + + if (IS_ERR(state)) + return ERR_CAST(state); + return state->inode; } -int -nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, struct nameidata *nd) +static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) { - struct path path = { - .mnt = nd->path.mnt, - .dentry = dentry, - }; - struct rpc_cred *cred; - struct nfs4_state *state; - - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); - if (IS_ERR(cred)) - return PTR_ERR(cred); - state = nfs4_do_open(dir, &path, openflags, NULL, cred); - put_rpccred(cred); - if (IS_ERR(state)) { - switch (PTR_ERR(state)) { - case -EPERM: - case -EACCES: - case -EDQUOT: - case -ENOSPC: - case -EROFS: - lookup_instantiate_filp(nd, (struct dentry *)state, NULL); - return 1; - default: - goto out_drop; - } - } - if (state->inode == dentry->d_inode) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - nfs4_intent_set_file(nd, &path, state); - return 1; - } - nfs4_close_sync(&path, state, openflags); -out_drop: - d_drop(dentry); - return 0; + if (ctx->state == NULL) + return; + if (is_sync) + nfs4_close_sync(ctx->state, ctx->mode); + else + nfs4_close_state(ctx->state, ctx->mode); } +#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) +#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { + struct nfs4_server_caps_arg args = { + .fhandle = fhandle, + }; struct nfs4_server_caps_res res = {}; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SERVER_CAPS], - .rpc_argp = fhandle, + .rpc_argp = 
&args, .rpc_resp = &res, }; int status; - status = rpc_call_sync(server->client, &msg, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { + /* Sanity check the server answers */ + switch (server->nfs_client->cl_minorversion) { + case 0: + res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; + res.attr_bitmask[2] = 0; + break; + case 1: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; + break; + case 2: + res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; + } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) + server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| + NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| + NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| + NFS_CAP_CTIME|NFS_CAP_MTIME| + NFS_CAP_SECURITY_LABEL); + if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && + res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) + server->caps |= NFS_CAP_FILEID; + if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) + server->caps |= NFS_CAP_MODE; + if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) + server->caps |= NFS_CAP_NLINK; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) + server->caps |= NFS_CAP_OWNER; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) + server->caps |= NFS_CAP_OWNER_GROUP; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) + server->caps |= NFS_CAP_ATIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) + server->caps |= NFS_CAP_CTIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) + server->caps |= NFS_CAP_MTIME; +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) + server->caps |= NFS_CAP_SECURITY_LABEL; +#endif + memcpy(server->attr_bitmask_nl, res.attr_bitmask, + 
sizeof(server->attr_bitmask)); + server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); + server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; + server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + server->cache_consistency_bitmask[2] = 0; server->acl_bitmask = res.acl_bitmask; + server->fh_expire_type = res.fh_expire_type; } + return status; } @@ -1508,8 +2843,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + u32 bitmask[3]; struct nfs4_lookup_root_arg args = { - .bitmask = nfs4_fattr_bitmap, + .bitmask = bitmask, }; struct nfs4_lookup_res res = { .server = server, @@ -1521,8 +2857,16 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; + + bitmask[0] = nfs4_fattr_bitmap[0]; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* + * Process the label in the upcoming getfattr + */ + bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL; + nfs_fattr_init(info->fattr); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -1531,35 +2875,174 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(server, - _nfs4_lookup_root(server, fhandle, info), - &exception); + err = _nfs4_lookup_root(server, fhandle, info); + trace_nfs4_lookup_root(server, fhandle, info->fattr, err); + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + goto out; + default: + err = nfs4_handle_exception(server, err, &exception); + } } while 
(exception.retry); +out: return err; } +static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, rpc_authflavor_t flavor) +{ + struct rpc_auth_create_args auth_args = { + .pseudoflavor = flavor, + }; + struct rpc_auth *auth; + int ret; + + auth = rpcauth_create(&auth_args, server->client); + if (IS_ERR(auth)) { + ret = -EACCES; + goto out; + } + ret = nfs4_lookup_root(server, fhandle, info); +out: + return ret; +} + /* - * get the file handle for the "/" directory on the server + * Retry pseudoroot lookup with various security flavors. We do this when: + * + * NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC + * NFSv4.1: the server does not support the SECINFO_NO_NAME operation + * + * Returns zero on success, or a negative NFS4ERR value, or a + * negative errno value. */ -static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, +static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + /* Per 3530bis 15.33.5 */ + static const rpc_authflavor_t flav_array[] = { + RPC_AUTH_GSS_KRB5P, + RPC_AUTH_GSS_KRB5I, + RPC_AUTH_GSS_KRB5, + RPC_AUTH_UNIX, /* courtesy */ + RPC_AUTH_NULL, + }; + int status = -EPERM; + size_t i; + + if (server->auth_info.flavor_len > 0) { + /* try each flavor specified by user */ + for (i = 0; i < server->auth_info.flavor_len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + server->auth_info.flavors[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + } else { + /* no flavors specified by user, try default list */ + for (i = 0; i < ARRAY_SIZE(flav_array); i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, + flav_array[i]); + if (status == -NFS4ERR_WRONGSEC || status == -EACCES) + continue; + break; + } + } + + /* + * -EACCESS could mean that the user doesn't have correct permissions + * to access the mount. 
It could also mean that we tried to mount + * with a gss auth flavor, but rpc.gssd isn't running. Either way, + * existing mount programs don't handle -EACCES very well so it should + * be mapped to -EPERM instead. + */ + if (status == -EACCES) + status = -EPERM; + return status; +} + +static int nfs4_do_find_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fsinfo *info) +{ + int mv = server->nfs_client->cl_minorversion; + return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); +} + +/** + * nfs4_proc_get_rootfh - get file handle for server's pseudoroot + * @server: initialized nfs_server handle + * @fhandle: we fill in the pseudo-fs root file handle + * @info: we fill in an FSINFO struct + * @auth_probe: probe the auth flavours + * + * Returns zero on success, or a negative errno. + */ +int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, + bool auth_probe) +{ int status; - status = nfs4_lookup_root(server, fhandle, info); + switch (auth_probe) { + case false: + status = nfs4_lookup_root(server, fhandle, info); + if (status != -NFS4ERR_WRONGSEC) + break; + default: + status = nfs4_do_find_root_sec(server, fhandle, info); + } + if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); + return nfs4_map_errors(status); } +static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, + struct nfs_fsinfo *info) +{ + int error; + struct nfs_fattr *fattr = info->fattr; + struct nfs4_label *label = NULL; + + error = nfs4_server_capabilities(server, mntfh); + if (error < 0) { + dprintk("nfs4_get_root: getcaps error = %d\n", -error); + return error; + } + + label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(label)) + return PTR_ERR(label); + + error = nfs4_proc_getattr(server, mntfh, fattr, label); + if (error < 0) { + dprintk("nfs4_get_root: getattr error = %d\n", -error); + goto 
err_free_label; + } + + if (fattr->valid & NFS_ATTR_FATTR_FSID && + !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + +err_free_label: + nfs4_label_free(label); + + return error; +} + /* * Get locations and (maybe) other attributes of a referral. * Note that we'll actually follow the referral later when * we detect fsid mismatch in inode revalidation */ -static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle) +static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, struct nfs_fattr *fattr, + struct nfs_fh *fhandle) { int status = -ENOMEM; struct page *page = NULL; @@ -1572,30 +3055,36 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct if (locations == NULL) goto out; - status = nfs4_proc_fs_locations(dir, name, locations, page); + status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; - /* Make sure server returned a different fsid for the referral */ + + /* + * If the fsid didn't change, this is a migration event, not a + * referral. Cause us to drop into the exception handler, which + * will kick off migration recovery. 
+ */ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { - dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name); - status = -EIO; + dprintk("%s: server did not return a different fsid for" + " a referral at %s\n", __func__, name->name); + status = -NFS4ERR_MOVED; goto out; } + /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ + nfs_fixup_referral_attributes(&locations->fattr); + /* replace the lookup nfs_fattr with the locations nfs_fattr */ memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); - fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL; - if (!fattr->mode) - fattr->mode = S_IFDIR; memset(fhandle, 0, sizeof(struct nfs_fh)); out: if (page) __free_page(page); - if (locations) - kfree(locations); + kfree(locations); return status; } -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_getattr_arg args = { .fh = fhandle, @@ -1603,6 +3092,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; struct nfs4_getattr_res res = { .fattr = fattr, + .label = label, .server = server, }; struct rpc_message msg = { @@ -1610,18 +3100,22 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - + + args.bitmask = nfs4_bitmask(server, label); + nfs_fattr_init(fattr); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } -static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_exception exception = { }; int err; do { - 
err = nfs4_handle_exception(server, - _nfs4_proc_getattr(server, fhandle, fattr), + err = _nfs4_proc_getattr(server, fhandle, fattr, label); + trace_nfs4_getattr(server, fhandle, fattr, err); + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; @@ -1648,45 +3142,64 @@ static int nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct rpc_cred *cred; struct inode *inode = dentry->d_inode; - struct nfs_open_context *ctx; + struct rpc_cred *cred = NULL; struct nfs4_state *state = NULL; + struct nfs4_label *label = NULL; int status; + if (pnfs_ld_layoutret_on_setattr(inode)) + pnfs_commit_and_return_layout(inode); + nfs_fattr_init(fattr); - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); - if (IS_ERR(cred)) - return PTR_ERR(cred); + /* Deal with open(O_TRUNC) */ + if (sattr->ia_valid & ATTR_OPEN) + sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME); + + /* Optimization: if the end result is no change, don't RPC */ + if ((sattr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) + return 0; /* Search for an existing open(O_WRITE) file */ - ctx = nfs_find_open_context(inode, cred, FMODE_WRITE); - if (ctx != NULL) - state = ctx->state; + if (sattr->ia_valid & ATTR_FILE) { + struct nfs_open_context *ctx; - status = nfs4_do_setattr(inode, fattr, sattr, state); - if (status == 0) + ctx = nfs_file_open_context(sattr->ia_file); + if (ctx) { + cred = ctx->cred; + state = ctx->state; + } + } + + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(label)) + return PTR_ERR(label); + + status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); + if (status == 0) { nfs_setattr_update_inode(inode, sattr); - if (ctx != NULL) - put_nfs_open_context(ctx); - put_rpccred(cred); + nfs_setsecurity(inode, fattr, label); + } + nfs4_label_free(label); return status; } -static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *dirfh, +static int 
_nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { + struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, - .dir_fh = dirfh, + .dir_fh = NFS_FH(dir), .name = name, }; struct nfs4_lookup_res res = { .server = server, .fattr = fattr, + .label = label, .fh = fhandle, }; struct rpc_message msg = { @@ -1695,68 +3208,102 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d .rpc_resp = &res, }; + args.bitmask = nfs4_bitmask(server, label); + nfs_fattr_init(fattr); - dprintk("NFS call lookupfh %s\n", name->name); - status = rpc_call_sync(server->client, &msg, 0); - dprintk("NFS reply lookupfh: %d\n", status); + dprintk("NFS call lookup %s\n", name->name); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply lookup: %d\n", status); return status; } -static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, - struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) +static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) +{ + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_MOUNTPOINT; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, + struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_exception exception = { }; + struct rpc_clnt *client = *clnt; int err; do { - err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); - /* FIXME: !!!! 
*/ - if (err == -NFS4ERR_MOVED) { - err = -EREMOTE; + err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label); + trace_nfs4_lookup(dir, name, err); + switch (err) { + case -NFS4ERR_BADNAME: + err = -ENOENT; + goto out; + case -NFS4ERR_MOVED: + err = nfs4_get_referral(client, dir, name, fattr, fhandle); + goto out; + case -NFS4ERR_WRONGSEC: + err = -EPERM; + if (client != *clnt) + goto out; + client = nfs4_negotiate_security(client, dir, name); + if (IS_ERR(client)) + return PTR_ERR(client); + + exception.retry = 1; break; + default: + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } - err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); + +out: + if (err == 0) + *clnt = client; + else if (client != *clnt) + rpc_shutdown_client(client); + return err; } -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { int status; - - dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); - if (status == -NFS4ERR_MOVED) - status = nfs4_get_referral(dir, name, fattr, fhandle); - dprintk("NFS reply lookup: %d\n", status); + struct rpc_clnt *client = NFS_CLIENT(dir); + + status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label); + if (client != NFS_CLIENT(dir)) { + rpc_shutdown_client(client); + nfs_fixup_secinfo_attributes(fattr); + } return status; } -static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +struct rpc_clnt * +nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { - struct nfs4_exception exception = { }; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(dir), - 
_nfs4_proc_lookup(dir, name, fhandle, fattr), - &exception); - } while (exception.retry); - return err; + struct rpc_clnt *client = NFS_CLIENT(dir); + int status; + + status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); + if (status < 0) + return ERR_PTR(status); + return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client; } static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_fattr fattr; struct nfs4_accessargs args = { .fh = NFS_FH(inode), - .bitmask = server->attr_bitmask, + .bitmask = server->cache_consistency_bitmask, }; struct nfs4_accessres res = { .server = server, - .fattr = &fattr, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], @@ -1765,7 +3312,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_cred = entry->cred, }; int mode = entry->mask; - int status; + int status = 0; /* * Determine which access bits we want to ask for... 
@@ -1783,18 +3330,17 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry if (mode & MAY_EXEC) args.access |= NFS4_ACCESS_EXECUTE; } - nfs_fattr_init(&fattr); - status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { - entry->mask = 0; - if (res.access & NFS4_ACCESS_READ) - entry->mask |= MAY_READ; - if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) - entry->mask |= MAY_WRITE; - if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) - entry->mask |= MAY_EXEC; - nfs_refresh_inode(inode, &fattr); + nfs_access_set_mask(entry, res.access); + nfs_refresh_inode(inode, res.fattr); } + nfs_free_fattr(res.fattr); return status; } @@ -1803,8 +3349,9 @@ static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(inode), - _nfs4_proc_access(inode, entry), + err = _nfs4_proc_access(inode, entry); + trace_nfs4_access(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); return err; @@ -1827,9 +3374,7 @@ static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) * * In the case of WRITE, we also want to put the GETATTR after * the operation -- in this case because we want to make sure - * we get the post-operation mtime and size. This means that - * we can't use xdr_encode_pages() as written: we need a variant - * of it which would leave room in the 'tail' iovec. + * we get the post-operation mtime and size. * * Both of these changes to the XDR layer would in fact be quite * minor, but I decided to leave them for a subsequent patch. 
@@ -1843,13 +3388,14 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page, .pglen = pglen, .pages = &page, }; + struct nfs4_readlink_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READLINK], .rpc_argp = &args, - .rpc_resp = NULL, + .rpc_resp = &res, }; - return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_readlink(struct inode *inode, struct page *page, @@ -1858,66 +3404,42 @@ static int nfs4_proc_readlink(struct inode *inode, struct page *page, struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(inode), - _nfs4_proc_readlink(inode, page, pgbase, pglen), + err = _nfs4_proc_readlink(inode, page, pgbase, pglen); + trace_nfs4_readlink(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); return err; } /* - * Got race? - * We will need to arrange for the VFS layer to provide an atomic open. - * Until then, this create/open method is prone to inefficiency and race - * conditions due to the lookup, create, and open VFS calls from sys_open() - * placed on the wire. - * - * Given the above sorry state of affairs, I'm simply sending an OPEN. - * The file will be opened again in the subsequent VFS open call - * (nfs4_proc_file_open). - * - * The open for read will just hang around to be used by any process that - * opens the file O_RDONLY. This will all be resolved with the VFS changes. + * This is just for mknod. open(O_CREAT) will always do ->open_context(). 
*/ - static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags) { - struct path path = { - .mnt = nd->path.mnt, - .dentry = dentry, - }; + struct nfs4_label l, *ilabel = NULL; + struct nfs_open_context *ctx; struct nfs4_state *state; - struct rpc_cred *cred; + int opened = 0; int status = 0; - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); - if (IS_ERR(cred)) { - status = PTR_ERR(cred); - goto out; - } - state = nfs4_do_open(dir, &path, flags, sattr, cred); - put_rpccred(cred); - d_drop(dentry); + ctx = alloc_nfs_open_context(dentry, FMODE_READ); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); + + sattr->ia_mode &= ~current_umask(); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened); if (IS_ERR(state)) { status = PTR_ERR(state); goto out; } - d_add(dentry, igrab(state->inode)); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - if (flags & O_EXCL) { - struct nfs_fattr fattr; - status = nfs4_do_setattr(state->inode, &fattr, sattr, state); - if (status == 0) - nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, &fattr); - } - if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) - status = nfs4_intent_set_file(nd, &path, state); - else - nfs4_close_sync(&path, state, flags); out: + nfs4_label_release_security(ilabel); + put_nfs_open_context(ctx); return status; } @@ -1926,9 +3448,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs args = { .fh = NFS_FH(dir), - .name.len = name->len, - .name.name = name->name, - .bitmask = server->attr_bitmask, + .name = *name, }; struct nfs_removeres res = { .server = server, @@ -1938,14 +3458,11 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) .rpc_argp = &args, .rpc_resp = &res, }; - int status; + int status; - 
nfs_fattr_init(&res.dir_attr); - status = rpc_call_sync(server->client, &msg, 0); - if (status == 0) { + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + if (status == 0) update_changeattr(dir, &res.cinfo); - nfs_post_op_update_inode(dir, &res.dir_attr); - } return status; } @@ -1954,8 +3471,9 @@ static int nfs4_proc_remove(struct inode *dir, struct qstr *name) struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_remove(dir, name), + err = _nfs4_proc_remove(dir, name); + trace_nfs4_remove(dir, name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); return err; @@ -1967,71 +3485,67 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - args->bitmask = server->attr_bitmask; res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; + nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); + + nfs_fattr_init(res->dir_attr); +} + +static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + nfs4_setup_sequence(NFS_SERVER(data->dir), + &data->args.seq_args, + &data->res.seq_res, + task); } static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) { - struct nfs_removeres *res = task->tk_msg.rpc_resp; + struct nfs_unlinkdata *data = task->tk_calldata; + struct nfs_removeres *res = &data->res; - if (nfs4_async_handle_error(task, res->server) == -EAGAIN) + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); - nfs_post_op_update_inode(dir, &res->dir_attr); return 1; } -static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) +static void 
nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) { - struct nfs_server *server = NFS_SERVER(old_dir); - struct nfs4_rename_arg arg = { - .old_dir = NFS_FH(old_dir), - .new_dir = NFS_FH(new_dir), - .old_name = old_name, - .new_name = new_name, - .bitmask = server->attr_bitmask, - }; - struct nfs_fattr old_fattr, new_fattr; - struct nfs4_rename_res res = { - .server = server, - .old_fattr = &old_fattr, - .new_fattr = &new_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; - - nfs_fattr_init(res.old_fattr); - nfs_fattr_init(res.new_fattr); - status = rpc_call_sync(server->client, &msg, 0); + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_renameargs *arg = msg->rpc_argp; + struct nfs_renameres *res = msg->rpc_resp; - if (!status) { - update_changeattr(old_dir, &res.old_cinfo); - nfs_post_op_update_inode(old_dir, res.old_fattr); - update_changeattr(new_dir, &res.new_cinfo); - nfs_post_op_update_inode(new_dir, res.new_fattr); - } - return status; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; + res->server = server; + nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); } -static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) +static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) { - struct nfs4_exception exception = { }; - int err; - do { - err = nfs4_handle_exception(NFS_SERVER(old_dir), - _nfs4_proc_rename(old_dir, old_name, - new_dir, new_name), - &exception); - } while (exception.retry); - return err; + nfs4_setup_sequence(NFS_SERVER(data->old_dir), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ + struct nfs_renamedata *data = task->tk_calldata; + struct nfs_renameres *res = &data->res; + + if 
(!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + return 0; + + update_changeattr(old_dir, &res->old_cinfo); + update_changeattr(new_dir, &res->new_cinfo); + return 1; } static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) @@ -2043,28 +3557,41 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * .name = name, .bitmask = server->attr_bitmask, }; - struct nfs_fattr fattr, dir_attr; struct nfs4_link_res res = { .server = server, - .fattr = &fattr, - .dir_attr = &dir_attr, + .label = NULL, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], .rpc_argp = &arg, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out; - nfs_fattr_init(res.fattr); - nfs_fattr_init(res.dir_attr); - status = rpc_call_sync(server->client, &msg, 0); + res.label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(res.label)) { + status = PTR_ERR(res.label); + goto out; + } + arg.bitmask = nfs4_bitmask(server, res.label); + + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo); - nfs_post_op_update_inode(dir, res.dir_attr); - nfs_post_op_update_inode(inode, res.fattr); + status = nfs_post_op_update_inode(inode, res.fattr); + if (!status) + nfs_setsecurity(inode, res.fattr, res.label); } + + nfs4_label_free(res.label); + +out: + nfs_free_fattr(res.fattr); return status; } @@ -2080,47 +3607,90 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n return err; } +struct nfs4_createdata { + struct rpc_message msg; + struct nfs4_create_arg arg; + struct nfs4_create_res res; + struct nfs_fh fh; + struct nfs_fattr fattr; + struct nfs4_label *label; +}; + +static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, + struct qstr *name, struct 
iattr *sattr, u32 ftype) +{ + struct nfs4_createdata *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data != NULL) { + struct nfs_server *server = NFS_SERVER(dir); + + data->label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(data->label)) + goto out_free; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; + data->msg.rpc_argp = &data->arg; + data->msg.rpc_resp = &data->res; + data->arg.dir_fh = NFS_FH(dir); + data->arg.server = server; + data->arg.name = name; + data->arg.attrs = sattr; + data->arg.ftype = ftype; + data->arg.bitmask = nfs4_bitmask(server, data->label); + data->res.server = server; + data->res.fh = &data->fh; + data->res.fattr = &data->fattr; + data->res.label = data->label; + nfs_fattr_init(data->res.fattr); + } + return data; +out_free: + kfree(data); + return NULL; +} + +static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) +{ + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); + if (status == 0) { + update_changeattr(dir, &data->res.dir_cinfo); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); + } + return status; +} + +static void nfs4_free_createdata(struct nfs4_createdata *data) +{ + nfs4_label_free(data->label); + kfree(data); +} + static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr) + struct page *page, unsigned int len, struct iattr *sattr, + struct nfs4_label *label) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .ftype = NF4LNK, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fhandle, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; 
- struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs4_createdata *data; + int status = -ENAMETOOLONG; if (len > NFS4_MAXPATHLEN) - return -ENAMETOOLONG; + goto out; - arg.u.symlink.pages = &page; - arg.u.symlink.len = len; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); + status = -ENOMEM; + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK); + if (data == NULL) + goto out; + + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; + data->arg.u.symlink.pages = &page; + data->arg.u.symlink.len = len; + data->arg.label = label; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2128,52 +3698,37 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; int err; + + label = nfs4_label_init_security(dir, dentry, sattr, &l); + do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_symlink(dir, dentry, page, - len, sattr), + err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label); + trace_nfs4_symlink(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); + + nfs4_label_release_security(label); return err; } static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr) + struct iattr *sattr, struct nfs4_label *label) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fhandle; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = NFS_FH(dir), - .server = server, - .name = 
&dentry->d_name, - .attrs = sattr, - .ftype = NF4DIR, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fhandle, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; + struct nfs4_createdata *data; + int status = -ENOMEM; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); - - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); + if (data == NULL) + goto out; + + data->arg.label = label; + status = nfs4_do_create(dir, dentry, data); + + nfs4_free_createdata(data); +out: return status; } @@ -2181,25 +3736,34 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; int err; + + label = nfs4_label_init_security(dir, dentry, sattr, &l); + + sattr->ia_mode &= ~current_umask(); do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_mkdir(dir, dentry, sattr), + err = _nfs4_proc_mkdir(dir, dentry, sattr, label); + trace_nfs4_mkdir(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); + nfs4_label_release_security(label); + return err; } static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = &page, + .pages = pages, .pgbase = 0, .count = count, .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .plus = plus, }; 
struct nfs4_readdir_res res; struct rpc_message msg = { @@ -2210,89 +3774,70 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, }; int status; - dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __FUNCTION__, - dentry->d_parent->d_name.name, - dentry->d_name.name, + dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, + dentry, (unsigned long long)cookie); - nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); + nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); res.pgbase = args.pgbase; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (status == 0) - memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + if (status >= 0) { + memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } nfs_invalidate_atime(dir); - dprintk("%s: returns %d\n", __FUNCTION__, status); + dprintk("%s: returns %d\n", __func__, status); return status; } static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), - _nfs4_proc_readdir(dentry, cred, cookie, - page, count, plus), + err = _nfs4_proc_readdir(dentry, cred, cookie, + pages, count, plus); + trace_nfs4_readdir(dentry->d_inode, err); + err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err, &exception); } while (exception.retry); return err; } static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, - struct iattr *sattr, dev_t rdev) + struct iattr *sattr, struct nfs4_label *label, dev_t rdev) { - struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fh fh; - struct nfs_fattr fattr, dir_fattr; - struct nfs4_create_arg arg = { - .dir_fh = 
NFS_FH(dir), - .server = server, - .name = &dentry->d_name, - .attrs = sattr, - .bitmask = server->attr_bitmask, - }; - struct nfs4_create_res res = { - .server = server, - .fh = &fh, - .fattr = &fattr, - .dir_fattr = &dir_fattr, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status; - int mode = sattr->ia_mode; + struct nfs4_createdata *data; + int mode = sattr->ia_mode; + int status = -ENOMEM; - nfs_fattr_init(&fattr); - nfs_fattr_init(&dir_fattr); + data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); + if (data == NULL) + goto out; - BUG_ON(!(sattr->ia_valid & ATTR_MODE)); - BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode)); if (S_ISFIFO(mode)) - arg.ftype = NF4FIFO; + data->arg.ftype = NF4FIFO; else if (S_ISBLK(mode)) { - arg.ftype = NF4BLK; - arg.u.device.specdata1 = MAJOR(rdev); - arg.u.device.specdata2 = MINOR(rdev); + data->arg.ftype = NF4BLK; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); } else if (S_ISCHR(mode)) { - arg.ftype = NF4CHR; - arg.u.device.specdata1 = MAJOR(rdev); - arg.u.device.specdata2 = MINOR(rdev); - } - else - arg.ftype = NF4SOCK; - - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (status == 0) { - update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); - status = nfs_instantiate(dentry, &fh, &fattr); + data->arg.ftype = NF4CHR; + data->arg.u.device.specdata1 = MAJOR(rdev); + data->arg.u.device.specdata2 = MINOR(rdev); + } else if (!S_ISSOCK(mode)) { + status = -EINVAL; + goto out_free; } + + data->arg.label = label; + status = nfs4_do_create(dir, dentry, data); +out_free: + nfs4_free_createdata(data); +out: return status; } @@ -2300,12 +3845,21 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = 
NULL; int err; + + label = nfs4_label_init_security(dir, dentry, sattr, &l); + + sattr->ia_mode &= ~current_umask(); do { - err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_mknod(dir, dentry, sattr, rdev), + err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev); + trace_nfs4_mknod(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); + + nfs4_label_release_security(label); + return err; } @@ -2316,14 +3870,17 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, .fh = fhandle, .bitmask = server->attr_bitmask, }; + struct nfs4_statfs_res res = { + .fsstat = fsstat, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_STATFS], .rpc_argp = &args, - .rpc_resp = fsstat, + .rpc_resp = &res, }; nfs_fattr_init(fsstat->fattr); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) @@ -2345,32 +3902,54 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, .fh = fhandle, .bitmask = server->attr_bitmask, }; + struct nfs4_fsinfo_res res = { + .fsinfo = fsinfo, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSINFO], .rpc_argp = &args, - .rpc_resp = fsinfo, + .rpc_resp = &res, }; - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { struct nfs4_exception exception = { }; + unsigned long now = jiffies; int err; do { - err = nfs4_handle_exception(server, - _nfs4_do_fsinfo(server, fhandle, fsinfo), - &exception); + err = _nfs4_do_fsinfo(server, fhandle, fsinfo); + trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); + if (err 
== 0) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo->lease_time * HZ; + clp->cl_last_renewal = now; + spin_unlock(&clp->cl_lock); + break; + } + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { + int error; + nfs_fattr_init(fsinfo->fattr); - return nfs4_do_fsinfo(server, fhandle, fsinfo); + error = nfs4_do_fsinfo(server, fhandle, fsinfo); + if (error == 0) { + /* block layout checks this! */ + server->pnfs_blksize = fsinfo->blksize; + set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + } + + return error; } static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -2380,10 +3959,13 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle .fh = fhandle, .bitmask = server->attr_bitmask, }; + struct nfs4_pathconf_res res = { + .pathconf = pathconf, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PATHCONF], .rpc_argp = &args, - .rpc_resp = pathconf, + .rpc_resp = &res, }; /* None of the pathconf attributes are mandatory to implement */ @@ -2393,7 +3975,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle } nfs_fattr_init(pathconf->fattr); - return rpc_call_sync(server->client, &msg, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -2410,115 +3992,303 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) { - struct nfs_server *server = 
NFS_SERVER(data->inode); + const struct nfs_lockowner *lockowner = NULL; - if (nfs4_async_handle_error(task, server) == -EAGAIN) { - rpc_restart_call(task); + if (l_ctx != NULL) + lockowner = &l_ctx->lockowner; + return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); +} +EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); + +static bool nfs4_stateid_is_current(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + nfs4_stateid current_stateid; + + /* If the current stateid represents a lost lock, then exit */ + if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO) + return true; + return nfs4_stateid_match(stateid, &current_stateid); +} + +static bool nfs4_error_stateid_expired(int err) +{ + switch (err) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_OPENMODE: + case -NFS4ERR_EXPIRED: + return true; + } + return false; +} + +void __nfs4_read_done_cb(struct nfs_pgio_data *data) +{ + nfs_invalidate_atime(data->header->inode); +} + +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) +{ + struct nfs_server *server = NFS_SERVER(data->header->inode); + + trace_nfs4_read(data, task->tk_status); + if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { + rpc_restart_call_prepare(task); return -EAGAIN; } - nfs_invalidate_atime(data->inode); + __nfs4_read_done_cb(data); if (task->tk_status > 0) renew_lease(server, data->timestamp); return 0; } -static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static bool nfs4_read_stateid_changed(struct rpc_task *task, + struct nfs_pgio_args *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_READ)) + return false; + rpc_restart_call_prepare(task); +
return true; +} + +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) +{ + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + if (nfs4_read_stateid_changed(task, &data->args)) + return -EAGAIN; + return data->pgio_done_cb ? data->pgio_done_cb(task, data) : + nfs4_read_done_cb(task, data); +} + +static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { data->timestamp = jiffies; + data->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { - struct inode *inode = data->inode; + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), + &data->args.seq_args, + &data->res.seq_res, + task)) + return 0; + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) + return -EIO; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + return -EIO; + return 0; +} + +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) +{ + struct inode *inode = data->header->inode; - if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { - rpc_restart_call(task); + trace_nfs4_write(data, task->tk_status); + if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { + rpc_restart_call_prepare(task); return -EAGAIN; } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), data->timestamp); - nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); + nfs_post_op_update_inode_force_wcc(inode, &data->fattr); } return 0; } -static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static bool 
nfs4_write_stateid_changed(struct rpc_task *task, + struct nfs_pgio_args *args) { - struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->attr_bitmask; + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_WRITE)) + return false; + rpc_restart_call_prepare(task); + return true; +} + +static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + if (nfs4_write_stateid_changed(task, &data->args)) + return -EAGAIN; + return data->pgio_done_cb ? data->pgio_done_cb(task, data) : + nfs4_write_done_cb(task, data); +} + +static +bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) +{ + const struct nfs_pgio_header *hdr = data->header; + + /* Don't request attributes for pNFS or O_DIRECT writes */ + if (data->ds_clp != NULL || hdr->dreq != NULL) + return false; + /* Otherwise, request attributes if and only if we don't hold + * a delegation + */ + return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; +} + +static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) +{ + struct nfs_server *server = NFS_SERVER(data->header->inode); + + if (!nfs4_write_need_cache_consistency_data(data)) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + + if (!data->pgio_done_cb) + data->pgio_done_cb = nfs4_write_done_cb; data->res.server = server; data->timestamp = jiffies; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); +} + +static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ + nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, + &data->res.seq_res, + task); } -static int nfs4_commit_done(struct rpc_task *task, struct 
nfs_write_data *data) +static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { struct inode *inode = data->inode; - - if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { - rpc_restart_call(task); + + trace_nfs4_commit(data, task->tk_status); + if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); return -EAGAIN; } - nfs_refresh_inode(inode, data->res.fattr); return 0; } -static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->commit_done_cb(task, data); +} + +static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->inode); - - data->args.bitmask = server->attr_bitmask; + + if (data->commit_done_cb == NULL) + data->commit_done_cb = nfs4_commit_done_cb; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } +struct nfs4_renewdata { + struct nfs_client *client; + unsigned long timestamp; +}; + /* * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special * standalone procedure for queueing an asynchronous RENEW. 
*/ -static void nfs4_renew_done(struct rpc_task *task, void *data) +static void nfs4_renew_release(void *calldata) { - struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp; - unsigned long timestamp = (unsigned long)data; + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; - if (task->tk_status < 0) { - switch (task->tk_status) { - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_CB_PATH_DOWN: - nfs4_schedule_state_recovery(clp); + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); + kfree(data); +} + +static void nfs4_renew_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; + unsigned long timestamp = data->timestamp; + + trace_nfs4_renew_async(clp, task->tk_status); + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_LEASE_MOVED: + nfs4_schedule_lease_moved_recovery(clp); + break; + default: + /* Unless we're shutting down, schedule state recovery! 
*/ + if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) + return; + if (task->tk_status != NFS4ERR_CB_PATH_DOWN) { + nfs4_schedule_lease_recovery(clp); + return; } - return; + nfs4_schedule_path_down_recovery(clp); } - spin_lock(&clp->cl_lock); - if (time_before(clp->cl_last_renewal,timestamp)) - clp->cl_last_renewal = timestamp; - spin_unlock(&clp->cl_lock); + do_renew_lease(clp, timestamp); } static const struct rpc_call_ops nfs4_renew_ops = { .rpc_call_done = nfs4_renew_done, + .rpc_release = nfs4_renew_release, }; -int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], .rpc_argp = clp, .rpc_cred = cred, }; + struct nfs4_renewdata *data; - return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - &nfs4_renew_ops, (void *)jiffies); + if (renew_flags == 0) + return 0; + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; + data = kmalloc(sizeof(*data), GFP_NOFS); + if (data == NULL) + return -ENOMEM; + data->client = clp; + data->timestamp = jiffies; + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, + &nfs4_renew_ops, data); } -int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -2528,40 +4298,51 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) unsigned long now = jiffies; int status; - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status < 0) return status; - spin_lock(&clp->cl_lock); - if (time_before(clp->cl_last_renewal,now)) - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); + do_renew_lease(clp, now); return 0; } static inline int 
nfs4_server_supports_acls(struct nfs_server *server) { - return (server->caps & NFS_CAP_ACLS) - && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) - && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); + return server->caps & NFS_CAP_ACLS; } -/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE, and that - * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE) bytes on +/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that + * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_SIZE) bytes on * the stack. */ -#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) +#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) -static void buf_to_pages(const void *buf, size_t buflen, +static int buf_to_pages_noslab(const void *buf, size_t buflen, struct page **pages, unsigned int *pgbase) { - const void *p = buf; + struct page *newpage, **spages; + int rc = 0; + size_t len; + spages = pages; - *pgbase = offset_in_page(buf); - p -= *pgbase; - while (p < buf + buflen) { - *(pages++) = virt_to_page(p); - p += PAGE_CACHE_SIZE; - } + do { + len = min_t(size_t, PAGE_SIZE, buflen); + newpage = alloc_page(GFP_KERNEL); + + if (newpage == NULL) + goto unwind; + memcpy(page_address(newpage), buf, len); + buf += len; + buflen -= len; + *pages++ = newpage; + rc++; + } while (buflen != 0); + + return rc; + +unwind: + for(; rc > 0; rc--) + __free_page(spages[rc-1]); + return -ENOMEM; } struct nfs4_cached_acl { @@ -2610,16 +4391,17 @@ out: return ret; } -static void nfs4_write_cached_acl(struct inode *inode, const char *buf, size_t acl_len) +static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len) { struct nfs4_cached_acl *acl; + size_t buflen = sizeof(*acl) + acl_len; - if (buf && acl_len <= PAGE_SIZE) { - acl = kmalloc(sizeof(*acl) + acl_len, GFP_KERNEL); + if (buflen <= PAGE_SIZE) { + acl = kmalloc(buflen, GFP_KERNEL); if (acl == NULL) goto out; acl->cached = 1; - memcpy(acl->data, buf, 
acl_len); + _copy_from_pages(acl->data, pages, pgbase, acl_len); } else { acl = kmalloc(sizeof(*acl), GFP_KERNEL); if (acl == NULL) @@ -2631,56 +4413,87 @@ out: nfs4_set_cached_acl(inode, acl); } +/* + * The getxattr API returns the required buffer length when called with a + * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating + * the required buf. On a NULL buf, we send a page of data to the server + * guessing that the ACL request can be serviced by a page. If so, we cache + * up to the page of ACL data, and the 2nd call to getxattr is serviced by + * the cache. If not so, we throw away the page, and cache the required + * length. The next getxattr call will then produce another round trip to + * the server, this time with the input buf of the required size. + */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES]; + struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, .acl_len = buflen, }; - size_t resp_len = buflen; - void *resp_buf; + struct nfs_getaclres res = { + .acl_len = buflen, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], .rpc_argp = &args, - .rpc_resp = &resp_len, + .rpc_resp = &res, }; - struct page *localpage = NULL; - int ret; + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + int ret = -ENOMEM, i; + + /* As long as we're doing a round trip to the server anyway, + * let's be prepared for a page of acl data. */ + if (npages == 0) + npages = 1; + if (npages > ARRAY_SIZE(pages)) + return -ERANGE; - if (buflen < PAGE_SIZE) { - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. 
*/ - localpage = alloc_page(GFP_KERNEL); - resp_buf = page_address(localpage); - if (localpage == NULL) - return -ENOMEM; - args.acl_pages[0] = localpage; - args.acl_pgbase = 0; - resp_len = args.acl_len = PAGE_SIZE; - } else { - resp_buf = buf; - buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); + for (i = 0; i < npages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; } - ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + + /* for decoding across pages */ + res.acl_scratch = alloc_page(GFP_KERNEL); + if (!res.acl_scratch) + goto out_free; + + args.acl_len = npages * PAGE_SIZE; + args.acl_pgbase = 0; + + dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", + __func__, buf, buflen, npages, args.acl_len); + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), + &msg, &args.seq_args, &res.seq_res, 0); if (ret) goto out_free; - if (resp_len > args.acl_len) - nfs4_write_cached_acl(inode, NULL, resp_len); - else - nfs4_write_cached_acl(inode, resp_buf, resp_len); - if (buf) { + + /* Handle the case where the passed-in buffer is too short */ + if (res.acl_flags & NFS4_ACL_TRUNC) { + /* Did the user only issue a request for the acl length? 
*/ + if (buf == NULL) + goto out_ok; ret = -ERANGE; - if (resp_len > buflen) + goto out_free; + } + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + if (buf) { + if (res.acl_len > buflen) { + ret = -ERANGE; goto out_free; - if (localpage) - memcpy(buf, resp_buf, resp_len); + } + _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len); } - ret = resp_len; +out_ok: + ret = res.acl_len; out_free: - if (localpage) - __free_page(localpage); + for (i = 0; i < npages; i++) + if (pages[i]) + __free_page(pages[i]); + if (res.acl_scratch) + __free_page(res.acl_scratch); return ret; } @@ -2690,6 +4503,7 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl ssize_t ret; do { ret = __nfs4_get_acl_uncached(inode, buf, buflen); + trace_nfs4_get_acl(inode, ret); if (ret >= 0) break; ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); @@ -2707,8 +4521,12 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) ret = nfs_revalidate_inode(server, inode); if (ret < 0) return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) + /* -ENOENT is returned if there is no ACL or if there is an ACL + * but no cached acl data, just the acl length */ return ret; return nfs4_get_acl_uncached(inode, buf, buflen); } @@ -2722,19 +4540,41 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl .acl_pages = pages, .acl_len = buflen, }; + struct nfs_setaclres res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], .rpc_argp = &arg, - .rpc_resp = NULL, + .rpc_resp = &res, }; - int ret; + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + int ret, i; if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; - nfs_inode_return_delegation(inode); - buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); - ret = 
rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_zap_caches(inode); + if (npages > ARRAY_SIZE(pages)) + return -ERANGE; + i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + if (i < 0) + return i; + nfs4_inode_return_delegation(inode); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + + /* + * Free each page after tx, so the only ref left is + * held by the network stack + */ + for (; i > 0; i--) + put_page(pages[i-1]); + + /* + * Acl update can result in inode attribute update. + * so mark the attribute cache invalid. + */ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); + nfs_zap_acl_cache(inode); return ret; } @@ -2743,213 +4583,390 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen struct nfs4_exception exception = { }; int err; do { - err = nfs4_handle_exception(NFS_SERVER(inode), - __nfs4_proc_set_acl(inode, buf, buflen), + err = __nfs4_proc_set_acl(inode, buf, buflen); + trace_nfs4_set_acl(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); return err; } -static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static int _nfs4_get_security_label(struct inode *inode, void *buf, + size_t buflen) { - struct nfs_client *clp = server->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr fattr; + struct nfs4_label label = {0, 0, buflen, buf}; - if (!clp || task->tk_status >= 0) - return 0; - switch(task->tk_status) { - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); - nfs4_schedule_state_recovery(clp); - if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) - rpc_wake_up_task(task); - task->tk_status = 0; - return -EAGAIN; - case 
-NFS4ERR_DELAY: - nfs_inc_server_stats((struct nfs_server *) server, - NFSIOS_DELAY); - case -NFS4ERR_GRACE: - rpc_delay(task, NFS4_POLL_RETRY_MAX); - task->tk_status = 0; - return -EAGAIN; - case -NFS4ERR_OLD_STATEID: - task->tk_status = 0; - return -EAGAIN; - } - task->tk_status = nfs4_map_errors(task->tk_status); + u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs4_getattr_arg arg = { + .fh = NFS_FH(inode), + .bitmask = bitmask, + }; + struct nfs4_getattr_res res = { + .fattr = &fattr, + .label = &label, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int ret; + + nfs_fattr_init(&fattr); + + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); + if (ret) + return ret; + if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) + return -ENOENT; + if (buflen < label.len) + return -ERANGE; return 0; } -static int nfs4_wait_bit_killable(void *word) +static int nfs4_get_security_label(struct inode *inode, void *buf, + size_t buflen) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; + struct nfs4_exception exception = { }; + int err; + + if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) + return -EOPNOTSUPP; + + do { + err = _nfs4_get_security_label(inode, buf, buflen); + trace_nfs4_get_security_label(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + return err; } -static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp) +static int _nfs4_do_set_security_label(struct inode *inode, + struct nfs4_label *ilabel, + struct nfs_fattr *fattr, + struct nfs4_label *olabel) { - int res; - might_sleep(); + struct iattr sattr = {0}; + struct nfs_server *server = NFS_SERVER(inode); + const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), 
+ .iap = &sattr, + .server = server, + .bitmask = bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; - rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_); + nfs4_stateid_copy(&arg.stateid, &zero_stateid); - res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER, - nfs4_wait_bit_killable, TASK_KILLABLE); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + if (status) + dprintk("%s failed: %d\n", __func__, status); - rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_); - return res; + return status; } -static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) +static int nfs4_do_set_security_label(struct inode *inode, + struct nfs4_label *ilabel, + struct nfs_fattr *fattr, + struct nfs4_label *olabel) { - int res = 0; + struct nfs4_exception exception = { }; + int err; - might_sleep(); + do { + err = _nfs4_do_set_security_label(inode, ilabel, + fattr, olabel); + trace_nfs4_set_security_label(inode, err); + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + return err; +} - if (*timeout <= 0) - *timeout = NFS4_POLL_RETRY_MIN; - if (*timeout > NFS4_POLL_RETRY_MAX) - *timeout = NFS4_POLL_RETRY_MAX; - schedule_timeout_killable(*timeout); - if (fatal_signal_pending(current)) - res = -ERESTARTSYS; - *timeout <<= 1; - return res; +static int +nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +{ + struct nfs4_label ilabel, *olabel = NULL; + struct nfs_fattr fattr; + struct rpc_cred *cred; + struct inode *inode = dentry->d_inode; + int status; + + if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) + return -EOPNOTSUPP; + + nfs_fattr_init(&fattr); + + ilabel.pi = 0; + ilabel.lfs = 0; + ilabel.label = (char *)buf; + ilabel.len = buflen; + + 
cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + + olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(olabel)) { + status = -PTR_ERR(olabel); + goto out; + } + + status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel); + if (status == 0) + nfs_setsecurity(inode, &fattr, olabel); + + nfs4_label_free(olabel); +out: + put_rpccred(cred); + return status; } +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ -/* This is the error handling routine for processes that are allowed - * to sleep. - */ -static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) + +static int +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) { struct nfs_client *clp = server->nfs_client; - int ret = errorcode; - exception->retry = 0; - switch(errorcode) { - case 0: - return 0; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: + if (task->tk_status >= 0) + return 0; + switch(task->tk_status) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state == NULL) + break; + nfs_remove_bad_delegation(state->inode); + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto recovery_failed; + goto wait_on_recovery; case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(clp); - ret = nfs4_wait_clnt_recover(server->client, clp); - if (ret == 0) - exception->retry = 1; - break; - case -NFS4ERR_FILE_OPEN: - case -NFS4ERR_GRACE: + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto recovery_failed; + } + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; + case -NFS4ERR_MOVED: + if (nfs4_schedule_migration_recovery(server) < 0) + goto recovery_failed; + goto wait_on_recovery; + case -NFS4ERR_LEASE_MOVED: + 
nfs4_schedule_lease_moved_recovery(clp); + goto wait_on_recovery; +#if defined(CONFIG_NFS_V4_1) + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session\n", __func__, + task->tk_status); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + goto wait_on_recovery; +#endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: - ret = nfs4_delay(server->client, &exception->timeout); - if (ret != 0) - break; + nfs_inc_server_stats(server, NFSIOS_DELAY); + case -NFS4ERR_GRACE: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_OLD_STATEID: - exception->retry = 1; + goto restart_call; } - /* We failed to handle the error */ - return nfs4_map_errors(ret); + task->tk_status = nfs4_map_errors(task->tk_status); + return 0; +recovery_failed: + task->tk_status = -EIO; + return 0; +wait_on_recovery: + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + goto recovery_failed; +restart_call: + task->tk_status = 0; + return -EAGAIN; +} + +static void nfs4_init_boot_verifier(const struct nfs_client *clp, + nfs4_verifier *bootverf) +{ + __be32 verf[2]; + + if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { + /* An impossible timestamp guarantees this value + * will never match a generated boot time. 
*/ + verf[0] = 0; + verf[1] = cpu_to_be32(NSEC_PER_SEC + 1); + } else { + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + verf[0] = cpu_to_be32(nn->boot_time.tv_sec); + verf[1] = cpu_to_be32(nn->boot_time.tv_nsec); + } + memcpy(bootverf->data, verf, sizeof(bootverf->data)); } -int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) +static unsigned int +nfs4_init_nonuniform_client_string(const struct nfs_client *clp, + char *buf, size_t len) +{ + unsigned int result; + + rcu_read_lock(); + result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_PROTO)); + rcu_read_unlock(); + return result; +} + +static unsigned int +nfs4_init_uniform_client_string(const struct nfs_client *clp, + char *buf, size_t len) +{ + const char *nodename = clp->cl_rpcclient->cl_nodename; + + if (nfs4_client_id_uniquifier[0] != '\0') + return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, + clp->cl_minorversion, + nfs4_client_id_uniquifier, + nodename); + return scnprintf(buf, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + nodename); +} + +/* + * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback + * services. Advertise one based on the address family of the + * clientaddr. 
+ */ +static unsigned int +nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len) +{ + if (strchr(clp->cl_ipaddr, ':') != NULL) + return scnprintf(buf, len, "tcp6"); + else + return scnprintf(buf, len, "tcp"); +} + +/** + * nfs4_proc_setclientid - Negotiate client ID + * @clp: state data structure + * @program: RPC program for NFSv4 callback service + * @port: IP port number for NFS4 callback service + * @cred: RPC credential to use for this call + * @res: where to place the result + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + */ +int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) { nfs4_verifier sc_verifier; struct nfs4_setclientid setclientid = { .sc_verifier = &sc_verifier, .sc_prog = program, + .sc_cb_ident = clp->cl_cb_ident, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], .rpc_argp = &setclientid, - .rpc_resp = clp, + .rpc_resp = res, .rpc_cred = cred, }; - __be32 *p; - int loop = 0; int status; - p = (__be32*)sc_verifier.data; - *p++ = htonl((u32)clp->cl_boot_time.tv_sec); - *p = htonl((u32)clp->cl_boot_time.tv_nsec); - - for(;;) { - setclientid.sc_name_len = scnprintf(setclientid.sc_name, - sizeof(setclientid.sc_name), "%s/%s %s %s %u", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_PROTO), - cred->cr_ops->cr_name, - clp->cl_id_uniquifier); - setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, - sizeof(setclientid.sc_netid), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_NETID)); - setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, + /* nfs_client_id4 */ + nfs4_init_boot_verifier(clp, &sc_verifier); + if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) + setclientid.sc_name_len = + nfs4_init_uniform_client_string(clp, + setclientid.sc_name, + sizeof(setclientid.sc_name)); + 
else + setclientid.sc_name_len = + nfs4_init_nonuniform_client_string(clp, + setclientid.sc_name, + sizeof(setclientid.sc_name)); + /* cb_client4 */ + setclientid.sc_netid_len = + nfs4_init_callback_netid(clp, + setclientid.sc_netid, + sizeof(setclientid.sc_netid)); + setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); - if (status != -NFS4ERR_CLID_INUSE) - break; - if (signalled()) - break; - if (loop++ & 1) - ssleep(clp->cl_lease_time + 1); - else - if (++clp->cl_id_uniquifier == 0) - break; - } + dprintk("NFS call setclientid auth=%s, '%.*s'\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + setclientid.sc_name_len, setclientid.sc_name); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_setclientid(clp, status); + dprintk("NFS reply setclientid: %d\n", status); return status; } -static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) +/** + * nfs4_proc_setclientid_confirm - Confirm client ID + * @clp: state data structure + * @res: result of a previous SETCLIENTID + * @cred: RPC credential to use for this call + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. 
+ */ +int nfs4_proc_setclientid_confirm(struct nfs_client *clp, + struct nfs4_setclientid_res *arg, + struct rpc_cred *cred) { - struct nfs_fsinfo fsinfo; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], - .rpc_argp = clp, - .rpc_resp = &fsinfo, + .rpc_argp = arg, .rpc_cred = cred, }; - unsigned long now; int status; - now = jiffies; - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); - if (status == 0) { - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = now; - clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - spin_unlock(&clp->cl_lock); - } + dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + clp->cl_clientid); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_setclientid_confirm(clp, status); + dprintk("NFS reply setclientid_confirm: %d\n", status); return status; } -int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) -{ - long timeout; - int err; - do { - err = _nfs4_proc_setclientid_confirm(clp, cred); - switch (err) { - case 0: - return err; - case -NFS4ERR_RESOURCE: - /* The IBM lawyers misread another document! 
*/ - case -NFS4ERR_DELAY: - err = nfs4_delay(clp->cl_rpcclient, &timeout); - } - } while (err == 0); - return err; -} - struct nfs4_delegreturndata { struct nfs4_delegreturnargs args; struct nfs4_delegreturnres res; @@ -2963,9 +4980,31 @@ struct nfs4_delegreturndata { static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; - data->rpc_status = task->tk_status; - if (data->rpc_status == 0) + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); + switch (task->tk_status) { + case 0: renew_lease(data->res.server, data->timestamp); + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + task->tk_status = 0; + break; + default: + if (nfs4_async_handle_error(task, data->res.server, NULL) == + -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } + data->rpc_status = task->tk_status; } static void nfs4_delegreturn_release(void *calldata) @@ -2973,7 +5012,20 @@ static void nfs4_delegreturn_release(void *calldata) kfree(calldata); } +static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_delegreturndata *d_data; + + d_data = (struct nfs4_delegreturndata *)data; + + nfs4_setup_sequence(d_data->res.server, + &d_data->args.seq_args, + &d_data->res.seq_res, + task); +} + static const struct rpc_call_ops nfs4_delegreturn_ops = { + .rpc_call_prepare = nfs4_delegreturn_prepare, .rpc_call_done = nfs4_delegreturn_done, .rpc_release = nfs4_delegreturn_release, }; @@ -2995,14 +5047,15 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co }; int status = 0; - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_NOFS); if (data == NULL) return -ENOMEM; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 
1); data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; nfs_copy_fh(&data->fh, NFS_FH(inode)); - memcpy(&data->stateid, stateid, sizeof(data->stateid)); + nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; data->res.server = server; nfs_fattr_init(data->res.fattr); @@ -3010,8 +5063,8 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co data->rpc_status = 0; task_setup_data.callback_data = data; - msg.rpc_argp = &data->args, - msg.rpc_resp = &data->res, + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -3021,9 +5074,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co if (status != 0) goto out; status = data->rpc_status; - if (status != 0) - goto out; - nfs_refresh_inode(inode, &data->fattr); + if (status == 0) + nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + else + nfs_refresh_inode(inode, &data->fattr); out: rpc_put_task(task); return status; @@ -3036,6 +5090,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int err; do { err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); + trace_nfs4_delegreturn(inode, err); switch (err) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -3056,7 +5111,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 static unsigned long nfs4_set_lock_task_retry(unsigned long timeout) { - schedule_timeout_killable(timeout); + freezable_schedule_timeout_killable_unsafe(timeout); timeout <<= 1; if (timeout > NFS4_LOCK_MAXTIMEOUT) return NFS4_LOCK_MAXTIMEOUT; @@ -3084,14 +5139,14 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock struct nfs4_lock_state *lsp; int status; - down_read(&clp->cl_sem); arg.lock_owner.clientid = 
clp->cl_clientid; status = nfs4_set_lock_state(state, request); if (status != 0) goto out; lsp = request->fl_u.nfs4_fl.owner; - arg.lock_owner.id = lsp->ls_id.id; - status = rpc_call_sync(server->client, &msg, 0); + arg.lock_owner.id = lsp->ls_seqid.owner_id; + arg.lock_owner.s_dev = server->s_dev; + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { case 0: request->fl_type = F_UNLCK; @@ -3100,8 +5155,8 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = 0; } request->fl_ops->fl_release_private(request); + request->fl_ops = NULL; out: - up_read(&clp->cl_sem); return status; } @@ -3111,8 +5166,9 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * int err; do { - err = nfs4_handle_exception(NFS_SERVER(state->inode), - _nfs4_proc_getlk(state, cmd, request), + err = _nfs4_proc_getlk(state, cmd, request); + trace_nfs4_get_lock(request, state, cmd, err); + err = nfs4_handle_exception(NFS_SERVER(state->inode), err, &exception); } while (exception.retry); return err; @@ -3152,12 +5208,13 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct inode *inode = lsp->ls_state->inode; - p = kmalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_NOFS); if (p == NULL) return NULL; p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; + p->res.seqid = seqid; p->arg.stateid = &lsp->ls_stateid; p->lsp = lsp; atomic_inc(&lsp->ls_count); @@ -3181,23 +5238,24 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; - if (RPC_ASSASSINATED(task)) + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; - nfs_increment_lock_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: - memcpy(calldata->lsp->ls_stateid.data, - calldata->res.stateid.data, - sizeof(calldata->lsp->ls_stateid.data)); + 
nfs4_stateid_copy(&calldata->lsp->ls_stateid, + &calldata->res.stateid); renew_lease(calldata->server, calldata->timestamp); break; + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: break; default: - if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) - rpc_restart_call(task); + if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); } + nfs_release_seqid(calldata->arg.seqid); } static void nfs4_locku_prepare(struct rpc_task *task, void *data) @@ -3205,14 +5263,22 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) struct nfs4_unlockdata *calldata = data; if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; - if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { + goto out_wait; + if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ - task->tk_action = NULL; - return; + goto out_no_action; } calldata->timestamp = jiffies; - rpc_call_start(task); + if (nfs4_setup_sequence(calldata->server, + &calldata->arg.seq_args, + &calldata->res.seq_res, + task) != 0) + nfs_release_seqid(calldata->arg.seqid); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -3235,9 +5301,13 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .rpc_client = NFS_CLIENT(lsp->ls_state->inode), .rpc_message = &msg, .callback_ops = &nfs4_locku_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; + nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, + NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); + /* Ensure this is an unlock - when canceling a lock, the * canceled lock is passed in, and it won't be an unlock. 
*/ @@ -3249,31 +5319,45 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, return ERR_PTR(-ENOMEM); } - msg.rpc_argp = &data->arg, - msg.rpc_resp = &data->res, + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; task_setup_data.callback_data = data; return rpc_run_task(&task_setup_data); } static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { + struct inode *inode = state->inode; + struct nfs4_state_owner *sp = state->owner; + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; int status = 0; + unsigned char fl_flags = request->fl_flags; status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ request->fl_flags |= FL_EXISTS; - if (do_vfs_lock(request->fl_file, request) == -ENOENT) + /* Exclude nfs_delegation_claim_locks() */ + mutex_lock(&sp->so_delegreturn_mutex); + /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ + down_read(&nfsi->rwsem); + if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; + } + up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? 
*/ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out; lsp = request->fl_u.nfs4_fl.owner; - seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) + goto out; + seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); status = -ENOMEM; if (seqid == NULL) goto out; @@ -3284,6 +5368,8 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = nfs4_wait_for_completion_rpc_task(task); rpc_put_task(task); out: + request->fl_flags = fl_flags; + trace_nfs4_unlock(request, state, F_SETLK, status); return status; } @@ -3296,31 +5382,36 @@ struct nfs4_lockdata { unsigned long timestamp; int rpc_status; int cancelled; + struct nfs_server *server; }; static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, - struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) + struct nfs_open_context *ctx, struct nfs4_lock_state *lsp, + gfp_t gfp_mask) { struct nfs4_lockdata *p; struct inode *inode = lsp->ls_state->inode; struct nfs_server *server = NFS_SERVER(inode); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), gfp_mask); if (p == NULL) return NULL; p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; - p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); + p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); if (p->arg.open_seqid == NULL) goto out_free; - p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); + p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); if (p->arg.lock_seqid == NULL) goto out_free_seqid; p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; - p->arg.lock_owner.id = lsp->ls_id.id; + p->arg.lock_owner.id = lsp->ls_seqid.owner_id; + p->arg.lock_owner.s_dev = server->s_dev; + p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; + p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); memcpy(&p->fl, fl, sizeof(p->fl)); @@ 
-3337,68 +5428,83 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) struct nfs4_lockdata *data = calldata; struct nfs4_state *state = data->lsp->ls_state; - dprintk("%s: begin!\n", __FUNCTION__); + dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) - return; + goto out_wait; /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { - if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) - return; - data->arg.open_stateid = &state->stateid; + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { + goto out_release_lock_seqid; + } + data->arg.open_stateid = &state->open_stateid; data->arg.new_lock_owner = 1; + data->res.open_seqid = data->arg.open_seqid; } else data->arg.new_lock_owner = 0; + if (!nfs4_valid_open_stateid(state)) { + data->rpc_status = -EBADF; + task->tk_action = NULL; + goto out_release_open_seqid; + } data->timestamp = jiffies; - rpc_call_start(task); - dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status); + if (nfs4_setup_sequence(data->server, + &data->arg.seq_args, + &data->res.seq_res, + task) == 0) + return; +out_release_open_seqid: + nfs_release_seqid(data->arg.open_seqid); +out_release_lock_seqid: + nfs_release_seqid(data->arg.lock_seqid); +out_wait: + nfs4_sequence_done(task, &data->res.seq_res); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; - dprintk("%s: begin!\n", __FUNCTION__); + dprintk("%s: begin!\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; data->rpc_status = task->tk_status; - if (RPC_ASSASSINATED(task)) - goto out; if (data->arg.new_lock_owner != 0) { - nfs_increment_open_seqid(data->rpc_status, data->arg.open_seqid); if (data->rpc_status == 0) nfs_confirm_seqid(&data->lsp->ls_seqid, 0); else goto out; } if (data->rpc_status == 
0) { - memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, - sizeof(data->lsp->ls_stateid.data)); - data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; - renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); + set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags); + renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); } - nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid); out: - dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status); + dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); } static void nfs4_lock_release(void *calldata) { struct nfs4_lockdata *data = calldata; - dprintk("%s: begin!\n", __FUNCTION__); + dprintk("%s: begin!\n", __func__); nfs_free_seqid(data->arg.open_seqid); if (data->cancelled != 0) { struct rpc_task *task; task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, data->arg.lock_seqid); if (!IS_ERR(task)) - rpc_put_task(task); - dprintk("%s: cancelling lock!\n", __FUNCTION__); + rpc_put_task_async(task); + dprintk("%s: cancelling lock!\n", __func__); } else nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); kfree(data); - dprintk("%s: done!\n", __FUNCTION__); + dprintk("%s: done!\n", __func__); } static const struct rpc_call_ops nfs4_lock_ops = { @@ -3407,7 +5513,24 @@ static const struct rpc_call_ops nfs4_lock_ops = { .rpc_release = nfs4_lock_release, }; -static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim) +static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) +{ + switch (error) { + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + if (new_lock_owner != 0 || + test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) + nfs4_schedule_stateid_recovery(server, lsp->ls_state); + break; + case 
-NFS4ERR_STALE_STATEID: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); + }; +} + +static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) { struct nfs4_lockdata *data; struct rpc_task *task; @@ -3419,48 +5542,58 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .rpc_client = NFS_CLIENT(state->inode), .rpc_message = &msg, .callback_ops = &nfs4_lock_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int ret; - dprintk("%s: begin!\n", __FUNCTION__); + dprintk("%s: begin!\n", __func__); data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), - fl->fl_u.nfs4_fl.owner); + fl->fl_u.nfs4_fl.owner, + recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS); if (data == NULL) return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - if (reclaim != 0) - data->arg.reclaim = 1; - msg.rpc_argp = &data->arg, - msg.rpc_resp = &data->res, + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; task_setup_data.callback_data = data; + if (recovery_type > NFS_LOCK_NEW) { + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + nfs4_set_sequence_privileged(&data->arg.seq_args); + } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); ret = nfs4_wait_for_completion_rpc_task(task); if (ret == 0) { ret = data->rpc_status; - if (ret == -NFS4ERR_DENIED) - ret = -EAGAIN; + if (ret) + nfs4_handle_setlk_error(data->server, data->lsp, + data->arg.new_lock_owner, ret); } else data->cancelled = 1; rpc_put_task(task); - dprintk("%s: done, ret = %d!\n", __FUNCTION__, ret); + dprintk("%s: done, ret = %d!\n", __func__, ret); return ret; } static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception 
exception = { }; + struct nfs4_exception exception = { + .inode = state->inode, + }; int err; do { /* Cache the lock if possible... */ if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; - err = _nfs4_do_setlk(state, F_SETLK, request, 1); + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); + trace_nfs4_lock_reclaim(request, state, F_SETLK, err); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -3471,29 +5604,98 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .inode = state->inode, + }; int err; err = nfs4_set_lock_state(state, request); if (err != 0) return err; + if (!recover_lost_locks) { + set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags); + return 0; + } do { if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; - err = _nfs4_do_setlk(state, F_SETLK, request, 0); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); + trace_nfs4_lock_expired(request, state, F_SETLK, err); + switch (err) { + default: + goto out; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + nfs4_handle_exception(server, err, &exception); + err = 0; + } } while (exception.retry); +out: return err; } +#if defined(CONFIG_NFS_V4_1) +/** + * nfs41_check_expired_locks - possibly free a lock stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. 
+ */ +static int nfs41_check_expired_locks(struct nfs4_state *state) +{ + int status, ret = -NFS4ERR_BAD_STATEID; + struct nfs4_lock_state *lsp; + struct nfs_server *server = NFS_SERVER(state->inode); + + list_for_each_entry(lsp, &state->lock_states, ls_locks) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + status = nfs41_test_stateid(server, + &lsp->ls_stateid, + cred); + trace_nfs4_test_lock_stateid(state, lsp, status); + if (status != NFS_OK) { + /* Free the stateid unless the server + * informs us the stateid is unrecognized. */ + if (status != -NFS4ERR_BAD_STATEID) + nfs41_free_stateid(server, + &lsp->ls_stateid, + cred); + clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + ret = status; + } + } + }; + + return ret; +} + +static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + int status = NFS_OK; + + if (test_bit(LK_STATE_IN_USE, &state->flags)) + status = nfs41_check_expired_locks(state); + if (status != NFS_OK) + status = nfs4_lock_expired(state, request); + return status; +} +#endif + static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs_client *clp = state->owner->so_client; + struct nfs4_state_owner *sp = state->owner; + struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; - int status; + unsigned int seq; + int status = -ENOLCK; + if ((fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + goto out; /* Is this a delegated open? */ status = nfs4_set_lock_state(state, request); if (status != 0) @@ -3502,29 +5704,31 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(request->fl_file, request); if (status < 0) goto out; - down_read(&clp->cl_sem); + down_read(&nfsi->rwsem); if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { - struct nfs_inode *nfsi = NFS_I(state->inode); /* Yes: cache locks! 
*/ - down_read(&nfsi->rwsem); /* ...but avoid races with delegation recall... */ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { - request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(request->fl_file, request); - up_read(&nfsi->rwsem); - goto out_unlock; - } - up_read(&nfsi->rwsem); + request->fl_flags = fl_flags & ~FL_SLEEP; + status = do_vfs_lock(request->fl_file, request); + goto out_unlock; } - status = _nfs4_do_setlk(state, cmd, request, 0); + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + up_read(&nfsi->rwsem); + status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); if (status != 0) + goto out; + down_read(&nfsi->rwsem); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + status = -NFS4ERR_DELAY; goto out_unlock; + } /* Note: we always want to sleep here! */ request->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(request->fl_file, request) < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); + printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock " + "manager!\n", __func__); out_unlock: - up_read(&clp->cl_sem); + up_read(&nfsi->rwsem); out: request->fl_flags = fl_flags; return status; @@ -3532,13 +5736,19 @@ out: static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .state = state, + .inode = state->inode, + }; int err; do { + err = _nfs4_proc_setlk(state, cmd, request); + trace_nfs4_set_lock(request, state, cmd, err); + if (err == -NFS4ERR_DENIED) + err = -EAGAIN; err = nfs4_handle_exception(NFS_SERVER(state->inode), - _nfs4_proc_setlk(state, cmd, request), - &exception); + err, &exception); } while (exception.retry); return err; } @@ -3558,14 +5768,36 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (request->fl_start < 0 || request->fl_end < 0) return -EINVAL; - if (IS_GETLK(cmd)) - return nfs4_proc_getlk(state, F_GETLK, request); 
+ if (IS_GETLK(cmd)) { + if (state != NULL) + return nfs4_proc_getlk(state, F_GETLK, request); + return 0; + } if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) return -EINVAL; - if (request->fl_type == F_UNLCK) - return nfs4_proc_unlck(state, cmd, request); + if (request->fl_type == F_UNLCK) { + if (state != NULL) + return nfs4_proc_unlck(state, cmd, request); + return 0; + } + + if (state == NULL) + return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } do { status = nfs4_proc_setlk(state, cmd, request); @@ -3579,73 +5811,200 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return status; } -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; int err; err = nfs4_set_lock_state(state, fl); if (err != 0) - goto out; - do { - err = _nfs4_do_setlk(state, F_SETLK, fl, 0); - if (err != -NFS4ERR_DELAY) - break; - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); -out: - return err; + return err; + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); } -#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" +struct nfs_release_lockowner_data { + struct nfs4_lock_state *lsp; + struct nfs_server *server; + struct nfs_release_lockowner_args args; + struct nfs_release_lockowner_res res; + unsigned long timestamp; +}; -int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, - size_t buflen, int flags) +static void nfs4_release_lockowner_prepare(struct 
rpc_task *task, void *calldata) { - struct inode *inode = dentry->d_inode; + struct nfs_release_lockowner_data *data = calldata; + nfs40_setup_sequence(data->server, + &data->args.seq_args, &data->res.seq_res, task); + data->timestamp = jiffies; +} - if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) - return -EOPNOTSUPP; +static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) +{ + struct nfs_release_lockowner_data *data = calldata; + struct nfs_server *server = data->server; + + nfs40_sequence_done(task, &data->res.seq_res); - return nfs4_proc_set_acl(inode, buf, buflen); + switch (task->tk_status) { + case 0: + renew_lease(server, data->timestamp); + break; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_LEASE_MOVED: + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } } -/* The getxattr man page suggests returning -ENODATA for unknown attributes, - * and that's what we'll do for e.g. user attributes that haven't been set. - * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported - * attributes in kernel-managed attribute namespaces. 
*/ -ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf, - size_t buflen) +static void nfs4_release_lockowner_release(void *calldata) { - struct inode *inode = dentry->d_inode; + struct nfs_release_lockowner_data *data = calldata; + nfs4_free_lock_state(data->server, data->lsp); + kfree(calldata); +} - if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) - return -EOPNOTSUPP; +static const struct rpc_call_ops nfs4_release_lockowner_ops = { + .rpc_call_prepare = nfs4_release_lockowner_prepare, + .rpc_call_done = nfs4_release_lockowner_done, + .rpc_release = nfs4_release_lockowner_release, +}; + +static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ + struct nfs_release_lockowner_data *data; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], + }; + + if (server->nfs_client->cl_mvops->minor_version != 0) + return -EINVAL; + + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return -ENOMEM; + data->lsp = lsp; + data->server = server; + data->args.lock_owner.clientid = server->nfs_client->cl_clientid; + data->args.lock_owner.id = lsp->ls_seqid.owner_id; + data->args.lock_owner.s_dev = server->s_dev; + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); + return 0; +} + +#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + +static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) +{ + if (strcmp(key, "") != 0) + return -EINVAL; - return nfs4_proc_get_acl(inode, buf, buflen); + return nfs4_proc_set_acl(dentry->d_inode, buf, buflen); } -ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) +static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) { - size_t len = 
strlen(XATTR_NAME_NFSV4_ACL) + 1; + if (strcmp(key, "") != 0) + return -EINVAL; + + return nfs4_proc_get_acl(dentry->d_inode, buf, buflen); +} + +static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) +{ + size_t len = sizeof(XATTR_NAME_NFSV4_ACL); if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) return 0; - if (buf && buflen < len) - return -ERANGE; - if (buf) - memcpy(buf, XATTR_NAME_NFSV4_ACL, len); + + if (list && len <= list_len) + memcpy(list, XATTR_NAME_NFSV4_ACL, len); return len; } -int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, - struct nfs4_fs_locations *fs_locations, struct page *page) +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline int nfs4_server_supports_labels(struct nfs_server *server) +{ + return server->caps & NFS_CAP_SECURITY_LABEL; +} + +static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) +{ + if (security_ismaclabel(key)) + return nfs4_set_security_label(dentry, buf, buflen); + + return -EOPNOTSUPP; +} + +static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) +{ + if (security_ismaclabel(key)) + return nfs4_get_security_label(dentry->d_inode, buf, buflen); + return -EOPNOTSUPP; +} + +static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) +{ + size_t len = 0; + + if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(dentry->d_inode, NULL, 0); + if (list && len <= list_len) + security_inode_listsecurity(dentry->d_inode, list, len); + } + return len; +} + +static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = nfs4_xattr_list_nfs4_label, + .get = nfs4_xattr_get_nfs4_label, + .set = 
nfs4_xattr_set_nfs4_label, +}; +#endif + + +/* + * nfs_fhget will use either the mounted_on_fileid or the fileid + */ +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) +{ + if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || + (fattr->valid & NFS_ATTR_FATTR_FILEID)) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS))) + return; + + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, + struct nfs4_fs_locations *fs_locations, + struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[2] = { + u32 bitmask[3] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - [1] = FATTR4_WORD1_MOUNTED_ON_FILEID, }; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), @@ -3653,39 +6012,2376 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, .page = page, .bitmask = bitmask, }; + struct nfs4_fs_locations_res res = { + .fs_locations = fs_locations, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], .rpc_argp = &args, - .rpc_resp = fs_locations, + .rpc_resp = &res, }; int status; - dprintk("%s: start\n", __FUNCTION__); + dprintk("%s: start\n", __func__); + + /* Ask for the fileid of the absent filesystem if mounted_on_fileid + * is not supported */ + if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + else + bitmask[0] |= FATTR4_WORD0_FILEID; + nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; - status = rpc_call_sync(server->client, &msg, 0); - dprintk("%s: returned status = %d\n", __FUNCTION__, status); + status = nfs4_call_sync(client, server, &msg, &args.seq_args, 
&res.seq_res, 0); + dprintk("%s: returned status = %d\n", __func__, status); + return status; +} + +int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, + struct nfs4_fs_locations *fs_locations, + struct page *page) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs4_proc_fs_locations(client, dir, name, + fs_locations, page); + trace_nfs4_get_fs_locations(dir, name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop returning + * NFS4ERR_LEASE_MOVED to this client. A RENEW operation is + * appended to this compound to identify the client ID which is + * performing recovery. + */ +static int _nfs40_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .clientid = server->nfs_client->cl_clientid, + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + .renew = 1, /* append RENEW */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status) + 
return status; + + renew_lease(server, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing migration recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID + * performing this operation is identified in the SEQUENCE + * operation in this compound. + * + * When the client supports GETATTR(fs_locations_info), it can + * be plumbed in here. + */ +static int _nfs41_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + }; + struct nfs4_fs_locations_arg args = { + .fh = NFS_FH(inode), + .page = page, + .bitmask = bitmask, + .migration = 1, /* skip LOOKUP */ + }; + struct nfs4_fs_locations_res res = { + .fs_locations = locations, + .migration = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + nfs_fattr_init(&locations->fattr); + locations->server = server; + locations->nlocations = 0; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_get_locations - discover locations for a migrated FSID + * @inode: inode on FSID that is migrating + * @locations: result of query + * @page: buffer + * @cred: credential to use for this operation + * + * Returns NFS4_OK on success, a negative NFS4ERR status code if the + * operation failed, or a negative errno if a local error 
occurred. + * + * On success, "locations" is filled in, but if the server has + * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not + * asserted. + * + * -NFS4ERR_LEASE_MOVED is returned if the server still has leases + * from this client that require migration recovery. + */ +int nfs4_proc_get_locations(struct inode *inode, + struct nfs4_fs_locations *locations, + struct page *page, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->get_locations(inode, locations, page, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop + * returning NFS4ERR_LEASE_MOVED to this client. A RENEW operation + * is appended to this compound to identify the client ID which is + * performing recovery. 
+ */ +static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + .clientid = clp->cl_clientid, + .renew = 1, /* append RENEW */ + }; + struct nfs4_fsid_present_res res = { + .renew = 1, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + unsigned long now = jiffies; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status) + return status; + + do_renew_lease(clp, now); + return 0; +} + +#ifdef CONFIG_NFS_V4_1 + +/* + * This operation also signals the server that this client is + * performing "lease moved" recovery. The server can stop asserting + * SEQ4_STATUS_LEASE_MOVED for this client. The client ID performing + * this operation is identified in the SEQUENCE operation in this + * compound. 
+ */ +static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_clnt *clnt = server->client; + struct nfs4_fsid_present_arg args = { + .fh = NFS_FH(inode), + }; + struct nfs4_fsid_present_res res = { + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + res.fh = nfs_alloc_fhandle(); + if (res.fh == NULL) + return -ENOMEM; + + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(clnt, server, &msg, + &args.seq_args, &res.seq_res); + nfs_free_fhandle(res.fh); + if (status == NFS4_OK && + res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) + status = -NFS4ERR_LEASE_MOVED; + return status; +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_proc_fsid_present - Is this FSID present or absent on server? + * @inode: inode on FSID to check + * @cred: credential to use for this operation + * + * Server indicates whether the FSID is present, moved, or not + * recognized. This operation is necessary to clear a LEASE_MOVED + * condition for this client ID. + * + * Returns NFS4_OK if the FSID is present on this server, + * -NFS4ERR_MOVED if the FSID is no longer present, a negative + * NFS4ERR code if some error occurred on the server, or a + * negative errno if a local failure occurred. 
+ */ +int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; + const struct nfs4_mig_recovery_ops *ops = + clp->cl_mvops->mig_recovery_ops; + struct nfs4_exception exception = { }; + int status; + + dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + nfs_display_fhandle(NFS_FH(inode), __func__); + + do { + status = ops->fsid_present(inode, cred); + if (status != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + return status; +} + +/** + * If 'use_integrity' is true and the state managment nfs_client + * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient + * and the machine credential as per RFC3530bis and RFC5661 Security + * Considerations sections. Otherwise, just use the user cred with the + * filesystem's rpc_client. 
+ */ +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) +{ + int status; + struct nfs4_secinfo_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct rpc_cred *cred = NULL; + + if (use_integrity) { + clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; + cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + msg.rpc_cred = cred; + } + + dprintk("NFS call secinfo %s\n", name->name); + + nfs4_state_protect(NFS_SERVER(dir)->nfs_client, + NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + + status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, + &res.seq_res, 0); + dprintk("NFS reply secinfo: %d\n", status); + + if (cred) + put_rpccred(cred); + + return status; +} + +int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, + struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = -NFS4ERR_WRONGSEC; + + /* try to use integrity protection with machine cred */ + if (_nfs4_is_integrity_protected(NFS_SERVER(dir)->nfs_client)) + err = _nfs4_proc_secinfo(dir, name, flavors, true); + + /* + * if unable to use integrity protection, or SECINFO with + * integrity protection returns NFS4ERR_WRONGSEC (which is + * disallowed by spec, but exists in deployed servers) use + * the current filesystem's rpc_client and the user cred. 
+ */ + if (err == -NFS4ERR_WRONGSEC) + err = _nfs4_proc_secinfo(dir, name, flavors, false); + + trace_nfs4_secinfo(dir, name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Check the exchange flags returned by the server for invalid flags, having + * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or + * DS flags set. + */ +static int nfs4_check_cl_exchange_flags(u32 flags) +{ + if (flags & ~EXCHGID4_FLAG_MASK_R) + goto out_inval; + if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && + (flags & EXCHGID4_FLAG_USE_NON_PNFS)) + goto out_inval; + if (!(flags & (EXCHGID4_FLAG_MASK_PNFS))) + goto out_inval; + return NFS_OK; +out_inval: + return -NFS4ERR_INVAL; +} + +static bool +nfs41_same_server_scope(struct nfs41_server_scope *a, + struct nfs41_server_scope *b) +{ + if (a->server_scope_sz == b->server_scope_sz && + memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) + return true; + + return false; +} + +/* + * nfs4_proc_bind_conn_to_session() + * + * The 4.1 client currently uses the same TCP connection for the + * fore and backchannel. 
+ */ +int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + struct nfs41_bind_conn_to_session_res res; + struct rpc_message msg = { + .rpc_proc = + &nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION], + .rpc_argp = clp, + .rpc_resp = &res, + .rpc_cred = cred, + }; + + dprintk("--> %s\n", __func__); + + res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (unlikely(res.session == NULL)) { + status = -ENOMEM; + goto out; + } + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_bind_conn_to_session(clp, status); + if (status == 0) { + if (memcmp(res.session->sess_id.data, + clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { + dprintk("NFS: %s: Session ID mismatch\n", __func__); + status = -EIO; + goto out_session; + } + if (res.dir != NFS4_CDFS4_BOTH) { + dprintk("NFS: %s: Unexpected direction from server\n", + __func__); + status = -EIO; + goto out_session; + } + if (res.use_conn_in_rdma_mode) { + dprintk("NFS: %s: Server returned RDMA mode = true\n", + __func__); + status = -EIO; + goto out_session; + } + } +out_session: + kfree(res.session); +out: + dprintk("<-- %s status= %d\n", __func__, status); + return status; +} + +/* + * Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map + * and operations we'd like to see to enable certain features in the allow map + */ +static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { + .how = SP4_MACH_CRED, + .enforce.u.words = { + [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32) + }, + .allow.u.words = { + [0] = 1 << (OP_CLOSE) | + 1 << (OP_LOCKU) | + 1 << (OP_COMMIT), + [1] = 1 << (OP_SECINFO - 32) | + 1 << (OP_SECINFO_NO_NAME - 32) | + 1 << (OP_TEST_STATEID - 32) | + 1 << (OP_FREE_STATEID - 32) | + 1 << (OP_WRITE - 32) + } +}; + +/* + * Select the state protection 
mode for client `clp' given the server results + * from exchange_id in `sp'. + * + * Returns 0 on success, negative errno otherwise. + */ +static int nfs4_sp4_select_mode(struct nfs_client *clp, + struct nfs41_state_protection *sp) +{ + static const u32 supported_enforce[NFS4_OP_MAP_NUM_WORDS] = { + [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32) + }; + unsigned int i; + + if (sp->how == SP4_MACH_CRED) { + /* Print state protect result */ + dfprintk(MOUNT, "Server SP4_MACH_CRED support:\n"); + for (i = 0; i <= LAST_NFS4_OP; i++) { + if (test_bit(i, sp->enforce.u.longs)) + dfprintk(MOUNT, " enforce op %d\n", i); + if (test_bit(i, sp->allow.u.longs)) + dfprintk(MOUNT, " allow op %d\n", i); + } + + /* make sure nothing is on enforce list that isn't supported */ + for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) { + if (sp->enforce.u.words[i] & ~supported_enforce[i]) { + dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); + return -EINVAL; + } + } + + /* + * Minimal mode - state operations are allowed to use machine + * credential. Note this already happens by default, so the + * client doesn't have to do anything more than the negotiation. + * + * NOTE: we don't care if EXCHANGE_ID is in the list - + * we're already using the machine cred for exchange_id + * and will never use a different cred. 
+ */ + if (test_bit(OP_BIND_CONN_TO_SESSION, sp->enforce.u.longs) && + test_bit(OP_CREATE_SESSION, sp->enforce.u.longs) && + test_bit(OP_DESTROY_SESSION, sp->enforce.u.longs) && + test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) { + dfprintk(MOUNT, "sp4_mach_cred:\n"); + dfprintk(MOUNT, " minimal mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags); + } else { + dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); + return -EINVAL; + } + + if (test_bit(OP_CLOSE, sp->allow.u.longs) && + test_bit(OP_LOCKU, sp->allow.u.longs)) { + dfprintk(MOUNT, " cleanup mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); + } + + if (test_bit(OP_SECINFO, sp->allow.u.longs) && + test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { + dfprintk(MOUNT, " secinfo mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags); + } + + if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) && + test_bit(OP_FREE_STATEID, sp->allow.u.longs)) { + dfprintk(MOUNT, " stateid mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags); + } + + if (test_bit(OP_WRITE, sp->allow.u.longs)) { + dfprintk(MOUNT, " write mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags); + } + + if (test_bit(OP_COMMIT, sp->allow.u.longs)) { + dfprintk(MOUNT, " commit mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags); + } + } + + return 0; +} + +/* + * _nfs4_proc_exchange_id() + * + * Wrapper for EXCHANGE_ID operation. 
+ */ +static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, + u32 sp4_how) +{ + nfs4_verifier verifier; + struct nfs41_exchange_id_args args = { + .verifier = &verifier, + .client = clp, +#ifdef CONFIG_NFS_V4_1_MIGRATION + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif + }; + struct nfs41_exchange_id_res res = { + 0 + }; + int status; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + + nfs4_init_boot_verifier(clp, &verifier); + args.id_len = nfs4_init_uniform_client_string(clp, args.id, + sizeof(args.id)); + dprintk("NFS call exchange_id auth=%s, '%.*s'\n", + clp->cl_rpcclient->cl_auth->au_ops->au_name, + args.id_len, args.id); + + res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), + GFP_NOFS); + if (unlikely(res.server_owner == NULL)) { + status = -ENOMEM; + goto out; + } + + res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), + GFP_NOFS); + if (unlikely(res.server_scope == NULL)) { + status = -ENOMEM; + goto out_server_owner; + } + + res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); + if (unlikely(res.impl_id == NULL)) { + status = -ENOMEM; + goto out_server_scope; + } + + switch (sp4_how) { + case SP4_NONE: + args.state_protect.how = SP4_NONE; + break; + + case SP4_MACH_CRED: + args.state_protect = nfs4_sp4_mach_cred_request; + break; + + default: + /* unsupported! 
*/ + WARN_ON_ONCE(1); + status = -EINVAL; + goto out_server_scope; + } + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_exchange_id(clp, status); + if (status == 0) + status = nfs4_check_cl_exchange_flags(res.flags); + + if (status == 0) + status = nfs4_sp4_select_mode(clp, &res.state_protect); + + if (status == 0) { + clp->cl_clientid = res.clientid; + clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); + if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) + clp->cl_seqid = res.seqid; + + kfree(clp->cl_serverowner); + clp->cl_serverowner = res.server_owner; + res.server_owner = NULL; + + /* use the most recent implementation id */ + kfree(clp->cl_implid); + clp->cl_implid = res.impl_id; + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->cl_serverscope); + clp->cl_serverscope = NULL; + } + + if (clp->cl_serverscope == NULL) { + clp->cl_serverscope = res.server_scope; + goto out; + } + } else + kfree(res.impl_id); + +out_server_owner: + kfree(res.server_owner); +out_server_scope: + kfree(res.server_scope); +out: + if (clp->cl_implid != NULL) + dprintk("NFS reply exchange_id: Server Implementation ID: " + "domain: %s, name: %s, date: %llu,%u\n", + clp->cl_implid->domain, clp->cl_implid->name, + clp->cl_implid->date.seconds, + clp->cl_implid->date.nseconds); + dprintk("NFS reply exchange_id: %d\n", status); + return status; +} + +/* + * nfs4_proc_exchange_id() + * + * Returns zero, a negative errno, or a negative NFS4ERR status code. + * + * Since the clientid has expired, all compounds using sessions + * associated with the stale clientid will be returning + * NFS4ERR_BADSESSION in the sequence operation, and will therefore + * be in some phase of session reset. 
+ * + * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used. + */ +int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +{ + rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor; + int status; + + /* try SP4_MACH_CRED if krb5i/p */ + if (authflavor == RPC_AUTH_GSS_KRB5I || + authflavor == RPC_AUTH_GSS_KRB5P) { + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); + if (!status) + return 0; + } + + /* try SP4_NONE */ + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); +} + +static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, + struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID], + .rpc_argp = clp, + .rpc_cred = cred, + }; + int status; + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_destroy_clientid(clp, status); + if (status) + dprintk("NFS: Got error %d from the server %s on " + "DESTROY_CLIENTID.", status, clp->cl_hostname); + return status; +} + +static int nfs4_proc_destroy_clientid(struct nfs_client *clp, + struct rpc_cred *cred) +{ + unsigned int loop; + int ret; + + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = _nfs4_proc_destroy_clientid(clp, cred); + switch (ret) { + case -NFS4ERR_DELAY: + case -NFS4ERR_CLIENTID_BUSY: + ssleep(1); + break; + default: + return ret; + } + } + return 0; +} + +int nfs4_destroy_clientid(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int ret = 0; + + if (clp->cl_mvops->minor_version < 1) + goto out; + if (clp->cl_exchange_flags == 0) + goto out; + if (clp->cl_preserve_clid) + goto out; + cred = nfs4_get_clid_cred(clp); + ret = nfs4_proc_destroy_clientid(clp, cred); + if (cred) + put_rpccred(cred); + switch (ret) { + case 0: + case -NFS4ERR_STALE_CLIENTID: + clp->cl_exchange_flags = 0; + } +out: + return ret; +} + +struct nfs4_get_lease_time_data { + struct nfs4_get_lease_time_args *args; + struct nfs4_get_lease_time_res *res; + 
struct nfs_client *clp; +}; + +static void nfs4_get_lease_time_prepare(struct rpc_task *task, + void *calldata) +{ + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + /* just setup sequence, do not trigger session recovery + since we're invoked within one */ + nfs41_setup_sequence(data->clp->cl_session, + &data->args->la_seq_args, + &data->res->lr_seq_res, + task); + dprintk("<-- %s\n", __func__); +} + +/* + * Called from nfs4_state_manager thread for session setup, so don't recover + * from sequence operation or clientid errors. + */ +static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_get_lease_time_data *data = + (struct nfs4_get_lease_time_data *)calldata; + + dprintk("--> %s\n", __func__); + if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + return; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); + rpc_delay(task, NFS4_POLL_RETRY_MIN); + task->tk_status = 0; + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: + rpc_restart_call_prepare(task); + return; + } + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_get_lease_time_ops = { + .rpc_call_prepare = nfs4_get_lease_time_prepare, + .rpc_call_done = nfs4_get_lease_time_done, +}; + +int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) +{ + struct rpc_task *task; + struct nfs4_get_lease_time_args args; + struct nfs4_get_lease_time_res res = { + .lr_fsinfo = fsinfo, + }; + struct nfs4_get_lease_time_data data = { + .args = &args, + .res = &res, + .clp = clp, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GET_LEASE_TIME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_get_lease_time_ops, + 
.callback_data = &data, + .flags = RPC_TASK_TIMEOUT, + }; + int status; + + nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); + nfs4_set_sequence_privileged(&args.la_seq_args); + dprintk("--> %s\n", __func__); + task = rpc_run_task(&task_setup); + + if (IS_ERR(task)) + status = PTR_ERR(task); + else { + status = task->tk_status; + rpc_put_task(task); + } + dprintk("<-- %s return %d\n", __func__, status); + + return status; +} + +/* + * Initialize the values to be used by the client in CREATE_SESSION + * If nfs4_init_session set the fore channel request and response sizes, + * use them. + * + * Set the back channel max_resp_sz_cached to zero to force the client to + * always set csa_cachethis to FALSE because the current implementation + * of the back channel DRC only supports caching the CB_SEQUENCE operation. + */ +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +{ + unsigned int max_rqst_sz, max_resp_sz; + + max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; + max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; + + /* Fore channel attributes */ + args->fc_attrs.max_rqst_sz = max_rqst_sz; + args->fc_attrs.max_resp_sz = max_resp_sz; + args->fc_attrs.max_ops = NFS4_MAX_OPS; + args->fc_attrs.max_reqs = max_session_slots; + + dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_ops=%u max_reqs=%u\n", + __func__, + args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, + args->fc_attrs.max_ops, args->fc_attrs.max_reqs); + + /* Back channel attributes */ + args->bc_attrs.max_rqst_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_resp_sz_cached = 0; + args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; + args->bc_attrs.max_reqs = 1; + + dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " + "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", + __func__, + args->bc_attrs.max_rqst_sz, args->bc_attrs.max_resp_sz, + args->bc_attrs.max_resp_sz_cached, 
args->bc_attrs.max_ops, + args->bc_attrs.max_reqs); +} + +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->fc_attrs; + struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + + if (rcvd->max_resp_sz > sent->max_resp_sz) + return -EINVAL; + /* + * Our requested max_ops is the minimum we need; we're not + * prepared to break up compounds into smaller pieces than that. + * So, no point even trying to continue if the server won't + * cooperate: + */ + if (rcvd->max_ops < sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs == 0) + return -EINVAL; + if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE) + rcvd->max_reqs = NFS4_MAX_SLOT_TABLE; + return 0; +} + +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +{ + struct nfs4_channel_attrs *sent = &args->bc_attrs; + struct nfs4_channel_attrs *rcvd = &session->bc_attrs; + + if (rcvd->max_rqst_sz > sent->max_rqst_sz) + return -EINVAL; + if (rcvd->max_resp_sz < sent->max_resp_sz) + return -EINVAL; + if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) + return -EINVAL; + /* These would render the backchannel useless: */ + if (rcvd->max_ops != sent->max_ops) + return -EINVAL; + if (rcvd->max_reqs != sent->max_reqs) + return -EINVAL; + return 0; +} + +static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, + struct nfs4_session *session) +{ + int ret; + + ret = nfs4_verify_fore_channel_attrs(args, session); + if (ret) + return ret; + return nfs4_verify_back_channel_attrs(args, session); +} + +static int _nfs4_proc_create_session(struct nfs_client *clp, + struct rpc_cred *cred) +{ + struct nfs4_session *session = clp->cl_session; + struct nfs41_create_session_args args = { + .client = clp, + .cb_program = NFS4_CALLBACK, + }; + struct nfs41_create_session_res res = { + .client = clp, + }; + struct rpc_message msg = { + .rpc_proc = 
&nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + nfs4_init_channel_attrs(&args); + args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); + + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_create_session(clp, status); + + if (!status) { + /* Verify the session's negotiated channel_attrs values */ + status = nfs4_verify_channel_attrs(&args, session); + /* Increment the clientid slot sequence id */ + clp->cl_seqid++; + } + + return status; +} + +/* + * Issues a CREATE_SESSION operation to the server. + * It is the responsibility of the caller to verify the session is + * expired before calling this routine. + */ +int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + unsigned *ptr; + struct nfs4_session *session = clp->cl_session; + + dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); + + status = _nfs4_proc_create_session(clp, cred); + if (status) + goto out; + + /* Init or reset the session slot tables */ + status = nfs4_setup_session_slot_tables(session); + dprintk("slot table setup returned %d\n", status); + if (status) + goto out; + + ptr = (unsigned *)&session->sess_id.data[0]; + dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, + clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); +out: + dprintk("<-- %s\n", __func__); + return status; +} + +/* + * Issue the over-the-wire RPC DESTROY_SESSION. + * The caller must serialize access to this routine. 
+ */ +int nfs4_proc_destroy_session(struct nfs4_session *session, + struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION], + .rpc_argp = session, + .rpc_cred = cred, + }; + int status = 0; + + dprintk("--> nfs4_proc_destroy_session\n"); + + /* session is still being setup */ + if (session->clp->cl_cons_state != NFS_CS_READY) + return status; + + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + trace_nfs4_destroy_session(session->clp, status); + + if (status) + dprintk("NFS: Got error %d from the server on DESTROY_SESSION. " + "Session has been destroyed regardless...\n", status); + + dprintk("<-- nfs4_proc_destroy_session\n"); return status; } -struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { +/* + * Renew the cl_session lease. + */ +struct nfs4_sequence_data { + struct nfs_client *clp; + struct nfs4_sequence_args args; + struct nfs4_sequence_res res; +}; + +static void nfs41_sequence_release(void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); + kfree(calldata); +} + +static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_lease_recovery(clp); + } + return 0; +} + +static void nfs41_sequence_call_done(struct rpc_task *task, void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + + if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) + return; + + trace_nfs4_sequence(clp, task->tk_status); + if (task->tk_status < 0) { + dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; + + if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { + 
rpc_restart_call_prepare(task); + return; + } + } + dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); +out: + dprintk("<-- %s\n", __func__); +} + +static void nfs41_sequence_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_args *args; + struct nfs4_sequence_res *res; + + args = task->tk_msg.rpc_argp; + res = task->tk_msg.rpc_resp; + + nfs41_setup_sequence(clp->cl_session, args, res, task); +} + +static const struct rpc_call_ops nfs41_sequence_ops = { + .rpc_call_done = nfs41_sequence_call_done, + .rpc_call_prepare = nfs41_sequence_prepare, + .rpc_release = nfs41_sequence_release, +}; + +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + struct rpc_cred *cred, + bool is_privileged) +{ + struct nfs4_sequence_data *calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs41_sequence_ops, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + }; + + if (!atomic_inc_not_zero(&clp->cl_count)) + return ERR_PTR(-EIO); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) { + nfs_put_client(clp); + return ERR_PTR(-ENOMEM); + } + nfs4_init_sequence(&calldata->args, &calldata->res, 0); + if (is_privileged) + nfs4_set_sequence_privileged(&calldata->args); + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + calldata->clp = clp; + task_setup_data.callback_data = calldata; + + return rpc_run_task(&task_setup_data); +} + +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) +{ + struct rpc_task *task; + int ret = 0; + + if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) + return 0; + task = _nfs41_proc_sequence(clp, cred, false); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else + 
rpc_put_task_async(task); + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_proc_sequence(clp, cred, true); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + ret = rpc_wait_for_completion_task(task); + if (!ret) { + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + + if (task->tk_status == 0) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + ret = task->tk_status; + } + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +struct nfs4_reclaim_complete_data { + struct nfs_client *clp; + struct nfs41_reclaim_complete_args arg; + struct nfs41_reclaim_complete_res res; +}; + +static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, + &calldata->res.seq_res, + task); +} + +static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ + break; + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + /* fall through */ + case -NFS4ERR_RETRY_UNCACHED_REP: + return -EAGAIN; + default: + nfs4_schedule_lease_recovery(clp); + } + return 0; +} + +static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + struct nfs_client *clp = calldata->clp; + struct nfs4_sequence_res *res = &calldata->res.seq_res; + + dprintk("--> %s\n", __func__); + if (!nfs41_sequence_done(task, res)) + return; + + trace_nfs4_reclaim_complete(clp, task->tk_status); + if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_free_reclaim_complete_data(void *data) +{ + struct nfs4_reclaim_complete_data *calldata = data; + + kfree(calldata); +} + +static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { + .rpc_call_prepare = nfs4_reclaim_complete_prepare, + .rpc_call_done = nfs4_reclaim_complete_done, + .rpc_release = nfs4_free_reclaim_complete_data, +}; + +/* + * Issue a global reclaim complete. 
+ */ +static int nfs41_proc_reclaim_complete(struct nfs_client *clp, + struct rpc_cred *cred) +{ + struct nfs4_reclaim_complete_data *calldata; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs4_reclaim_complete_call_ops, + .flags = RPC_TASK_ASYNC, + }; + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) + goto out; + calldata->clp = clp; + calldata->arg.one_fs = 0; + + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); + nfs4_set_sequence_privileged(&calldata->arg.seq_args); + msg.rpc_argp = &calldata->arg; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out; + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; + rpc_put_task(task); + return 0; +out: + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +static void +nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct nfs4_session *session = nfs4_get_session(server); + + dprintk("--> %s\n", __func__); + /* Note the is a race here, where a CB_LAYOUTRECALL can come in + * right now covering the LAYOUTGET we are about to send. + * However, that is not so catastrophic, and there seems + * to be no way to prevent it completely. 
+ */ + if (nfs41_setup_sequence(session, &lgp->args.seq_args, + &lgp->res.seq_res, task)) + return; + if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + NFS_I(lgp->args.inode)->layout, + lgp->args.ctx->state)) { + rpc_exit(task, NFS4_OK); + } +} + +static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; + struct nfs4_state *state = NULL; + unsigned long timeo, now, giveup; + + dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); + + if (!nfs41_sequence_done(task, &lgp->res.seq_res)) + goto out; + + switch (task->tk_status) { + case 0: + goto out; + /* + * NFS4ERR_LAYOUTTRYLATER is a conflict with another client + * (or clients) writing to the same RAID stripe + */ + case -NFS4ERR_LAYOUTTRYLATER: + /* + * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall + * existing layout before getting a new one). + */ + case -NFS4ERR_RECALLCONFLICT: + timeo = rpc_get_timeout(task->tk_client); + giveup = lgp->args.timestamp + timeo; + now = jiffies; + if (time_after(giveup, now)) { + unsigned long delay; + + /* Delay for: + * - Not less then NFS4_POLL_RETRY_MIN. + * - One last time a jiffie before we give up + * - exponential backoff (time_now minus start_attempt) + */ + delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, + min((giveup - now - 1), + now - lgp->args.timestamp)); + + dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", + __func__, delay); + rpc_delay(task, delay); + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out; /* Do not call nfs4_async_handle_error() */ + } + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (!lo || list_empty(&lo->plh_segs)) { + spin_unlock(&inode->i_lock); + /* If the open stateid was bad, then recover it. 
*/ + state = lgp->args.ctx->state; + } else { + LIST_HEAD(head); + + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + spin_unlock(&inode->i_lock); + /* Mark the bad layout state as invalid, then + * retry using the open stateid. */ + pnfs_free_lseg_list(&head); + } + } + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + rpc_restart_call_prepare(task); +out: + dprintk("<-- %s\n", __func__); +} + +static size_t max_response_pages(struct nfs_server *server) +{ + u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + return nfs_page_array_len(0, max_resp_sz); +} + +static void nfs4_free_pages(struct page **pages, size_t size) +{ + int i; + + if (!pages) + return; + + for (i = 0; i < size; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); +} + +static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) +{ + struct page **pages; + int i; + + pages = kcalloc(size, sizeof(struct page *), gfp_flags); + if (!pages) { + dprintk("%s: can't alloc array of %zu pages\n", __func__, size); + return NULL; + } + + for (i = 0; i < size; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) { + dprintk("%s: failed to allocate page\n", __func__); + nfs4_free_pages(pages, size); + return NULL; + } + } + + return pages; +} + +static void nfs4_layoutget_release(void *calldata) +{ + struct nfs4_layoutget *lgp = calldata; + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); + size_t max_pages = max_response_pages(server); + + dprintk("--> %s\n", __func__); + nfs4_free_pages(lgp->args.layout.pages, max_pages); + pnfs_put_layout_hdr(NFS_I(inode)->layout); + put_nfs_open_context(lgp->args.ctx); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutget_call_ops = { + .rpc_call_prepare = nfs4_layoutget_prepare, + .rpc_call_done = nfs4_layoutget_done, + .rpc_release = nfs4_layoutget_release, +}; + +struct pnfs_layout_segment * 
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +{ + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); + size_t max_pages = max_response_pages(server); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], + .rpc_argp = &lgp->args, + .rpc_resp = &lgp->res, + .rpc_cred = lgp->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutget_call_ops, + .callback_data = lgp, + .flags = RPC_TASK_ASYNC, + }; + struct pnfs_layout_segment *lseg = NULL; + int status = 0; + + dprintk("--> %s\n", __func__); + + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); + if (!lgp->args.layout.pages) { + nfs4_layoutget_release(lgp); + return ERR_PTR(-ENOMEM); + } + lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->args.timestamp = jiffies; + + lgp->res.layoutp = &lgp->args.layout; + lgp->res.seq_res.sr_slot = NULL; + nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + + /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ + pnfs_get_layout_hdr(NFS_I(inode)->layout); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return ERR_CAST(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; + trace_nfs4_layoutget(lgp->args.ctx, + &lgp->args.range, + &lgp->res.range, + status); + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (status == 0 && lgp->res.layoutp->len) + lseg = pnfs_layout_process(lgp); + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); + if (status) + return ERR_PTR(status); + return lseg; +} + +static void +nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + + dprintk("--> %s\n", __func__); + nfs41_setup_sequence(lrp->clp->cl_session, + &lrp->args.seq_args, + &lrp->res.seq_res, + 
task); +} + +static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + struct nfs_server *server; + + dprintk("--> %s\n", __func__); + + if (!nfs41_sequence_done(task, &lrp->res.seq_res)) + return; + + server = NFS_SERVER(lrp->args.inode); + switch (task->tk_status) { + default: + task->tk_status = 0; + case 0: + break; + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) + break; + rpc_restart_call_prepare(task); + return; + } + dprintk("<-- %s\n", __func__); +} + +static void nfs4_layoutreturn_release(void *calldata) +{ + struct nfs4_layoutreturn *lrp = calldata; + struct pnfs_layout_hdr *lo = lrp->args.layout; + + dprintk("--> %s\n", __func__); + spin_lock(&lo->plh_inode->i_lock); + if (lrp->res.lrs_present) + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); + pnfs_put_layout_hdr(lrp->args.layout); + kfree(calldata); + dprintk("<-- %s\n", __func__); +} + +static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { + .rpc_call_prepare = nfs4_layoutreturn_prepare, + .rpc_call_done = nfs4_layoutreturn_done, + .rpc_release = nfs4_layoutreturn_release, +}; + +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], + .rpc_argp = &lrp->args, + .rpc_resp = &lrp->res, + .rpc_cred = lrp->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = NFS_SERVER(lrp->args.inode)->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutreturn_call_ops, + .callback_data = lrp, + }; + int status; + + dprintk("--> %s\n", __func__); + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + trace_nfs4_layoutreturn(lrp->args.inode, status); + dprintk("<-- %s 
status=%d\n", __func__, status); + rpc_put_task(task); + return status; +} + +/* + * Retrieve the list of Data Server devices from the MDS. + */ +static int _nfs4_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist) +{ + struct nfs4_getdevicelist_args args = { + .fh = fh, + .layoutclass = server->pnfs_curr_ld->id, + }; + struct nfs4_getdevicelist_res res = { + .devlist = devlist, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, + &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +int nfs4_proc_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_getdevicelist(server, fh, devlist), + &exception); + } while (exception.retry); + + dprintk("%s: err=%d, num_devs=%u\n", __func__, + err, devlist->num_devs); + + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); + +static int +_nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *pdev, + struct rpc_cred *cred) +{ + struct nfs4_getdeviceinfo_args args = { + .pdev = pdev, + }; + struct nfs4_getdeviceinfo_res res = { + .pdev = pdev, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + return status; +} + +int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *pdev, + struct rpc_cred *cred) +{ + struct nfs4_exception 
exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_proc_getdeviceinfo(server, pdev, cred), + &exception); + } while (exception.retry); + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); + +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + struct nfs4_session *session = nfs4_get_session(server); + + nfs41_setup_sequence(session, + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (!nfs41_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { /* Just ignore these failures */ + case -NFS4ERR_DELEG_REVOKED: /* layout was recalled */ + case -NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ + case -NFS4ERR_BADLAYOUT: /* no layout */ + case -NFS4ERR_GRACE: /* loca_recalim always false */ + task->tk_status = 0; + case 0: + break; + default: + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } + } +} + +static void nfs4_layoutcommit_release(void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + + pnfs_cleanup_layoutcommit(data); + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); + put_rpccred(data->cred); + kfree(data); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { + .rpc_call_prepare = nfs4_layoutcommit_prepare, + .rpc_call_done = nfs4_layoutcommit_done, + .rpc_release = nfs4_layoutcommit_release, +}; + +int +nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = 
data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(data->args.inode), + .rpc_message = &msg, + .callback_ops = &nfs4_layoutcommit_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: %4d initiating layoutcommit call. sync %d " + "lbw: %llu inode %lu\n", + data->task.tk_pid, sync, + data->args.lastbytewritten, + data->args.inode->i_ino); + + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (sync == false) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = task->tk_status; + trace_nfs4_layoutcommit(data->args.inode, status); +out: + dprintk("%s: status %d\n", __func__, status); + rpc_put_task(task); + return status; +} + +/** + * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if + * possible) as per RFC3530bis and RFC5661 Security Considerations sections + */ +static int +_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, + struct nfs4_secinfo_flavors *flavors, bool use_integrity) +{ + struct nfs41_secinfo_no_name_args args = { + .style = SECINFO_STYLE_CURRENT_FH, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct rpc_clnt *clnt = server->client; + struct rpc_cred *cred = NULL; + int status; + + if (use_integrity) { + clnt = server->nfs_client->cl_rpcclient; + cred = nfs4_get_clid_cred(server->nfs_client); + msg.rpc_cred = cred; + } + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + + if (cred) + put_rpccred(cred); + + return 
status; +} + +static int +nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + /* first try using integrity protection */ + err = -NFS4ERR_WRONGSEC; + + /* try to use integrity protection with machine cred */ + if (_nfs4_is_integrity_protected(server->nfs_client)) + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + flavors, true); + + /* + * if unable to use integrity protection, or SECINFO with + * integrity protection returns NFS4ERR_WRONGSEC (which is + * disallowed by spec, but exists in deployed servers) use + * the current filesystem's rpc_client and the user cred. + */ + if (err == -NFS4ERR_WRONGSEC) + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + flavors, false); + + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + case -ENOTSUPP: + goto out; + default: + err = nfs4_handle_exception(server, err, &exception); + } + } while (exception.retry); +out: + return err; +} + +static int +nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int err; + struct page *page; + rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; + struct nfs4_secinfo_flavors *flavors; + struct nfs4_secinfo4 *secinfo; + int i; + + page = alloc_page(GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + flavors = page_address(page); + err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + + /* + * Fall back on "guess and check" method if + * the server doesn't support SECINFO_NO_NAME + */ + if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { + err = nfs4_find_root_sec(server, fhandle, info); + goto out_freepage; + } + if (err) + goto out_freepage; + + for (i = 0; i < flavors->num_flavors; i++) { + secinfo = &flavors->flavors[i]; + + switch (secinfo->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + case RPC_AUTH_GSS: + flavor = 
rpcauth_get_pseudoflavor(secinfo->flavor, + &secinfo->flavor_info); + break; + default: + flavor = RPC_AUTH_MAXFLAVOR; + break; + } + + if (!nfs_auth_info_match(&server->auth_info, flavor)) + flavor = RPC_AUTH_MAXFLAVOR; + + if (flavor != RPC_AUTH_MAXFLAVOR) { + err = nfs4_lookup_root_sec(server, fhandle, + info, flavor); + if (!err) + break; + } + } + + if (flavor == RPC_AUTH_MAXFLAVOR) + err = -EPERM; + +out_freepage: + put_page(page); + if (err == -EACCES) + return -EPERM; +out: + return err; +} + +static int _nfs41_test_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + int status; + struct nfs41_test_stateid_args args = { + .stateid = stateid, + }; + struct nfs41_test_stateid_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], + .rpc_argp = &args, + .rpc_resp = &res, + .rpc_cred = cred, + }; + struct rpc_clnt *rpc_client = server->client; + + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, + &rpc_client, &msg); + + dprintk("NFS call test_stateid %p\n", stateid); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); + nfs4_set_sequence_privileged(&args.seq_args); + status = nfs4_call_sync_sequence(rpc_client, server, &msg, + &args.seq_args, &res.seq_res); + if (status != NFS_OK) { + dprintk("NFS reply test_stateid: failed, %d\n", status); + return status; + } + dprintk("NFS reply test_stateid: succeeded, %d\n", -res.status); + return -res.status; +} + +/** + * nfs41_test_stateid - perform a TEST_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to test + * @cred: credential + * + * Returns NFS_OK if the server recognizes that "stateid" is valid. + * Otherwise a negative NFS4ERR value is returned if the operation + * failed or the state ID is not currently valid. 
+ */ +static int nfs41_test_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs41_test_stateid(server, stateid, cred); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + +struct nfs_free_stateid_data { + struct nfs_server *server; + struct nfs41_free_stateid_args args; + struct nfs41_free_stateid_res res; +}; + +static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + nfs41_setup_sequence(nfs4_get_session(data->server), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + + nfs41_sequence_done(task, &data->res.seq_res); + + switch (task->tk_status) { + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } +} + +static void nfs41_free_stateid_release(void *calldata) +{ + kfree(calldata); +} + +static const struct rpc_call_ops nfs41_free_stateid_ops = { + .rpc_call_prepare = nfs41_free_stateid_prepare, + .rpc_call_done = nfs41_free_stateid_done, + .rpc_release = nfs41_free_stateid_release, +}; + +static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred, + bool privileged) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], + .rpc_cred = cred, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs41_free_stateid_ops, + .flags = RPC_TASK_ASYNC, + }; + struct nfs_free_stateid_data *data; + + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, + &task_setup.rpc_client, &msg); + + dprintk("NFS call free_stateid %p\n", 
stateid); + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + data->server = server; + nfs4_stateid_copy(&data->args.stateid, stateid); + + task_setup.callback_data = data; + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + if (privileged) + nfs4_set_sequence_privileged(&data->args.seq_args); + + return rpc_run_task(&task_setup); +} + +/** + * nfs41_free_stateid - perform a FREE_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to release + * @cred: credential + * + * Returns NFS_OK if the server freed "stateid". Otherwise a + * negative NFS4ERR value is returned. + */ +static int nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_free_stateid(server, stateid, cred, true); + if (IS_ERR(task)) + return PTR_ERR(task); + ret = rpc_wait_for_completion_task(task); + if (!ret) + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + +static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ + struct rpc_task *task; + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); + nfs4_free_lock_state(server, lsp); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} + +static bool nfs41_match_stateid(const nfs4_stateid *s1, + const nfs4_stateid *s2) +{ + if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) + return false; + + if (s1->seqid == s2->seqid) + return true; + if (s1->seqid == 0 || s2->seqid == 0) + return true; + + return false; +} + +#endif /* CONFIG_NFS_V4_1 */ + +static bool nfs4_match_stateid(const nfs4_stateid *s1, + const nfs4_stateid *s2) +{ + return nfs4_stateid_match(s1, s2); +} + + +static const struct nfs4_state_recovery_ops 
nfs40_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, + .establish_clid = nfs4_init_clientid, + .detect_trunking = nfs40_discover_server_trunking, }; -struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = { +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, + .recover_open = nfs4_open_reclaim, + .recover_lock = nfs4_lock_reclaim, + .establish_clid = nfs41_init_clientid, + .reclaim_complete = nfs41_proc_reclaim_complete, + .detect_trunking = nfs41_discover_server_trunking, +}; +#endif /* CONFIG_NFS_V4_1 */ + +static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, .recover_open = nfs4_open_expired, .recover_lock = nfs4_lock_expired, + .establish_clid = nfs4_init_clientid, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, + .recover_open = nfs41_open_expired, + .recover_lock = nfs41_lock_expired, + .establish_clid = nfs41_init_clientid, +}; +#endif /* CONFIG_NFS_V4_1 */ + +static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { + .sched_state_renewal = nfs4_proc_async_renew, + .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked, + .renew_lease = nfs4_proc_renew, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { + .sched_state_renewal = nfs41_proc_async_sequence, + .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked, + .renew_lease = nfs4_proc_sequence, +}; +#endif + +static const struct nfs4_mig_recovery_ops 
nfs40_mig_recovery_ops = { + .get_locations = _nfs40_proc_get_locations, + .fsid_present = _nfs40_proc_fsid_present, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = { + .get_locations = _nfs41_proc_get_locations, + .fsid_present = _nfs41_proc_fsid_present, +}; +#endif /* CONFIG_NFS_V4_1 */ + +static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { + .minor_version = 0, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK, + .init_client = nfs40_init_client, + .shutdown_client = nfs40_shutdown_client, + .match_stateid = nfs4_match_stateid, + .find_root_sec = nfs4_find_root_sec, + .free_lock_state = nfs4_release_lockowner, + .call_sync_ops = &nfs40_call_sync_ops, + .reboot_recovery_ops = &nfs40_reboot_recovery_ops, + .nograce_recovery_ops = &nfs40_nograce_recovery_ops, + .state_renewal_ops = &nfs40_state_renewal_ops, + .mig_recovery_ops = &nfs40_mig_recovery_ops, +}; + +#if defined(CONFIG_NFS_V4_1) +static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { + .minor_version = 1, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1, + .init_client = nfs41_init_client, + .shutdown_client = nfs41_shutdown_client, + .match_stateid = nfs41_match_stateid, + .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, + .call_sync_ops = &nfs41_call_sync_ops, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, + .mig_recovery_ops = &nfs41_mig_recovery_ops, +}; +#endif + +#if defined(CONFIG_NFS_V4_2) +static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { + .minor_version = 2, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | 
NFS_CAP_ATOMIC_OPEN_V1, + .init_client = nfs41_init_client, + .shutdown_client = nfs41_shutdown_client, + .match_stateid = nfs41_match_stateid, + .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, + .call_sync_ops = &nfs41_call_sync_ops, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, +}; +#endif + +const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { + [0] = &nfs_v4_0_minor_ops, +#if defined(CONFIG_NFS_V4_1) + [1] = &nfs_v4_1_minor_ops, +#endif +#if defined(CONFIG_NFS_V4_2) + [2] = &nfs_v4_2_minor_ops, +#endif +}; + +static const struct inode_operations nfs4_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .atomic_open = nfs_atomic_open, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, }; static const struct inode_operations nfs4_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .getxattr = nfs4_getxattr, - .setxattr = nfs4_setxattr, - .listxattr = nfs4_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, }; const struct nfs_rpc_ops nfs_v4_clientops = { @@ -3693,18 +8389,23 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .dentry_ops = &nfs4_dentry_operations, .dir_inode_ops = &nfs4_dir_inode_operations, .file_inode_ops = &nfs4_file_inode_operations, + .file_ops = &nfs4_file_operations, .getroot = nfs4_proc_get_root, + .submount = nfs4_submount, + .try_mount = nfs4_try_mount, .getattr = nfs4_proc_getattr, .setattr = 
nfs4_proc_setattr, - .lookupfh = nfs4_proc_lookupfh, .lookup = nfs4_proc_lookup, .access = nfs4_proc_access, .readlink = nfs4_proc_readlink, .create = nfs4_proc_create, .remove = nfs4_proc_remove, .unlink_setup = nfs4_proc_unlink_setup, + .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, .unlink_done = nfs4_proc_unlink_done, - .rename = nfs4_proc_rename, + .rename_setup = nfs4_proc_rename_setup, + .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, + .rename_done = nfs4_proc_rename_done, .link = nfs4_proc_link, .symlink = nfs4_proc_symlink, .mkdir = nfs4_proc_mkdir, @@ -3716,16 +8417,40 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .pathconf = nfs4_proc_pathconf, .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, + .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare, .read_setup = nfs4_proc_read_setup, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, + .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare, .commit_done = nfs4_commit_done, - .file_open = nfs_open, - .file_release = nfs_release, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, + .close_context = nfs4_close_context, + .open_context = nfs4_atomic_open, + .have_delegation = nfs4_have_delegation, + .return_delegation = nfs4_inode_return_delegation, + .alloc_client = nfs4_alloc_client, + .init_client = nfs4_init_client, + .free_client = nfs4_free_client, + .create_server = nfs4_create_server, + .clone_server = nfs_clone_server, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { + .prefix = XATTR_NAME_NFSV4_ACL, + .list = nfs4_xattr_list_nfs4_acl, + .get = nfs4_xattr_get_nfs4_acl, + .set = nfs4_xattr_set_nfs4_acl, +}; + +const struct xattr_handler *nfs4_xattr_handlers[] = { + &nfs4_xattr_nfs4_acl_handler, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + &nfs4_xattr_nfs4_label_handler, +#endif + NULL }; /* diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 
5e2e4af1a0e..1720d32ffa5 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -36,11 +36,6 @@ * as an rpc_task, not a real kernel thread, so it always runs in rpciod's * context. There is one renewd per nfs_server. * - * TODO: If the send queue gets backlogged (e.g., if the server goes down), - * we will keep filling the queue with periodic RENEW requests. We need a - * mechanism for ensuring that if renewd successfully sends off a request, - * then it only wakes up when the request is finished. Maybe use the - * child task framework of the RPC layer? */ #include <linux/mm.h> @@ -54,58 +49,62 @@ #include "nfs4_fs.h" #include "delegation.h" -#define NFSDBG_FACILITY NFSDBG_PROC +#define NFSDBG_FACILITY NFSDBG_STATE void nfs4_renew_state(struct work_struct *work) { + const struct nfs4_state_maintenance_ops *ops; struct nfs_client *clp = container_of(work, struct nfs_client, cl_renewd.work); struct rpc_cred *cred; - long lease, timeout; + long lease; unsigned long last, now; + unsigned renew_flags = 0; - down_read(&clp->cl_sem); - dprintk("%s: start\n", __FUNCTION__); - /* Are there any active superblocks? */ - if (list_empty(&clp->cl_superblocks)) + ops = clp->cl_mvops->state_renewal_ops; + dprintk("%s: start\n", __func__); + + if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state)) goto out; + spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; last = clp->cl_last_renewal; now = jiffies; - timeout = (2 * lease) / 3 + (long)last - (long)now; /* Are we close to a lease timeout? 
*/ - if (time_after(now, last + lease/3)) { - cred = nfs4_get_renew_cred(clp); + if (time_after(now, last + lease/3)) + renew_flags |= NFS4_RENEW_TIMEOUT; + if (nfs_delegations_present(clp)) + renew_flags |= NFS4_RENEW_DELEGATION_CB; + + if (renew_flags != 0) { + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); if (cred == NULL) { - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - spin_unlock(&clp->cl_lock); + if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + goto out; + } nfs_expire_all_delegations(clp); - goto out; + } else { + /* Queue an asynchronous RENEW. */ + ops->sched_state_renewal(clp, cred, renew_flags); + put_rpccred(cred); + goto out_exp; } - spin_unlock(&clp->cl_lock); - /* Queue an asynchronous RENEW. */ - nfs4_proc_async_renew(clp, cred); - put_rpccred(cred); - timeout = (2 * lease) / 3; - spin_lock(&clp->cl_lock); - } else + } else { dprintk("%s: failed to call renewd. Reason: lease not expired \n", - __FUNCTION__); - if (timeout < 5 * HZ) /* safeguard */ - timeout = 5 * HZ; - dprintk("%s: requeueing work. Lease period = %ld\n", - __FUNCTION__, (timeout + HZ - 1) / HZ); - cancel_delayed_work(&clp->cl_renewd); - schedule_delayed_work(&clp->cl_renewd, timeout); - spin_unlock(&clp->cl_lock); + __func__); + spin_unlock(&clp->cl_lock); + } + nfs4_schedule_state_renewal(clp); +out_exp: + nfs_expire_unreferenced_delegations(clp); out: - up_read(&clp->cl_sem); - dprintk("%s: done\n", __FUNCTION__); + dprintk("%s: done\n", __func__); } -/* Must be called with clp->cl_sem locked for writes */ void nfs4_schedule_state_renewal(struct nfs_client *clp) { @@ -117,20 +116,13 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) if (timeout < 5 * HZ) timeout = 5 * HZ; dprintk("%s: requeueing work. 
Lease period = %ld\n", - __FUNCTION__, (timeout + HZ - 1) / HZ); - cancel_delayed_work(&clp->cl_renewd); - schedule_delayed_work(&clp->cl_renewd, timeout); + __func__, (timeout + HZ - 1) / HZ); + mod_delayed_work(system_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } void -nfs4_renewd_prepare_shutdown(struct nfs_server *server) -{ - cancel_delayed_work(&server->nfs_client->cl_renewd); -} - -void nfs4_kill_renewd(struct nfs_client *clp) { cancel_delayed_work_sync(&clp->cl_renewd); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c new file mode 100644 index 00000000000..e799dc3c3b1 --- /dev/null +++ b/fs/nfs/nfs4session.c @@ -0,0 +1,565 @@ +/* + * fs/nfs/nfs4session.c + * + * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com> + * + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/sunrpc/sched.h> +#include <linux/sunrpc/bc_xprt.h> +#include <linux/nfs.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs.h> +#include <linux/module.h> + +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define NFSDBG_FACILITY NFSDBG_STATE + +static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue) +{ + tbl->highest_used_slotid = NFS4_NO_SLOT; + spin_lock_init(&tbl->slot_tbl_lock); + rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); + init_completion(&tbl->complete); +} + +/* + * nfs4_shrink_slot_table - free retired slots from the slot table + */ +static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) +{ + struct nfs4_slot **p; + if (newsize >= tbl->max_slots) + return; + + p = &tbl->slots; + while (newsize--) + p = &(*p)->next; + while (*p) { + struct nfs4_slot *slot = *p; + + *p = slot->next; + kfree(slot); + tbl->max_slots--; + } +} + +/** + * nfs4_slot_tbl_drain_complete - wake waiters when drain is 
complete + * @tbl - controlling slot table + * + */ +void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) +{ + if (nfs4_slot_tbl_draining(tbl)) + complete(&tbl->complete); +} + +/* + * nfs4_free_slot - free a slot and efficiently update slot table. + * + * freeing a slot is trivially done by clearing its respective bit + * in the bitmap. + * If the freed slotid equals highest_used_slotid we want to update it + * so that the server would be able to size down the slot table if needed, + * otherwise we know that the highest_used_slotid is still in use. + * When updating highest_used_slotid there may be "holes" in the bitmap + * so we need to scan down from highest_used_slotid to 0 looking for the now + * highest slotid in use. + * If none found, highest_used_slotid is set to NFS4_NO_SLOT. + * + * Must be called while holding tbl->slot_tbl_lock + */ +void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) +{ + u32 slotid = slot->slot_nr; + + /* clear used bit in bitmap */ + __clear_bit(slotid, tbl->used_slots); + + /* update highest_used_slotid when it is freed */ + if (slotid == tbl->highest_used_slotid) { + u32 new_max = find_last_bit(tbl->used_slots, slotid); + if (new_max < slotid) + tbl->highest_used_slotid = new_max; + else { + tbl->highest_used_slotid = NFS4_NO_SLOT; + nfs4_slot_tbl_drain_complete(tbl); + } + } + dprintk("%s: slotid %u highest_used_slotid %u\n", __func__, + slotid, tbl->highest_used_slotid); +} + +static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot *slot; + + slot = kzalloc(sizeof(*slot), gfp_mask); + if (slot) { + slot->table = tbl; + slot->slot_nr = slotid; + slot->seq_nr = seq_init; + } + return slot; +} + +static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl, + u32 slotid, u32 seq_init, gfp_t gfp_mask) +{ + struct nfs4_slot **p, *slot; + + p = &tbl->slots; + for (;;) { + if (*p == NULL) { + *p = 
nfs4_new_slot(tbl, tbl->max_slots, + seq_init, gfp_mask); + if (*p == NULL) + break; + tbl->max_slots++; + } + slot = *p; + if (slot->slot_nr == slotid) + return slot; + p = &slot->next; + } + return ERR_PTR(-ENOMEM); +} + +/* + * nfs4_alloc_slot - efficiently look for a free slot + * + * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap. + * If found, we mark the slot as used, update the highest_used_slotid, + * and respectively set up the sequence operation args. + * + * Note: must be called with under the slot_tbl_lock. + */ +struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *ret = ERR_PTR(-EBUSY); + u32 slotid; + + dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + tbl->max_slotid + 1); + slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1); + if (slotid > tbl->max_slotid) + goto out; + ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + if (IS_ERR(ret)) + goto out; + __set_bit(slotid, tbl->used_slots); + if (slotid > tbl->highest_used_slotid || + tbl->highest_used_slotid == NFS4_NO_SLOT) + tbl->highest_used_slotid = slotid; + ret->generation = tbl->generation; + +out: + dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n", + __func__, tbl->used_slots[0], tbl->highest_used_slotid, + !IS_ERR(ret) ? 
ret->slot_nr : NFS4_NO_SLOT); + return ret; +} + +static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + if (max_reqs <= tbl->max_slots) + return 0; + if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS))) + return 0; + return -ENOMEM; +} + +static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, + u32 server_highest_slotid, + u32 ivalue) +{ + struct nfs4_slot **p; + + nfs4_shrink_slot_table(tbl, server_highest_slotid + 1); + p = &tbl->slots; + while (*p) { + (*p)->seq_nr = ivalue; + (*p)->interrupted = 0; + p = &(*p)->next; + } + tbl->highest_used_slotid = NFS4_NO_SLOT; + tbl->target_highest_slotid = server_highest_slotid; + tbl->server_highest_slotid = server_highest_slotid; + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; + tbl->max_slotid = server_highest_slotid; +} + +/* + * (re)Initialise a slot table + */ +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, + u32 max_reqs, u32 ivalue) +{ + int ret; + + dprintk("--> %s: max_reqs=%u, tbl->max_slots %u\n", __func__, + max_reqs, tbl->max_slots); + + if (max_reqs > NFS4_MAX_SLOT_TABLE) + max_reqs = NFS4_MAX_SLOT_TABLE; + + ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue); + if (ret) + goto out; + + spin_lock(&tbl->slot_tbl_lock); + nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue); + spin_unlock(&tbl->slot_tbl_lock); + + dprintk("%s: tbl=%p slots=%p max_slots=%u\n", __func__, + tbl, tbl->slots, tbl->max_slots); +out: + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +/* + * nfs4_release_slot_table - release all slot table entries + */ +static void nfs4_release_slot_table(struct nfs4_slot_table *tbl) +{ + nfs4_shrink_slot_table(tbl, 0); +} + +/** + * nfs4_shutdown_slot_table - release resources attached to a slot table + * @tbl: slot table to shut down + * + */ +void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl) +{ + nfs4_release_slot_table(tbl); + 
rpc_destroy_wait_queue(&tbl->slot_tbl_waitq); +} + +/** + * nfs4_setup_slot_table - prepare a stand-alone slot table for use + * @tbl: slot table to set up + * @max_reqs: maximum number of requests allowed + * @queue: name to give RPC wait queue + * + * Returns zero on success, or a negative errno. + */ +int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, unsigned int max_reqs, + const char *queue) +{ + nfs4_init_slot_table(tbl, queue); + return nfs4_realloc_slot_table(tbl, max_reqs, 0); +} + +static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) +{ + struct nfs4_sequence_args *args = task->tk_msg.rpc_argp; + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + struct nfs4_slot *slot = pslot; + struct nfs4_slot_table *tbl = slot->table; + + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + return false; + slot->generation = tbl->generation; + args->sa_slot = slot; + res->sr_timestamp = jiffies; + res->sr_slot = slot; + res->sr_status_flags = 0; + res->sr_status = 1; + return true; +} + +static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot)) + return true; + return false; +} + +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot) +{ + if (slot->slot_nr > tbl->max_slotid) + return false; + return __nfs41_wake_and_assign_slot(tbl, slot); +} + +static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl) +{ + struct nfs4_slot *slot = nfs4_alloc_slot(tbl); + if (!IS_ERR(slot)) { + bool ret = __nfs41_wake_and_assign_slot(tbl, slot); + if (ret) + return ret; + nfs4_free_slot(tbl, slot); + } + return false; +} + +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl) +{ + for (;;) { + if (!nfs41_try_wake_next_slot_table_entry(tbl)) + break; + } +} + +#if defined(CONFIG_NFS_V4_1) + +static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ 
+ u32 max_slotid; + + max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid); + if (max_slotid > tbl->server_highest_slotid) + max_slotid = tbl->server_highest_slotid; + if (max_slotid > tbl->target_highest_slotid) + max_slotid = tbl->target_highest_slotid; + tbl->max_slotid = max_slotid; + nfs41_wake_slot_table(tbl); +} + +/* Update the client's idea of target_highest_slotid */ +static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + if (tbl->target_highest_slotid == target_highest_slotid) + return; + tbl->target_highest_slotid = target_highest_slotid; + tbl->generation++; +} + +void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid) +{ + spin_lock(&tbl->slot_tbl_lock); + nfs41_set_target_slotid_locked(tbl, target_highest_slotid); + tbl->d_target_highest_slotid = 0; + tbl->d2_target_highest_slotid = 0; + nfs41_set_max_slotid_locked(tbl, target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl, + u32 highest_slotid) +{ + if (tbl->server_highest_slotid == highest_slotid) + return; + if (tbl->highest_used_slotid > highest_slotid) + return; + /* Deallocate slots */ + nfs4_shrink_slot_table(tbl, highest_slotid + 1); + tbl->server_highest_slotid = highest_slotid; +} + +static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2) +{ + s1 -= s2; + if (s1 == 0) + return 0; + if (s1 < 0) + return (s1 - 1) >> 1; + return (s1 + 1) >> 1; +} + +static int nfs41_sign_s32(s32 s1) +{ + if (s1 > 0) + return 1; + if (s1 < 0) + return -1; + return 0; +} + +static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2) +{ + if (!s1 || !s2) + return true; + return nfs41_sign_s32(s1) == nfs41_sign_s32(s2); +} + +/* Try to eliminate outliers by checking for sharp changes in the + * derivatives and second derivatives + */ +static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl, + u32 new_target) +{ + s32 d_target, 
d2_target; + bool ret = true; + + d_target = nfs41_derivative_target_slotid(new_target, + tbl->target_highest_slotid); + d2_target = nfs41_derivative_target_slotid(d_target, + tbl->d_target_highest_slotid); + /* Is first derivative same sign? */ + if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid)) + ret = false; + /* Is second derivative same sign? */ + if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid)) + ret = false; + tbl->d_target_highest_slotid = d_target; + tbl->d2_target_highest_slotid = d2_target; + return ret; +} + +void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res) +{ + spin_lock(&tbl->slot_tbl_lock); + if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid)) + nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); + if (tbl->generation == slot->generation) + nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); + nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid); + spin_unlock(&tbl->slot_tbl_lock); +} + +static void nfs4_release_session_slot_tables(struct nfs4_session *session) +{ + nfs4_release_slot_table(&session->fc_slot_table); + nfs4_release_slot_table(&session->bc_slot_table); +} + +/* + * Initialize or reset the forechannel and backchannel tables + */ +int nfs4_setup_session_slot_tables(struct nfs4_session *ses) +{ + struct nfs4_slot_table *tbl; + int status; + + dprintk("--> %s\n", __func__); + /* Fore channel */ + tbl = &ses->fc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) /* -ENOMEM */ + return status; + /* Back channel */ + tbl = &ses->bc_slot_table; + tbl->session = ses; + status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + if (status && tbl->slots == NULL) + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_release_session_slot_tables(ses); + return 
status; +} + +struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) +{ + struct nfs4_session *session; + + session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (!session) + return NULL; + + nfs4_init_slot_table(&session->fc_slot_table, "ForeChannel Slot table"); + nfs4_init_slot_table(&session->bc_slot_table, "BackChannel Slot table"); + session->session_state = 1<<NFS4_SESSION_INITING; + + session->clp = clp; + return session; +} + +static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) +{ + nfs4_shutdown_slot_table(&session->fc_slot_table); + nfs4_shutdown_slot_table(&session->bc_slot_table); +} + +void nfs4_destroy_session(struct nfs4_session *session) +{ + struct rpc_xprt *xprt; + struct rpc_cred *cred; + + cred = nfs4_get_clid_cred(session->clp); + nfs4_proc_destroy_session(session, cred); + if (cred) + put_rpccred(cred); + + rcu_read_lock(); + xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); + rcu_read_unlock(); + dprintk("%s Destroy backchannel for xprt %p\n", + __func__, xprt); + xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); + nfs4_destroy_session_slot_tables(session); + kfree(session); +} + +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. 
+ */ +static int nfs41_check_session_ready(struct nfs_client *clp) +{ + int ret; + + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { + ret = nfs4_client_recover_expired_lease(clp); + if (ret) + return ret; + } + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + smp_rmb(); + return 0; +} + +int nfs4_init_session(struct nfs_client *clp) +{ + if (!nfs4_has_session(clp)) + return 0; + + clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state); + return nfs41_check_session_ready(clp); +} + +int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the + * DS lease to be equal to the MDS lease. + */ + clp->cl_lease_time = lease_time; + clp->cl_last_renewal = jiffies; + } + spin_unlock(&clp->cl_lock); + + ret = nfs41_check_session_ready(clp); + if (ret) + return ret; + /* Test for the DS role */ + if (!is_ds_client(clp)) + return -ENODEV; + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + +#endif /* defined(CONFIG_NFS_V4_1) */ diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h new file mode 100644 index 00000000000..b34ada9bc6a --- /dev/null +++ b/fs/nfs/nfs4session.h @@ -0,0 +1,153 @@ +/* + * fs/nfs/nfs4session.h + * + * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com> + * + */ +#ifndef __LINUX_FS_NFS_NFS4SESSION_H +#define __LINUX_FS_NFS_NFS4SESSION_H + +/* maximum number of slots to use */ +#define NFS4_DEF_SLOT_TABLE_SIZE (64U) +#define NFS4_MAX_SLOT_TABLE (1024U) +#define NFS4_NO_SLOT ((u32)-1) + +#if IS_ENABLED(CONFIG_NFS_V4) + +/* Sessions slot seqid */ +struct nfs4_slot { + struct nfs4_slot_table *table; + struct nfs4_slot *next; + unsigned long generation; + u32 slot_nr; + u32 seq_nr; + unsigned int interrupted : 1; +}; + +/* Sessions */ +enum nfs4_slot_tbl_state { + 
NFS4_SLOT_TBL_DRAINING, +}; + +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +struct nfs4_slot_table { + struct nfs4_session *session; /* Parent session */ + struct nfs4_slot *slots; /* seqid per slot */ + unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ + spinlock_t slot_tbl_lock; + struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ + u32 max_slots; /* # slots in table */ + u32 max_slotid; /* Max allowed slotid value */ + u32 highest_used_slotid; /* sent to server on each SEQ. + * op for dynamic resizing */ + u32 target_highest_slotid; /* Server max_slot target */ + u32 server_highest_slotid; /* Server highest slotid */ + s32 d_target_highest_slotid; /* Derivative */ + s32 d2_target_highest_slotid; /* 2nd derivative */ + unsigned long generation; /* Generation counter for + target_highest_slotid */ + struct completion complete; + unsigned long slot_tbl_state; +}; + +/* + * Session related parameters + */ +struct nfs4_session { + struct nfs4_sessionid sess_id; + u32 flags; + unsigned long session_state; + u32 hash_alg; + u32 ssv_len; + + /* The fore and back channel */ + struct nfs4_channel_attrs fc_attrs; + struct nfs4_slot_table fc_slot_table; + struct nfs4_channel_attrs bc_attrs; + struct nfs4_slot_table bc_slot_table; + struct nfs_client *clp; +}; + +enum nfs4_session_state { + NFS4_SESSION_INITING, +}; + +extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, + unsigned int max_reqs, const char *queue); +extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); +extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); +extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); +extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); +bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot); +void nfs41_wake_slot_table(struct nfs4_slot_table *tbl); + +static inline bool nfs4_slot_tbl_draining(struct 
nfs4_slot_table *tbl) +{ + return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); +} + +#if defined(CONFIG_NFS_V4_1) +extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl, + u32 target_highest_slotid); +extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, + struct nfs4_slot *slot, + struct nfs4_sequence_res *res); + +extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses); + +extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern void nfs4_destroy_session(struct nfs4_session *session); +extern int nfs4_init_session(struct nfs_client *clp); +extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); + +/* + * Determine if sessions are in use. + */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + if (clp->cl_session) + return 1; + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + if (nfs4_has_session(clp)) + return (clp->cl_session->flags & SESSION4_PERSIST); + return 0; +} + +#ifdef CONFIG_CRC32 +/* + * nfs_session_id_hash - calculate the crc32 hash for the session id + * @session - pointer to session + */ +#define nfs_session_id_hash(sess_id) \ + (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) +#else +#define nfs_session_id_hash(session) (0) +#endif +#else /* defined(CONFIG_NFS_V4_1) */ + +static inline int nfs4_init_session(struct nfs_client *clp) +{ + return 0; +} + +/* + * Determine if sessions are in use. 
+ */ +static inline int nfs4_has_session(const struct nfs_client *clp) +{ + return 0; +} + +static inline int nfs4_has_persistent_session(const struct nfs_client *clp) +{ + return 0; +} + +#endif /* defined(CONFIG_NFS_V4_1) */ +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ +#endif /* __LINUX_FS_NFS_NFS4SESSION_H */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b962397004c..848f6853c59 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -40,45 +40,144 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <linux/smp_lock.h> +#include <linux/fs.h> #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> +#include <linux/ratelimit.h> #include <linux/workqueue.h> #include <linux/bitops.h> +#include <linux/jiffies.h> + +#include <linux/sunrpc/clnt.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "netns.h" + +#define NFSDBG_FACILITY NFSDBG_STATE #define OPENOWNER_POOL_SIZE 8 const nfs4_stateid zero_stateid; +static DEFINE_MUTEX(nfs_clid_init_mutex); -static LIST_HEAD(nfs4_clientid_list); +int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; + unsigned short port; + int status; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; + port = nn->nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nn->nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: + status = nfs4_proc_setclientid_confirm(clp, &clid, cred); + if (status != 0) + goto out; + 
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_renewal(clp); +out: + return status; +} -static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) +/** + * nfs40_discover_server_trunking - Detect server IP address trunking (mv0) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs40_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred) { - int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, - nfs_callback_tcpport, cred); - if (status == 0) - status = nfs4_proc_setclientid_confirm(clp, cred); - if (status == 0) - nfs4_schedule_state_renewal(clp); + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + unsigned short port; + int status; + + port = nn->nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nn->nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + + status = nfs40_walk_client_list(clp, result, cred); + if (status == 0) { + /* Sustain the lease, even if it's empty. If the clientid4 + * goes stale it's of no use for trunking discovery. 
*/ + nfs4_schedule_state_renewal(*result); + } +out: return status; } -struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) +struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + + if (clp->cl_machine_cred != NULL) + cred = get_rpccred(clp->cl_machine_cred); + return cred; +} + +static void nfs4_root_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred, *new; + + new = rpc_lookup_machine_cred(NULL); + spin_lock(&clp->cl_lock); + cred = clp->cl_machine_cred; + clp->cl_machine_cred = new; + spin_unlock(&clp->cl_lock); + if (cred != NULL) + put_rpccred(cred); +} + +static struct rpc_cred * +nfs4_get_renew_cred_server_locked(struct nfs_server *server) { + struct rpc_cred *cred = NULL; struct nfs4_state_owner *sp; struct rb_node *pos; - struct rpc_cred *cred = NULL; - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); if (list_empty(&sp->so_states)) continue; cred = get_rpccred(sp->so_cred); @@ -87,147 +186,271 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) return cred; } -static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) +/** + * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. + * Caller must hold clp->cl_lock. 
+ */ +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) { - struct nfs4_state_owner *sp; - struct rb_node *pos; + struct rpc_cred *cred = NULL; + struct nfs_server *server; - pos = rb_first(&clp->cl_state_owners); - if (pos != NULL) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - return get_rpccred(sp->so_cred); + /* Use machine credentials if available */ + cred = nfs4_get_machine_cred_locked(clp); + if (cred != NULL) + goto out; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + cred = nfs4_get_renew_cred_server_locked(server); + if (cred != NULL) + break; } - return NULL; + rcu_read_unlock(); + +out: + return cred; } -static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, - __u64 minval, int maxbits) +static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl) { - struct rb_node **p, *parent; - struct nfs_unique_id *pos; - __u64 mask = ~0ULL; + if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { + spin_lock(&tbl->slot_tbl_lock); + nfs41_wake_slot_table(tbl); + spin_unlock(&tbl->slot_tbl_lock); + } +} - if (maxbits < 64) - mask = (1ULL << maxbits) - 1ULL; +static void nfs4_end_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; - /* Ensure distribution is more or less flat */ - get_random_bytes(&new->id, sizeof(new->id)); - new->id &= mask; - if (new->id < minval) - new->id += minval; -retry: - p = &root->rb_node; - parent = NULL; + if (clp->cl_slot_tbl) { + nfs4_end_drain_slot_table(clp->cl_slot_tbl); + return; + } - while (*p != NULL) { - parent = *p; - pos = rb_entry(parent, struct nfs_unique_id, rb_node); + if (ses != NULL) { + nfs4_end_drain_slot_table(&ses->bc_slot_table); + nfs4_end_drain_slot_table(&ses->fc_slot_table); + } +} - if (new->id < pos->id) - p = &(*p)->rb_left; - else if (new->id > pos->id) - p = &(*p)->rb_right; - else - goto id_exists; +static int nfs4_drain_slot_tbl(struct 
nfs4_slot_table *tbl) +{ + set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); + spin_lock(&tbl->slot_tbl_lock); + if (tbl->highest_used_slotid != NFS4_NO_SLOT) { + reinit_completion(&tbl->complete); + spin_unlock(&tbl->slot_tbl_lock); + return wait_for_completion_interruptible(&tbl->complete); } - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, root); - return; -id_exists: - for (;;) { - new->id++; - if (new->id < minval || (new->id & mask) != new->id) { - new->id = minval; - break; - } - parent = rb_next(parent); - if (parent == NULL) - break; - pos = rb_entry(parent, struct nfs_unique_id, rb_node); - if (new->id < pos->id) - break; + spin_unlock(&tbl->slot_tbl_lock); + return 0; +} + +static int nfs4_begin_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + int ret = 0; + + if (clp->cl_slot_tbl) + return nfs4_drain_slot_tbl(clp->cl_slot_tbl); + + /* back channel */ + ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); + if (ret) + return ret; + /* fore channel */ + return nfs4_drain_slot_tbl(&ses->fc_slot_table); +} + +#if defined(CONFIG_NFS_V4_1) + +static int nfs41_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + /* Update lease time and schedule renewal */ + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo.lease_time * HZ; + clp->cl_last_renewal = jiffies; + spin_unlock(&clp->cl_lock); + + nfs4_schedule_state_renewal(clp); } - goto retry; + + return status; +} + +static void nfs41_finish_session_reset(struct nfs_client *clp) +{ + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* create_session negotiated new slot table */ + clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + 
nfs41_setup_state_renewal(clp); +} + +int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + + if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) + goto do_confirm; + nfs4_begin_drain_session(clp); + status = nfs4_proc_exchange_id(clp, cred); + if (status != 0) + goto out; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); +do_confirm: + status = nfs4_proc_create_session(clp, cred); + if (status != 0) + goto out; + nfs41_finish_session_reset(clp); + nfs_mark_client_ready(clp, NFS_CS_READY); +out: + return status; } -static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) +/** + * nfs41_discover_server_trunking - Detect server IP address trunking (mv1) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status. + * If NFS4_OK is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs41_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred) { - rb_erase(&id->rb_node, root); + int status; + + status = nfs4_proc_exchange_id(clp, cred); + if (status != NFS4_OK) + return status; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + + return nfs41_walk_client_list(clp, result, cred); +} + +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_get_clid_cred - Acquire credential for a setclientid operation + * @clp: client state handle + * + * Returns an rpc_cred with reference count bumped, or NULL. 
+ */ +struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = nfs4_get_machine_cred_locked(clp); + spin_unlock(&clp->cl_lock); + return cred; } static struct nfs4_state_owner * -nfs4_find_state_owner(struct nfs_server *server, struct rpc_cred *cred) +nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) { - struct nfs_client *clp = server->nfs_client; - struct rb_node **p = &clp->cl_state_owners.rb_node, + struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; - struct nfs4_state_owner *sp, *res = NULL; + struct nfs4_state_owner *sp; while (*p != NULL) { parent = *p; - sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); - if (server < sp->so_server) { - p = &parent->rb_left; - continue; - } - if (server > sp->so_server) { - p = &parent->rb_right; - continue; - } if (cred < sp->so_cred) p = &parent->rb_left; else if (cred > sp->so_cred) p = &parent->rb_right; else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); atomic_inc(&sp->so_count); - res = sp; - break; + return sp; } } - return res; + return NULL; } static struct nfs4_state_owner * -nfs4_insert_state_owner(struct nfs_client *clp, struct nfs4_state_owner *new) +nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) { - struct rb_node **p = &clp->cl_state_owners.rb_node, + struct nfs_server *server = new->so_server; + struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; + int err; while (*p != NULL) { parent = *p; - sp = rb_entry(parent, struct nfs4_state_owner, so_client_node); + sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); - if (new->so_server < sp->so_server) { - p = &parent->rb_left; - continue; - } - if (new->so_server > sp->so_server) { - p = &parent->rb_right; - continue; - } if (new->so_cred < sp->so_cred) p = &parent->rb_left; else if 
(new->so_cred > sp->so_cred) p = &parent->rb_right; else { + if (!list_empty(&sp->so_lru)) + list_del_init(&sp->so_lru); atomic_inc(&sp->so_count); return sp; } } - nfs_alloc_unique_id(&clp->cl_openowner_id, &new->so_owner_id, 1, 64); - rb_link_node(&new->so_client_node, parent, p); - rb_insert_color(&new->so_client_node, &clp->cl_state_owners); + err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); + if (err) + return ERR_PTR(err); + rb_link_node(&new->so_server_node, parent, p); + rb_insert_color(&new->so_server_node, &server->state_owners); return new; } static void -nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) +nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) +{ + struct nfs_server *server = sp->so_server; + + if (!RB_EMPTY_NODE(&sp->so_server_node)) + rb_erase(&sp->so_server_node, &server->state_owners); + ida_remove(&server->openowner_id, sp->so_seqid.owner_id); +} + +static void +nfs4_init_seqid_counter(struct nfs_seqid_counter *sc) +{ + sc->create_time = ktime_get(); + sc->flags = 0; + sc->counter = 0; + spin_lock_init(&sc->lock); + INIT_LIST_HEAD(&sc->list); + rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue"); +} + +static void +nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc) { - if (!RB_EMPTY_NODE(&sp->so_client_node)) - rb_erase(&sp->so_client_node, &clp->cl_state_owners); - nfs_free_unique_id(&clp->cl_openowner_id, &sp->so_owner_id); + rpc_destroy_wait_queue(&sc->wait); } /* @@ -236,82 +459,162 @@ nfs4_remove_state_owner(struct nfs_client *clp, struct nfs4_state_owner *sp) * */ static struct nfs4_state_owner * -nfs4_alloc_state_owner(void) +nfs4_alloc_state_owner(struct nfs_server *server, + struct rpc_cred *cred, + gfp_t gfp_flags) { struct nfs4_state_owner *sp; - sp = kzalloc(sizeof(*sp),GFP_KERNEL); + sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; + sp->so_server = server; + sp->so_cred = get_rpccred(cred); spin_lock_init(&sp->so_lock); INIT_LIST_HEAD(&sp->so_states); - 
INIT_LIST_HEAD(&sp->so_delegations); - rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); - sp->so_seqid.sequence = &sp->so_sequence; - spin_lock_init(&sp->so_sequence.lock); - INIT_LIST_HEAD(&sp->so_sequence.list); + nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); + INIT_LIST_HEAD(&sp->so_lru); + seqcount_init(&sp->so_reclaim_seqcount); + mutex_init(&sp->so_delegreturn_mutex); return sp; } -void +static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { - if (!RB_EMPTY_NODE(&sp->so_client_node)) { - struct nfs_client *clp = sp->so_client; + struct rb_node *rb_node = &sp->so_server_node; + + if (!RB_EMPTY_NODE(rb_node)) { + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - rb_erase(&sp->so_client_node, &clp->cl_state_owners); - RB_CLEAR_NODE(&sp->so_client_node); + if (!RB_EMPTY_NODE(rb_node)) { + rb_erase(rb_node, &server->state_owners); + RB_CLEAR_NODE(rb_node); + } spin_unlock(&clp->cl_lock); } } -/* - * Note: must be called with clp->cl_sem held in order to prevent races - * with reboot recovery! 
+static void nfs4_free_state_owner(struct nfs4_state_owner *sp) +{ + nfs4_destroy_seqid_counter(&sp->so_seqid); + put_rpccred(sp->so_cred); + kfree(sp); +} + +static void nfs4_gc_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + unsigned long time_min, time_max; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + time_max = jiffies; + time_min = (long)time_max - (long)clp->cl_lease_time; + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + /* NB: LRU is sorted so that oldest is at the head */ + if (time_in_range(sp->so_expires, time_min, time_max)) + break; + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } + spin_unlock(&clp->cl_lock); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } +} + +/** + * nfs4_get_state_owner - Look up a state owner given a credential + * @server: nfs_server to search + * @cred: RPC credential to match + * + * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. 
*/ -struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) +struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, + struct rpc_cred *cred, + gfp_t gfp_flags) { struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *new; spin_lock(&clp->cl_lock); - sp = nfs4_find_state_owner(server, cred); + sp = nfs4_find_state_owner_locked(server, cred); spin_unlock(&clp->cl_lock); if (sp != NULL) - return sp; - new = nfs4_alloc_state_owner(); + goto out; + new = nfs4_alloc_state_owner(server, cred, gfp_flags); if (new == NULL) - return NULL; - new->so_client = clp; - new->so_server = server; - new->so_cred = cred; - spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner(clp, new); - spin_unlock(&clp->cl_lock); - if (sp == new) - get_rpccred(cred); - else - kfree(new); + goto out; + do { + if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) + break; + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); + } while (sp == ERR_PTR(-EAGAIN)); + if (sp != new) + nfs4_free_state_owner(new); +out: + nfs4_gc_state_owners(server); return sp; } -/* - * Must be called with clp->cl_sem held in order to avoid races - * with state recovery... +/** + * nfs4_put_state_owner - Release a nfs4_state_owner + * @sp: state owner data to release + * + * Note that we keep released state owners on an LRU + * list. + * This caches valid state owners so that they can be + * reused, to avoid the OPEN_CONFIRM on minor version 0. + * It also pins the uniquifier of dropped state owners for + * a while, to ensure that those state owner names are + * never reused. 
*/ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { - struct nfs_client *clp = sp->so_client; - struct rpc_cred *cred = sp->so_cred; + struct nfs_server *server = sp->so_server; + struct nfs_client *clp = server->nfs_client; if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - nfs4_remove_state_owner(clp, sp); + + sp->so_expires = jiffies; + list_add_tail(&sp->so_lru, &server->state_owners_lru); spin_unlock(&clp->cl_lock); - put_rpccred(cred); - kfree(sp); +} + +/** + * nfs4_purge_state_owners - Release all cached state owners + * @server: nfs_server with cached state owners to release + * + * Called at umount time. Remaining state owners will be on + * the LRU with ref count of zero. + */ +void nfs4_purge_state_owners(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state_owner *sp, *tmp; + LIST_HEAD(doomed); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { + list_move(&sp->so_lru, &doomed); + nfs4_remove_state_owner_locked(sp); + } + spin_unlock(&clp->cl_lock); + + list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { + list_del(&sp->so_lru); + nfs4_free_state_owner(sp); + } } static struct nfs4_state * @@ -319,7 +622,7 @@ nfs4_alloc_open_state(void) { struct nfs4_state *state; - state = kzalloc(sizeof(*state), GFP_KERNEL); + state = kzalloc(sizeof(*state), GFP_NOFS); if (!state) return NULL; atomic_set(&state->count, 1); @@ -330,18 +633,18 @@ nfs4_alloc_open_state(void) } void -nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode) +nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode) { - if (state->state == mode) + if (state->state == fmode) return; /* NB! List reordering - see the reclaim code for why. 
*/ - if ((mode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { - if (mode & FMODE_WRITE) + if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { + if (fmode & FMODE_WRITE) list_move(&state->open_states, &state->owner->so_states); else list_move_tail(&state->open_states, &state->owner->so_states); } - state->state = mode; + state->state = fmode; } static struct nfs4_state * @@ -353,6 +656,8 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) list_for_each_entry(state, &nfsi->open_states, inode_states) { if (state->owner != owner) continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (atomic_inc_not_zero(&state->count)) return state; } @@ -385,7 +690,8 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state->owner = owner; atomic_inc(&owner->so_count); list_add(&state->inode_states, &nfsi->open_states); - state->inode = igrab(inode); + ihold(inode); + state->inode = inode; spin_unlock(&inode->i_lock); /* Note: The reclaim code dictates that we add stateless * and read-only stateids to the end of the list */ @@ -401,10 +707,6 @@ out: return state; } -/* - * Beware! Caller must be holding exactly one - * reference to clp->cl_sem! - */ void nfs4_put_open_state(struct nfs4_state *state) { struct inode *inode = state->inode; @@ -425,16 +727,17 @@ void nfs4_put_open_state(struct nfs4_state *state) /* * Close the current file. 
*/ -static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mode, int wait) +static void __nfs4_close(struct nfs4_state *state, + fmode_t fmode, gfp_t gfp_mask, int wait) { struct nfs4_state_owner *owner = state->owner; int call_close = 0; - int newstate; + fmode_t newstate; atomic_inc(&owner->so_count); /* Protect against nfs4_find_state() */ spin_lock(&owner->so_lock); - switch (mode & (FMODE_READ | FMODE_WRITE)) { + switch (fmode & (FMODE_READ | FMODE_WRITE)) { case FMODE_READ: state->n_rdonly--; break; @@ -466,17 +769,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mod nfs4_put_open_state(state); nfs4_put_state_owner(owner); } else - nfs4_do_close(path, state, wait); + nfs4_do_close(state, gfp_mask, wait); } -void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) +void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(path, state, mode, 0); + __nfs4_close(state, fmode, GFP_NOFS, 0); } -void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode) +void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(path, state, mode, 1); + __nfs4_close(state, fmode, GFP_KERNEL, 1); } /* @@ -484,12 +787,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode) * that is compatible with current->files */ static struct nfs4_lock_state * -__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *pos; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner != fl_owner) + if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) continue; + switch (pos->ls_owner.lo_type) { + case NFS4_POSIX_LOCK_TYPE: + if (pos->ls_owner.lo_u.posix_owner != fl_owner) + continue; + break; + case NFS4_FLOCK_LOCK_TYPE: + if (pos->ls_owner.lo_u.flock_owner != 
fl_pid) + continue; + } atomic_inc(&pos->ls_count); return pos; } @@ -501,34 +813,42 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) * exists, return an uninitialized one. * */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *lsp; - struct nfs_client *clp = state->owner->so_client; + struct nfs_server *server = state->owner->so_server; - lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) return NULL; - rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); - spin_lock_init(&lsp->ls_sequence.lock); - INIT_LIST_HEAD(&lsp->ls_sequence.list); - lsp->ls_seqid.sequence = &lsp->ls_sequence; + nfs4_init_seqid_counter(&lsp->ls_seqid); atomic_set(&lsp->ls_count, 1); - lsp->ls_owner = fl_owner; - spin_lock(&clp->cl_lock); - nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); - spin_unlock(&clp->cl_lock); + lsp->ls_state = state; + lsp->ls_owner.lo_type = type; + switch (lsp->ls_owner.lo_type) { + case NFS4_FLOCK_LOCK_TYPE: + lsp->ls_owner.lo_u.flock_owner = fl_pid; + break; + case NFS4_POSIX_LOCK_TYPE: + lsp->ls_owner.lo_u.posix_owner = fl_owner; + break; + default: + goto out_free; + } + lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); + if (lsp->ls_seqid.owner_id < 0) + goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); return lsp; +out_free: + kfree(lsp); + return NULL; } -static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) +void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct nfs_client *clp = lsp->ls_state->owner->so_client; - - spin_lock(&clp->cl_lock); - nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); - spin_unlock(&clp->cl_lock); + ida_simple_remove(&server->lockowner_id, 
lsp->ls_seqid.owner_id); + nfs4_destroy_seqid_counter(&lsp->ls_seqid); kfree(lsp); } @@ -536,19 +856,17 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. * - * The caller must be holding clp->cl_sem */ -static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) { struct nfs4_lock_state *lsp, *new = NULL; for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner); + lsp = __nfs4_find_lock_state(state, owner, pid, type); if (lsp != NULL) break; if (new != NULL) { - new->ls_state = state; list_add(&new->ls_locks, &state->lock_states); set_bit(LK_STATE_IN_USE, &state->flags); lsp = new; @@ -556,13 +874,13 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ break; } spin_unlock(&state->state_lock); - new = nfs4_alloc_lock_state(state, owner); + new = nfs4_alloc_lock_state(state, owner, pid, type); if (new == NULL) return NULL; } spin_unlock(&state->state_lock); if (new != NULL) - nfs4_free_lock_state(new); + nfs4_free_lock_state(state->owner->so_server, new); return lsp; } @@ -572,6 +890,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ */ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) { + struct nfs_server *server; struct nfs4_state *state; if (lsp == NULL) @@ -583,7 +902,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); - nfs4_free_lock_state(lsp); + server = state->owner->so_server; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct nfs_client *clp = server->nfs_client; + + clp->cl_mvops->free_lock_state(server, lsp); + } else + 
nfs4_free_lock_state(server, lsp); } static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) @@ -599,7 +924,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl) nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); } -static struct file_lock_operations nfs4_fl_lock_ops = { +static const struct file_lock_operations nfs4_fl_lock_ops = { .fl_copy_lock = nfs4_fl_copy_lock, .fl_release_private = nfs4_fl_release_lock, }; @@ -610,7 +935,13 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_ops != NULL) return 0; - lsp = nfs4_get_lock_state(state, fl->fl_owner); + if (fl->fl_flags & FL_POSIX) + lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); + else if (fl->fl_flags & FL_FLOCK) + lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid, + NFS4_FLOCK_LOCK_TYPE); + else + return -EINVAL; if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -618,70 +949,134 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) return 0; } -/* - * Byte-range lock aware utility to initialize the stateid of read/write - * requests. 
- */ -void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) +static int nfs4_copy_lock_stateid(nfs4_stateid *dst, + struct nfs4_state *state, + const struct nfs_lockowner *lockowner) { struct nfs4_lock_state *lsp; - int seq; + fl_owner_t fl_owner; + pid_t fl_pid; + int ret = -ENOENT; + + + if (lockowner == NULL) + goto out; - do { - seq = read_seqbegin(&state->seqlock); - memcpy(dst, &state->stateid, sizeof(*dst)); - } while (read_seqretry(&state->seqlock, seq)); if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) - return; + goto out; + fl_owner = lockowner->l_owner; + fl_pid = lockowner->l_pid; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner); - if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); + if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) + ret = -EIO; + else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { + nfs4_stateid_copy(dst, &lsp->ls_stateid); + ret = 0; + } spin_unlock(&state->state_lock); nfs4_put_lock_state(lsp); +out: + return ret; } -struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) +static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +{ + const nfs4_stateid *src; + int seq; + + do { + src = &zero_stateid; + seq = read_seqbegin(&state->seqlock); + if (test_bit(NFS_OPEN_STATE, &state->flags)) + src = &state->open_stateid; + nfs4_stateid_copy(dst, src); + } while (read_seqretry(&state->seqlock, seq)); +} + +/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. 
+ */ +int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, + fmode_t fmode, const struct nfs_lockowner *lockowner) +{ + int ret = nfs4_copy_lock_stateid(dst, state, lockowner); + if (ret == -EIO) + /* A lost lock - don't even consider delegations */ + goto out; + /* returns true if delegation stateid found and copied */ + if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { + ret = 0; + goto out; + } + if (ret != -ENOENT) + /* nfs4_copy_delegation_stateid() didn't over-write + * dst, so it still has the lock stateid which we now + * choose to use. + */ + goto out; + nfs4_copy_open_stateid(dst, state); + ret = 0; +out: + if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) + dst->seqid = 0; + return ret; +} + +struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) { struct nfs_seqid *new; - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc(sizeof(*new), gfp_mask); if (new != NULL) { new->sequence = counter; INIT_LIST_HEAD(&new->list); + new->task = NULL; } return new; } -void nfs_free_seqid(struct nfs_seqid *seqid) +void nfs_release_seqid(struct nfs_seqid *seqid) { - if (!list_empty(&seqid->list)) { - struct rpc_sequence *sequence = seqid->sequence->sequence; + struct nfs_seqid_counter *sequence; + + if (list_empty(&seqid->list)) + return; + sequence = seqid->sequence; + spin_lock(&sequence->lock); + list_del_init(&seqid->list); + if (!list_empty(&sequence->list)) { + struct nfs_seqid *next; - spin_lock(&sequence->lock); - list_del(&seqid->list); - spin_unlock(&sequence->lock); - rpc_wake_up(&sequence->wait); + next = list_first_entry(&sequence->list, + struct nfs_seqid, list); + rpc_wake_up_queued_task(&sequence->wait, next->task); } + spin_unlock(&sequence->lock); +} + +void nfs_free_seqid(struct nfs_seqid *seqid) +{ + nfs_release_seqid(seqid); kfree(seqid); } /* * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or * failed with a seqid incrementing error - - * see comments 
nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { - BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid); switch (status) { case 0: break; case -NFS4ERR_BAD_SEQID: if (seqid->sequence->flags & NFS_SEQID_CONFIRMED) return; - printk(KERN_WARNING "NFS: v4 server returned a bad" + pr_warn_ratelimited("NFS: v4 server returned a bad" " sequence-id error on an" " unconfirmed sequence %p!\n", seqid->sequence); @@ -703,18 +1098,20 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) { - if (status == -NFS4ERR_BAD_SEQID) { - struct nfs4_state_owner *sp = container_of(seqid->sequence, - struct nfs4_state_owner, so_seqid); + struct nfs4_state_owner *sp = container_of(seqid->sequence, + struct nfs4_state_owner, so_seqid); + struct nfs_server *server = sp->so_server; + + if (status == -NFS4ERR_BAD_SEQID) nfs4_drop_state_owner(sp); - } - nfs_increment_seqid(status, seqid); + if (!nfs4_has_session(server->nfs_client)) + nfs_increment_seqid(status, seqid); } /* * Increment the seqid if the LOCK/LOCKU succeeded, or * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) { @@ -723,96 +1120,328 @@ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) { - struct rpc_sequence *sequence = seqid->sequence->sequence; + struct nfs_seqid_counter *sequence = seqid->sequence; int status = 0; spin_lock(&sequence->lock); + seqid->task = task; if (list_empty(&seqid->list)) list_add_tail(&seqid->list, &sequence->list); if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) goto unlock; - rpc_sleep_on(&sequence->wait, task, 
NULL, NULL); + rpc_sleep_on(&sequence->wait, task, NULL); status = -EAGAIN; unlock: spin_unlock(&sequence->lock); return status; } -static int reclaimer(void *); +static int nfs4_run_state_manager(void *); -static inline void nfs4_clear_recover_bit(struct nfs_client *clp) +static void nfs4_clear_state_manager_bit(struct nfs_client *clp) { - smp_mb__before_clear_bit(); - clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state); - smp_mb__after_clear_bit(); - wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER); + smp_mb__before_atomic(); + clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + smp_mb__after_atomic(); + wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); rpc_wake_up(&clp->cl_rpcwaitq); } /* - * State recovery routine + * Schedule the nfs_client asynchronous state management routine */ -static void nfs4_recover_state(struct nfs_client *clp) +void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; + char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; __module_get(THIS_MODULE); atomic_inc(&clp->cl_count); - task = kthread_run(reclaimer, clp, "%s-reclaim", - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); - if (!IS_ERR(task)) - return; - nfs4_clear_recover_bit(clp); - nfs_put_client(clp); - module_put(THIS_MODULE); + + /* The rcu_read_lock() is not strictly necessary, as the state + * manager is the only thread that ever changes the rpc_xprt + * after it's initialized. At this point, we're single threaded. 
*/ + rcu_read_lock(); + snprintf(buf, sizeof(buf), "%s-manager", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + task = kthread_run(nfs4_run_state_manager, clp, "%s", buf); + if (IS_ERR(task)) { + printk(KERN_ERR "%s: kthread_run: %ld\n", + __func__, PTR_ERR(task)); + nfs4_clear_state_manager_bit(clp); + nfs_put_client(clp); + module_put(THIS_MODULE); + } } /* - * Schedule a state recovery attempt + * Schedule a lease recovery attempt */ -void nfs4_schedule_state_recovery(struct nfs_client *clp) +void nfs4_schedule_lease_recovery(struct nfs_client *clp) { if (!clp) return; - if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) - nfs4_recover_state(clp); + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + dprintk("%s: scheduling lease recovery for server %s\n", __func__, + clp->cl_hostname); + nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); + +/** + * nfs4_schedule_migration_recovery - trigger migration recovery + * + * @server: FSID that is migrating + * + * Returns zero if recovery has started, otherwise a negative NFS4ERR + * value is returned. 
+ */ +int nfs4_schedule_migration_recovery(const struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + if (server->fh_expire_type != NFS4_FH_PERSISTENT) { + pr_err("NFS: volatile file handles not supported (server %s)\n", + clp->cl_hostname); + return -NFS4ERR_IO; + } + + if (test_bit(NFS_MIG_FAILED, &server->mig_status)) + return -NFS4ERR_IO; + + dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n", + __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + set_bit(NFS_MIG_IN_TRANSITION, + &((struct nfs_server *)server)->mig_status); + set_bit(NFS4CLNT_MOVED, &clp->cl_state); + + nfs4_schedule_state_manager(clp); + return 0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery); + +/** + * nfs4_schedule_lease_moved_recovery - start lease-moved recovery + * + * @clp: server to check for moved leases + * + */ +void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp) +{ + dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n", + __func__, clp->cl_clientid, clp->cl_hostname); + + set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery); + +int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ + int res; + + might_sleep(); + + atomic_inc(&clp->cl_count); + res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (res) + goto out; + if (clp->cl_cons_state < 0) + res = clp->cl_cons_state; +out: + nfs_put_client(clp); + return res; +} + +int nfs4_client_recover_expired_lease(struct nfs_client *clp) +{ + unsigned int loop; + int ret; + + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = nfs4_wait_clnt_recover(clp); + if (ret != 0) + break; + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) + break; + nfs4_schedule_state_manager(clp); + ret 
= -EIO; + } + return ret; } -static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state) +/* + * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * + * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a + * resend of the SETCLIENTID and hence re-establish the + * callback channel. Then return all existing delegations. + */ +static void nfs40_handle_cb_pathdown(struct nfs_client *clp) +{ + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs_expire_all_delegations(clp); + dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__, + clp->cl_hostname); +} + +void nfs4_schedule_path_down_recovery(struct nfs_client *clp) +{ + nfs40_handle_cb_pathdown(clp); + nfs4_schedule_state_manager(clp); +} + +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +{ + + set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + /* Don't recover state that expired before the reboot */ + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + return 0; + } + set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + return 1; +} + +int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +{ + set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); + return 1; +} + +int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + if (!nfs4_valid_open_stateid(state)) + return -EBADF; + nfs4_state_mark_reclaim_nograce(clp, state); + dprintk("%s: scheduling stateid recovery for server %s\n", __func__, + clp->cl_hostname); + nfs4_schedule_state_manager(clp); + return 
0; +} +EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); + +void nfs_inode_find_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + struct nfs4_state *state; + bool found = false; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + if (state == NULL) + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; + if (!nfs4_stateid_match(&state->stateid, stateid)) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + found = true; + } + spin_unlock(&inode->i_lock); + if (found) + nfs4_schedule_state_manager(clp); +} + +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) { struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + set_bit(NFS_CONTEXT_BAD, &ctx->flags); + } + spin_unlock(&inode->i_lock); +} + +static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) +{ + set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); + nfs4_state_mark_open_context_bad(state); +} + + +static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; int status = 0; + if (inode->i_flock == NULL) + return 0; + + /* Guard against delegation returns and new lock/unlock calls */ + down_write(&nfsi->rwsem); + /* Protect inode->i_flock using the BKL */ + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; if (nfs_file_open_context(fl->fl_file)->state != state) continue; + spin_unlock(&inode->i_lock); status = 
ops->recover_lock(state, fl); - if (status >= 0) - continue; switch (status) { - default: - printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", - __FUNCTION__, status); + case 0: + break; + case -ESTALE: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out; + default: + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); + case -ENOMEM: + case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ - break; - case -NFS4ERR_STALE_CLIENTID: - goto out_err; + status = 0; } + spin_lock(&inode->i_lock); } - return 0; -out_err: + spin_unlock(&inode->i_lock); +out: + up_write(&nfsi->rwsem); return status; } -static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct nfs4_state_owner *sp) +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) { struct nfs4_state *state; struct nfs4_lock_state *lock; @@ -826,155 +1455,1001 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n * recovering after a network partition or a reboot from a * server that doesn't support a grace period. 
*/ + spin_lock(&sp->so_lock); + raw_write_seqcount_begin(&sp->so_reclaim_seqcount); +restart: list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) + continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (state->state == 0) continue; + atomic_inc(&state->count); + spin_unlock(&sp->so_lock); status = ops->recover_open(sp, state); if (status >= 0) { - status = nfs4_reclaim_locks(ops, state); - if (status < 0) - goto out_err; - list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) - printk("%s: Lock reclaim failed!\n", - __FUNCTION__); + status = nfs4_reclaim_locks(state, ops); + if (status >= 0) { + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + pr_warn_ratelimited("NFS: " + "%s: Lock reclaim " + "failed!\n", __func__); + } + spin_unlock(&state->state_lock); + } + nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + goto restart; } - continue; } switch (status) { default: - printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", - __FUNCTION__, status); + printk(KERN_ERR "NFS: %s: unhandled error %d\n", + __func__, status); case -ENOENT: + case -ENOMEM: + case -ESTALE: + /* Open state on this file cannot be recovered */ + nfs4_state_mark_recovery_failed(state, status); + break; + case -EAGAIN: + ssleep(1); + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - /* - * Open state on this file cannot be recovered - * All we can do is revert to using the zero stateid. 
- */ - memset(state->stateid.data, 0, - sizeof(state->stateid.data)); - /* Mark the file as being 'closed' */ - state->state = 0; + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); break; case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out_err; } + nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + goto restart; } + raw_write_seqcount_end(&sp->so_reclaim_seqcount); + spin_unlock(&sp->so_lock); return 0; out_err: + nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + raw_write_seqcount_end(&sp->so_reclaim_seqcount); + spin_unlock(&sp->so_lock); return status; } -static void nfs4_state_mark_reclaim(struct nfs_client *clp) +static void nfs4_clear_open_state(struct nfs4_state *state) { + struct nfs4_lock_state *lock; + + clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + lock->ls_seqid.flags = 0; + clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags); + } + spin_unlock(&state->state_lock); +} + +static void nfs4_reset_seqids(struct nfs_server *server, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) +{ + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; - struct nfs4_lock_state *lock; - /* Reset all sequence ids to zero */ - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - sp->so_seqid.counter = 0; + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = 
rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); sp->so_seqid.flags = 0; spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { - clear_bit(NFS_DELEGATED_STATE, &state->flags); - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDWR_STATE, &state->flags); - list_for_each_entry(lock, &state->lock_states, ls_locks) { - lock->ls_seqid.counter = 0; - lock->ls_seqid.flags = 0; - lock->ls_flags &= ~NFS_LOCK_INITIALIZED; - } + if (mark_reclaim(clp, state)) + nfs4_clear_open_state(state); } spin_unlock(&sp->so_lock); } + spin_unlock(&clp->cl_lock); } -static int reclaimer(void *ptr) +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, + int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) { - struct nfs_client *clp = ptr; + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_reset_seqids(server, mark_reclaim); + rcu_read_unlock(); +} + +static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) +{ + /* Mark all delegations for reclaim */ + nfs_delegation_mark_reclaim(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); +} + +static void nfs4_reclaim_complete(struct nfs_client *clp, + const struct nfs4_state_recovery_ops *ops, + struct rpc_cred *cred) +{ + /* Notify the server we're done reclaiming our state */ + if (ops->reclaim_complete) + (void)ops->reclaim_complete(clp, cred); +} + +static void nfs4_clear_reclaim_server(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp; struct rb_node *pos; - struct nfs4_state_recovery_ops *ops; + struct nfs4_state *state; + + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); + spin_lock(&sp->so_lock); + 
list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, + &state->flags)) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + } + spin_unlock(&sp->so_lock); + } + spin_unlock(&clp->cl_lock); +} + +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) +{ + struct nfs_server *server; + + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs4_clear_reclaim_server(server); + rcu_read_unlock(); + + nfs_delegation_reap_unclaimed(clp); + return 1; +} + +static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +{ + const struct nfs4_state_recovery_ops *ops; struct rpc_cred *cred; + + if (!nfs4_state_clear_reclaim_reboot(clp)) + return; + ops = clp->cl_mvops->reboot_recovery_ops; + cred = nfs4_get_clid_cred(clp); + nfs4_reclaim_complete(clp, ops, cred); + put_rpccred(cred); +} + +static void nfs_delegation_clear_all(struct nfs_client *clp) +{ + nfs_delegation_mark_reclaim(clp); + nfs_delegation_reap_unclaimed(clp); +} + +static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) +{ + nfs_delegation_clear_all(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); +} + +static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) +{ + switch (error) { + case 0: + break; + case -NFS4ERR_CB_PATH_DOWN: + nfs40_handle_cb_pathdown(clp); + break; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + break; + case -NFS4ERR_STALE_CLIENTID: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_clear_reclaim_reboot(clp); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case 
-NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; + default: + dprintk("%s: failed to handle error %d for server %s\n", + __func__, error, clp->cl_hostname); + return error; + } + dprintk("%s: handled error %d for server %s\n", __func__, error, + clp->cl_hostname); + return 0; +} + +static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_state_owner *sp; + struct nfs_server *server; + struct rb_node *pos; int status = 0; - allow_signal(SIGKILL); +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + nfs4_purge_state_owners(server); + spin_lock(&clp->cl_lock); + for (pos = rb_first(&server->state_owners); + pos != NULL; + pos = rb_next(pos)) { + sp = rb_entry(pos, + struct nfs4_state_owner, so_server_node); + if (!test_and_clear_bit(ops->owner_flag_bit, + &sp->so_flags)) + continue; + atomic_inc(&sp->so_count); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + + status = nfs4_reclaim_open_state(sp, ops); + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); + return nfs4_recovery_handle_error(clp, status); + } - /* Ensure exclusive access to NFSv4 state */ - lock_kernel(); - down_write(&clp->cl_sem); - /* Are there any NFS mounts out there? */ - if (list_empty(&clp->cl_superblocks)) - goto out; -restart_loop: - ops = &nfs4_network_partition_recovery_ops; - /* Are there any open files on this volume? 
*/ - cred = nfs4_get_renew_cred(clp); - if (cred != NULL) { - /* Yes there are: try to renew the old lease */ - status = nfs4_proc_renew(clp, cred); - switch (status) { - case 0: - case -NFS4ERR_CB_PATH_DOWN: - put_rpccred(cred); - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_LEASE_MOVED: - ops = &nfs4_reboot_recovery_ops; + nfs4_put_state_owner(sp); + goto restart; } - } else { - /* "reboot" to ensure we clear all state on the server */ - clp->cl_boot_time = CURRENT_TIME; - cred = nfs4_get_setclientid_cred(clp); - } - /* We're going to have to re-establish a clientid */ - nfs4_state_mark_reclaim(clp); - status = -ENOENT; - if (cred != NULL) { - status = nfs4_init_client(clp, cred); - put_rpccred(cred); + spin_unlock(&clp->cl_lock); } - if (status) - goto out_error; - /* Mark all delegations for reclaim */ - nfs_delegation_mark_reclaim(clp); - /* Note: list is protected by exclusive lock on cl->cl_sem */ - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - status = nfs4_reclaim_open_state(ops, sp); + rcu_read_unlock(); + return status; +} + +static int nfs4_check_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + int status; + + /* Is the client already known to have an expired lease? 
*/ + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + return 0; + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) { + cred = nfs4_get_clid_cred(clp); + status = -ENOKEY; + if (cred == NULL) + goto out; + } + status = ops->renew_lease(clp, cred); + put_rpccred(cred); + if (status == -ETIMEDOUT) { + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + return 0; + } +out: + return nfs4_recovery_handle_error(clp, status); +} + +/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors + * and for recoverable errors on EXCHANGE_ID for v4.1 + */ +static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) +{ + switch (status) { + case -NFS4ERR_SEQ_MISORDERED: + if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) + return -ESERVERFAULT; + /* Lease confirmation error: retry after purging the lease */ + ssleep(1); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + break; + case -NFS4ERR_STALE_CLIENTID: + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_state_clear_reclaim_reboot(clp); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_CLID_INUSE: + pr_err("NFS: Server %s reports our clientid is in use\n", + clp->cl_hostname); + nfs_mark_client_ready(clp, -EPERM); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + return -EPERM; + case -EACCES: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EAGAIN: + ssleep(1); + break; + + case -NFS4ERR_MINOR_VERS_MISMATCH: + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, -EPROTONOSUPPORT); + dprintk("%s: exit with error %d for server %s\n", + __func__, -EPROTONOSUPPORT, clp->cl_hostname); + return -EPROTONOSUPPORT; + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + default: + dprintk("%s: exit with error %d for server %s\n", __func__, + status, clp->cl_hostname); + return status; + } + set_bit(NFS4CLNT_LEASE_EXPIRED, 
&clp->cl_state); + dprintk("%s: handled error %d for server %s\n", __func__, status, + clp->cl_hostname); + return 0; +} + +static int nfs4_establish_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; + int status; + + cred = nfs4_get_clid_cred(clp); + if (cred == NULL) + return -ENOENT; + status = ops->establish_clid(clp, cred); + put_rpccred(cred); + if (status != 0) + return status; + pnfs_destroy_all_layouts(clp); + return 0; +} + +/* + * Returns zero or a negative errno. NFS4ERR values are converted + * to local errno values. + */ +static int nfs4_reclaim_lease(struct nfs_client *clp) +{ + int status; + + status = nfs4_establish_lease(clp); + if (status < 0) + return nfs4_handle_reclaim_lease_error(clp, status); + if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state)) + nfs4_state_start_reclaim_nograce(clp); + if (!test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + return 0; +} + +static int nfs4_purge_lease(struct nfs_client *clp) +{ + int status; + + status = nfs4_establish_lease(clp); + if (status < 0) + return nfs4_handle_reclaim_lease_error(clp, status); + clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + return 0; +} + +/* + * Try remote migration of one FSID from a source server to a + * destination server. The source server provides a list of + * potential destinations. + * + * Returns zero or a negative NFS4ERR status code. 
+ */ +static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_fs_locations *locations = NULL; + struct inode *inode; + struct page *page; + int status, result; + + dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__, + (unsigned long long)server->fsid.major, + (unsigned long long)server->fsid.minor, + clp->cl_hostname); + + result = 0; + page = alloc_page(GFP_KERNEL); + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (page == NULL || locations == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } + + inode = server->super->s_root->d_inode; + result = nfs4_proc_get_locations(inode, locations, page, cred); + if (result) { + dprintk("<-- %s: failed to retrieve fs_locations: %d\n", + __func__, result); + goto out; + } + + result = -NFS4ERR_NXIO; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + dprintk("<-- %s: No fs_locations data, migration skipped\n", + __func__); + goto out; + } + + nfs4_begin_drain_session(clp); + + status = nfs4_replace_transport(server, locations); + if (status != 0) { + dprintk("<-- %s: failed to replace transport: %d\n", + __func__, status); + goto out; + } + + result = 0; + dprintk("<-- %s: migration succeeded\n", __func__); + +out: + if (page != NULL) + __free_page(page); + kfree(locations); + if (result) { + pr_err("NFS: migration recovery failed (server %s)\n", + clp->cl_hostname); + set_bit(NFS_MIG_FAILED, &server->mig_status); + } + return result; +} + +/* + * Returns zero or a negative NFS4ERR status code. 
+ */ +static int nfs4_handle_migration(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: migration reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION, + &server->mig_status)) + continue; + + rcu_read_unlock(); + status = nfs4_try_migration(server, cred); if (status < 0) { - if (status == -NFS4ERR_NO_GRACE) { - ops = &nfs4_network_partition_recovery_ops; - status = nfs4_reclaim_open_state(ops, sp); - } - if (status == -NFS4ERR_STALE_CLIENTID) - goto restart_loop; - if (status == -NFS4ERR_EXPIRED) - goto restart_loop; + put_rpccred(cred); + return status; } + goto restart; } - nfs_delegation_reap_unclaimed(clp); + rcu_read_unlock(); + put_rpccred(cred); + return 0; +} + +/* + * Test each nfs_server on the clp's cl_superblocks list to see + * if it's moved to another server. Stop when the server no longer + * returns NFS4ERR_LEASE_MOVED. 
+ */ +static int nfs4_handle_lease_moved(struct nfs_client *clp) +{ + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + struct nfs_server *server; + struct rpc_cred *cred; + + dprintk("%s: lease moved reported on \"%s\"\n", __func__, + clp->cl_hostname); + + spin_lock(&clp->cl_lock); + cred = ops->get_state_renewal_cred_locked(clp); + spin_unlock(&clp->cl_lock); + if (cred == NULL) + return -NFS4ERR_NOENT; + + clp->cl_mig_gen++; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + struct inode *inode; + int status; + + if (server->mig_gen == clp->cl_mig_gen) + continue; + server->mig_gen = clp->cl_mig_gen; + + rcu_read_unlock(); + + inode = server->super->s_root->d_inode; + status = nfs4_proc_fsid_present(inode, cred); + if (status != -NFS4ERR_MOVED) + goto restart; /* wasn't this one */ + if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED) + goto restart; /* there are more */ + goto out; + } + rcu_read_unlock(); + out: - up_write(&clp->cl_sem); - unlock_kernel(); - if (status == -NFS4ERR_CB_PATH_DOWN) - nfs_handle_cb_pathdown(clp); - nfs4_clear_recover_bit(clp); + put_rpccred(cred); + return 0; +} + +/** + * nfs4_discover_server_trunking - Detect server IP address trunking + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * + * Returns zero or a negative errno. If zero is returned, + * an nfs_client pointer is planted in "result". + * + * Note: since we are invoked in process context, and + * not from inside the state manager, we cannot use + * nfs4_handle_reclaim_lease_error(). 
+ */ +int nfs4_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result) +{ + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; + struct rpc_clnt *clnt; + struct rpc_cred *cred; + int i, status; + + dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); + + clnt = clp->cl_rpcclient; + i = 0; + + mutex_lock(&nfs_clid_init_mutex); +again: + status = -ENOENT; + cred = nfs4_get_clid_cred(clp); + if (cred == NULL) + goto out_unlock; + + status = ops->detect_trunking(clp, result, cred); + put_rpccred(cred); + switch (status) { + case 0: + break; + case -ETIMEDOUT: + if (clnt->cl_softrtry) + break; + case -NFS4ERR_DELAY: + case -EAGAIN: + ssleep(1); + case -NFS4ERR_STALE_CLIENTID: + dprintk("NFS: %s after status %d, retrying\n", + __func__, status); + goto again; + case -EACCES: + if (i++ == 0) { + nfs4_root_machine_cred(clp); + goto again; + } + if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) + break; + case -NFS4ERR_CLID_INUSE: + case -NFS4ERR_WRONGSEC: + /* No point in retrying if we already used RPC_AUTH_UNIX */ + if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) { + status = -EPERM; + break; + } + clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + break; + } + /* Note: this is safe because we haven't yet marked the + * client as ready, so we are the only user of + * clp->cl_rpcclient + */ + clnt = xchg(&clp->cl_rpcclient, clnt); + rpc_shutdown_client(clnt); + clnt = clp->cl_rpcclient; + goto again; + + case -NFS4ERR_MINOR_VERS_MISMATCH: + status = -EPROTONOSUPPORT; + break; + + case -EKEYEXPIRED: + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + status = -EKEYEXPIRED; + break; + default: + pr_warn("NFS: %s unhandled error %d. 
Exiting with error EIO\n", + __func__, status); + status = -EIO; + } + +out_unlock: + mutex_unlock(&nfs_clid_init_mutex); + dprintk("NFS: %s: status = %d\n", __func__, status); + return status; +} + +#ifdef CONFIG_NFS_V4_1 +void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) +{ + struct nfs_client *clp = session->clp; + + switch (err) { + default: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + } + nfs4_schedule_lease_recovery(clp); +} +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); + +static void nfs41_ping_server(struct nfs_client *clp) +{ + /* Use CHECK_LEASE to ping the server with a SEQUENCE */ + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + nfs4_schedule_state_manager(clp); +} + +void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + +void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) +{ + nfs41_ping_server(clp); +} + +static void nfs4_reset_all_state(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + dprintk("%s: scheduling reset of all state for server %s!\n", + __func__, clp->cl_hostname); + nfs4_schedule_state_manager(clp); + } +} + +static void nfs41_handle_server_reboot(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + nfs4_state_start_reclaim_reboot(clp); + dprintk("%s: server %s rebooted!\n", __func__, + clp->cl_hostname); + nfs4_schedule_state_manager(clp); + } +} + +static void nfs41_handle_state_revoked(struct nfs_client *clp) +{ + nfs4_reset_all_state(clp); + dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); +} + +static void nfs41_handle_recallable_state_revoked(struct 
nfs_client *clp) +{ + /* This will need to handle layouts too */ + nfs_expire_all_delegations(clp); + dprintk("%s: Recallable state revoked on server %s!\n", __func__, + clp->cl_hostname); +} + +static void nfs41_handle_backchannel_fault(struct nfs_client *clp) +{ + nfs_expire_all_delegations(clp); + if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) + nfs4_schedule_state_manager(clp); + dprintk("%s: server %s declared a backchannel fault\n", __func__, + clp->cl_hostname); +} + +static void nfs41_handle_cb_path_down(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, + &clp->cl_state) == 0) + nfs4_schedule_state_manager(clp); +} + +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) +{ + if (!flags) + return; + + dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n", + __func__, clp->cl_hostname, clp->cl_clientid, flags); + + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + nfs41_handle_server_reboot(clp); + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + SEQ4_STATUS_ADMIN_STATE_REVOKED)) + nfs41_handle_state_revoked(clp); + if (flags & SEQ4_STATUS_LEASE_MOVED) + nfs4_schedule_lease_moved_recovery(clp); + if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + nfs41_handle_recallable_state_revoked(clp); + if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) + nfs41_handle_backchannel_fault(clp); + else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_CB_PATH_DOWN_SESSION)) + nfs41_handle_cb_path_down(clp); +} + +static int nfs4_reset_session(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int status; + + if (!nfs4_has_session(clp)) + return 0; + nfs4_begin_drain_session(clp); + cred = nfs4_get_clid_cred(clp); + status = nfs4_proc_destroy_session(clp->cl_session, cred); + switch (status) { + case 0: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + break; + case -NFS4ERR_BACK_CHAN_BUSY: + case -NFS4ERR_DELAY: + set_bit(NFS4CLNT_SESSION_RESET, 
&clp->cl_state); + status = 0; + ssleep(1); + goto out; + default: + status = nfs4_recovery_handle_error(clp, status); + goto out; + } + + memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); + status = nfs4_proc_create_session(clp, cred); + if (status) { + dprintk("%s: session reset failed with status %d for server %s!\n", + __func__, status, clp->cl_hostname); + status = nfs4_handle_reclaim_lease_error(clp, status); + goto out; + } + nfs41_finish_session_reset(clp); + dprintk("%s: session reset was successful for server %s!\n", + __func__, clp->cl_hostname); +out: + if (cred) + put_rpccred(cred); + return status; +} + +static int nfs4_bind_conn_to_session(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int ret; + + if (!nfs4_has_session(clp)) + return 0; + nfs4_begin_drain_session(clp); + cred = nfs4_get_clid_cred(clp); + ret = nfs4_proc_bind_conn_to_session(clp, cred); + if (cred) + put_rpccred(cred); + clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + switch (ret) { + case 0: + dprintk("%s: bind_conn_to_session was successful for server %s!\n", + __func__, clp->cl_hostname); + break; + case -NFS4ERR_DELAY: + ssleep(1); + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; + default: + return nfs4_recovery_handle_error(clp, ret); + } + return 0; +} +#else /* CONFIG_NFS_V4_1 */ +static int nfs4_reset_session(struct nfs_client *clp) { return 0; } + +static int nfs4_bind_conn_to_session(struct nfs_client *clp) +{ + return 0; +} +#endif /* CONFIG_NFS_V4_1 */ + +static void nfs4_state_manager(struct nfs_client *clp) +{ + int status = 0; + const char *section = "", *section_sep = ""; + + /* Ensure exclusive access to NFSv4 state */ + do { + if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { + section = "purge state"; + status = nfs4_purge_lease(clp); + if (status < 0) + goto out_error; + continue; + } + + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { + section = "lease expired"; + /* We're going to have to 
re-establish a clientid */ + status = nfs4_reclaim_lease(clp); + if (status < 0) + goto out_error; + continue; + } + + /* Initialize or reset the session */ + if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { + section = "reset session"; + status = nfs4_reset_session(clp); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + /* Send BIND_CONN_TO_SESSION */ + if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, + &clp->cl_state)) { + section = "bind conn to session"; + status = nfs4_bind_conn_to_session(clp); + if (status < 0) + goto out_error; + continue; + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + section = "check lease"; + status = nfs4_check_lease(clp); + if (status < 0) + goto out_error; + } + + if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { + section = "migration"; + status = nfs4_handle_migration(clp); + if (status < 0) + goto out_error; + } + + if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) { + section = "lease moved"; + status = nfs4_handle_lease_moved(clp); + if (status < 0) + goto out_error; + } + + /* First recover reboot state... */ + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + section = "reclaim reboot"; + status = nfs4_do_reclaim(clp, + clp->cl_mvops->reboot_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + continue; + nfs4_state_end_reclaim_reboot(clp); + if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + /* Now recover expired state... 
*/ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + section = "reclaim nograce"; + status = nfs4_do_reclaim(clp, + clp->cl_mvops->nograce_recovery_ops); + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || + test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || + test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + continue; + if (status < 0) + goto out_error; + } + + nfs4_end_drain_session(clp); + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + continue; + } + + nfs4_clear_state_manager_bit(clp); + /* Did we race with an attempt to give us more work? */ + if (clp->cl_state == 0) + break; + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + break; + } while (atomic_read(&clp->cl_count) > 1); + return; +out_error: + if (strlen(section)) + section_sep = ": "; + pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" + " with error %d\n", section_sep, section, + clp->cl_hostname, -status); + ssleep(1); + nfs4_end_drain_session(clp); + nfs4_clear_state_manager_bit(clp); +} + +static int nfs4_run_state_manager(void *ptr) +{ + struct nfs_client *clp = ptr; + + allow_signal(SIGKILL); + nfs4_state_manager(clp); nfs_put_client(clp); module_put_and_exit(0); return 0; -out_error: - printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s" - " with error %d\n", clp->cl_hostname, -status); - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - goto out; } /* diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c new file mode 100644 index 00000000000..6f340f02f2b --- /dev/null +++ b/fs/nfs/nfs4super.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2012 Bryan Schumaker <bjschuma@netapp.com> + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs4_mount.h> +#include <linux/nfs_fs.h> +#include "delegation.h" +#include "internal.h" +#include "nfs4_fs.h" +#include "dns_resolve.h" +#include "pnfs.h" +#include 
"nfs.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc); +static void nfs4_evict_inode(struct inode *inode); +static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); + +static struct file_system_type nfs4_remote_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +static struct file_system_type nfs4_remote_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_remote_referral_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs4_referral_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .mount = nfs4_referral_mount, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; + +static const struct super_operations nfs4_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = nfs4_write_inode, + .drop_inode = nfs_drop_inode, + .put_super = nfs_put_super, + .statfs = nfs_statfs, + .evict_inode = nfs4_evict_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, + .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, +}; + +struct nfs_subversion nfs_v4 = { + .owner = THIS_MODULE, + .nfs_fs = &nfs4_fs_type, + .rpc_vers = &nfs_version4, + .rpc_ops = &nfs_v4_clientops, + .sops = &nfs4_sops, + .xattr = nfs4_xattr_handlers, +}; + +static int nfs4_write_inode(struct inode *inode, struct 
writeback_control *wbc) +{ + int ret = nfs_write_inode(inode, wbc); + + if (ret == 0) + ret = pnfs_layoutcommit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL); + return ret; +} + +/* + * Clean out any remaining NFSv4 state that might be left over due + * to open() calls that passed nfs_atomic_lookup, but failed to call + * nfs_open(). + */ +static void nfs4_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); + /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); +} + +/* + * Get the superblock for the NFS4 root partition + */ +static struct dentry * +nfs4_remote_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *info) +{ + struct nfs_mount_info *mount_info = info; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + + mount_info->set_security = nfs_set_sb_security; + + /* Get a volume representation */ + server = nfs4_create_server(mount_info, &nfs_v4); + if (IS_ERR(server)) { + mntroot = ERR_CAST(server); + goto out; + } + + mntroot = nfs_fs_mount_common(server, flags, dev_name, mount_info, &nfs_v4); + +out: + return mntroot; +} + +static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, + int flags, void *data, const char *hostname) +{ + struct vfsmount *root_mnt; + char *root_devname; + size_t len; + + len = strlen(hostname) + 5; + root_devname = kmalloc(len, GFP_KERNEL); + if (root_devname == NULL) + return ERR_PTR(-ENOMEM); + /* Does hostname needs to be enclosed in brackets? 
*/ + if (strchr(hostname, ':')) + snprintf(root_devname, len, "[%s]:/", hostname); + else + snprintf(root_devname, len, "%s:/", hostname); + root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); + kfree(root_devname); + return root_mnt; +} + +struct nfs_referral_count { + struct list_head list; + const struct task_struct *task; + unsigned int referral_count; +}; + +static LIST_HEAD(nfs_referral_count_list); +static DEFINE_SPINLOCK(nfs_referral_count_list_lock); + +static struct nfs_referral_count *nfs_find_referral_count(void) +{ + struct nfs_referral_count *p; + + list_for_each_entry(p, &nfs_referral_count_list, list) { + if (p->task == current) + return p; + } + return NULL; +} + +#define NFS_MAX_NESTED_REFERRALS 2 + +static int nfs_referral_loop_protect(void) +{ + struct nfs_referral_count *p, *new; + int ret = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + new->task = current; + new->referral_count = 1; + + ret = 0; + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + if (p != NULL) { + if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) + ret = -ELOOP; + else + p->referral_count++; + } else { + list_add(&new->list, &nfs_referral_count_list); + new = NULL; + } + spin_unlock(&nfs_referral_count_list_lock); + kfree(new); +out: + return ret; +} + +static void nfs_referral_loop_unprotect(void) +{ + struct nfs_referral_count *p; + + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + p->referral_count--; + if (p->referral_count == 0) + list_del(&p->list); + else + p = NULL; + spin_unlock(&nfs_referral_count_list_lock); + kfree(p); +} + +static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, + const char *export_path) +{ + struct dentry *dentry; + int err; + + if (IS_ERR(root_mnt)) + return ERR_CAST(root_mnt); + + err = nfs_referral_loop_protect(); + if (err) { + mntput(root_mnt); + return ERR_PTR(err); + } + + dentry = mount_subtree(root_mnt, export_path); + 
nfs_referral_loop_unprotect(); + + return dentry; +} + +struct dentry *nfs4_try_mount(int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + struct nfs_parsed_mount_data *data = mount_info->parsed; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + res = nfs_follow_remote_path(root_mnt, export_path); + + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); + return res; +} + +static struct dentry * +nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_mount_info mount_info = { + .fill_super = nfs_fill_super, + .set_security = nfs_clone_sb_security, + .cloned = raw_data, + }; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + + dprintk("--> nfs4_referral_get_sb()\n"); + + mount_info.mntfh = nfs_alloc_fhandle(); + if (mount_info.cloned == NULL || mount_info.mntfh == NULL) + goto out; + + /* create a new volume representation */ + server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh); + if (IS_ERR(server)) { + mntroot = ERR_CAST(server); + goto out; + } + + mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, &nfs_v4); +out: + nfs_free_fhandle(mount_info.mntfh); + return mntroot; +} + +/* + * Create an NFS4 server record on referral traversal + */ +static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_clone_mount *data = raw_data; + char *export_path; + struct vfsmount *root_mnt; + struct dentry *res; + + dprintk("--> 
nfs4_referral_mount()\n"); + + export_path = data->mnt_path; + data->mnt_path = "/"; + + root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, + flags, data, data->hostname); + data->mnt_path = export_path; + + res = nfs_follow_remote_path(root_mnt, export_path); + dprintk("<-- nfs4_referral_mount() = %d%s\n", + PTR_ERR_OR_ZERO(res), + IS_ERR(res) ? " [error]" : ""); + return res; +} + + +static int __init init_nfs_v4(void) +{ + int err; + + err = nfs_dns_resolver_init(); + if (err) + goto out; + + err = nfs_idmap_init(); + if (err) + goto out1; + + err = nfs4_register_sysctl(); + if (err) + goto out2; + + register_nfs_version(&nfs_v4); + return 0; +out2: + nfs_idmap_quit(); +out1: + nfs_dns_resolver_destroy(); +out: + return err; +} + +static void __exit exit_nfs_v4(void) +{ + unregister_nfs_version(&nfs_v4); + nfs4_unregister_sysctl(); + nfs_idmap_quit(); + nfs_dns_resolver_destroy(); +} + +MODULE_LICENSE("GPL"); + +module_init(init_nfs_v4); +module_exit(exit_nfs_v4); diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c new file mode 100644 index 00000000000..b6ebe7e445f --- /dev/null +++ b/fs/nfs/nfs4sysctl.c @@ -0,0 +1,69 @@ +/* + * linux/fs/nfs/nfs4sysctl.c + * + * Sysctl interface to NFS v4 parameters + * + * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/sysctl.h> +#include <linux/nfs_idmap.h> +#include <linux/nfs_fs.h> + +#include "nfs4_fs.h" +#include "callback.h" + +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +static struct ctl_table_header *nfs4_callback_sysctl_table; + +static struct ctl_table nfs4_cb_sysctls[] = { + { + .procname = "nfs_callback_tcpport", + .data = &nfs_callback_set_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (int *)&nfs_set_port_min, + .extra2 = (int *)&nfs_set_port_max, + }, + { + .procname = "idmap_cache_timeout", + .data = &nfs_idmap_cache_timeout, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +static struct ctl_table nfs4_cb_sysctl_dir[] = { + { + .procname = "nfs", + .mode = 0555, + .child = nfs4_cb_sysctls, + }, + { } +}; + +static struct ctl_table nfs4_cb_sysctl_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = nfs4_cb_sysctl_dir, + }, + { } +}; + +int nfs4_register_sysctl(void) +{ + nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root); + if (nfs4_callback_sysctl_table == NULL) + return -ENOMEM; + return 0; +} + +void nfs4_unregister_sysctl(void) +{ + unregister_sysctl_table(nfs4_callback_sysctl_table); + nfs4_callback_sysctl_table = NULL; +} diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c new file mode 100644 index 00000000000..d774335cc8b --- /dev/null +++ b/fs/nfs/nfs4trace.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "callback.h" + +#define CREATE_TRACE_POINTS +#include "nfs4trace.h" + +#ifdef CONFIG_NFS_V4_1 +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds); +#endif diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h new file mode 100644 index 00000000000..0a744f3a86f --- /dev/null +++ b/fs/nfs/nfs4trace.h @@ -0,0 +1,1148 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs4 + +#if !defined(_TRACE_NFS4_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS4_H + +#include <linux/tracepoint.h> + +#define show_nfsv4_errors(error) \ + __print_symbolic(error, \ + { NFS4_OK, "OK" }, \ + /* Mapped by nfs4_stat_to_errno() */ \ + { -EPERM, "EPERM" }, \ + { -ENOENT, "ENOENT" }, \ + { -EIO, "EIO" }, \ + { -ENXIO, "ENXIO" }, \ + { -EACCES, "EACCES" }, \ + { -EEXIST, "EEXIST" }, \ + { -EXDEV, "EXDEV" }, \ + { -ENOTDIR, "ENOTDIR" }, \ + { 
-EISDIR, "EISDIR" }, \ + { -EFBIG, "EFBIG" }, \ + { -ENOSPC, "ENOSPC" }, \ + { -EROFS, "EROFS" }, \ + { -EMLINK, "EMLINK" }, \ + { -ENAMETOOLONG, "ENAMETOOLONG" }, \ + { -ENOTEMPTY, "ENOTEMPTY" }, \ + { -EDQUOT, "EDQUOT" }, \ + { -ESTALE, "ESTALE" }, \ + { -EBADHANDLE, "EBADHANDLE" }, \ + { -EBADCOOKIE, "EBADCOOKIE" }, \ + { -ENOTSUPP, "ENOTSUPP" }, \ + { -ETOOSMALL, "ETOOSMALL" }, \ + { -EREMOTEIO, "EREMOTEIO" }, \ + { -EBADTYPE, "EBADTYPE" }, \ + { -EAGAIN, "EAGAIN" }, \ + { -ELOOP, "ELOOP" }, \ + { -EOPNOTSUPP, "EOPNOTSUPP" }, \ + { -EDEADLK, "EDEADLK" }, \ + /* RPC errors */ \ + { -ENOMEM, "ENOMEM" }, \ + { -EKEYEXPIRED, "EKEYEXPIRED" }, \ + { -ETIMEDOUT, "ETIMEDOUT" }, \ + { -ERESTARTSYS, "ERESTARTSYS" }, \ + { -ECONNREFUSED, "ECONNREFUSED" }, \ + { -ECONNRESET, "ECONNRESET" }, \ + { -ENETUNREACH, "ENETUNREACH" }, \ + { -EHOSTUNREACH, "EHOSTUNREACH" }, \ + { -EHOSTDOWN, "EHOSTDOWN" }, \ + { -EPIPE, "EPIPE" }, \ + { -EPFNOSUPPORT, "EPFNOSUPPORT" }, \ + { -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \ + /* NFSv4 native errors */ \ + { -NFS4ERR_ACCESS, "ACCESS" }, \ + { -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \ + { -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \ + { -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \ + { -NFS4ERR_BADCHAR, "BADCHAR" }, \ + { -NFS4ERR_BADHANDLE, "BADHANDLE" }, \ + { -NFS4ERR_BADIOMODE, "BADIOMODE" }, \ + { -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \ + { -NFS4ERR_BADLABEL, "BADLABEL" }, \ + { -NFS4ERR_BADNAME, "BADNAME" }, \ + { -NFS4ERR_BADOWNER, "BADOWNER" }, \ + { -NFS4ERR_BADSESSION, "BADSESSION" }, \ + { -NFS4ERR_BADSLOT, "BADSLOT" }, \ + { -NFS4ERR_BADTYPE, "BADTYPE" }, \ + { -NFS4ERR_BADXDR, "BADXDR" }, \ + { -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \ + { -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \ + { -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \ + { -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \ + { -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \ + { -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \ + { -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ + { 
-NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \ + { -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \ + { -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \ + { -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \ + "CONN_NOT_BOUND_TO_SESSION" }, \ + { -NFS4ERR_DEADLOCK, "DEADLOCK" }, \ + { -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \ + { -NFS4ERR_DELAY, "DELAY" }, \ + { -NFS4ERR_DELEG_ALREADY_WANTED, \ + "DELEG_ALREADY_WANTED" }, \ + { -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \ + { -NFS4ERR_DENIED, "DENIED" }, \ + { -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \ + { -NFS4ERR_DQUOT, "DQUOT" }, \ + { -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \ + { -NFS4ERR_EXIST, "EXIST" }, \ + { -NFS4ERR_EXPIRED, "EXPIRED" }, \ + { -NFS4ERR_FBIG, "FBIG" }, \ + { -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \ + { -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \ + { -NFS4ERR_GRACE, "GRACE" }, \ + { -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \ + { -NFS4ERR_INVAL, "INVAL" }, \ + { -NFS4ERR_IO, "IO" }, \ + { -NFS4ERR_ISDIR, "ISDIR" }, \ + { -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \ + { -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \ + { -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \ + { -NFS4ERR_LOCKED, "LOCKED" }, \ + { -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \ + { -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \ + { -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \ + { -NFS4ERR_MLINK, "MLINK" }, \ + { -NFS4ERR_MOVED, "MOVED" }, \ + { -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \ + { -NFS4ERR_NOENT, "NOENT" }, \ + { -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \ + { -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \ + { -NFS4ERR_NOSPC, "NOSPC" }, \ + { -NFS4ERR_NOTDIR, "NOTDIR" }, \ + { -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \ + { -NFS4ERR_NOTSUPP, "NOTSUPP" }, \ + { -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \ + { -NFS4ERR_NOT_SAME, "NOT_SAME" }, \ + { -NFS4ERR_NO_GRACE, "NO_GRACE" }, \ + { -NFS4ERR_NXIO, "NXIO" }, \ + { -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \ + { -NFS4ERR_OPENMODE, "OPENMODE" }, \ + { -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \ 
+ { -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \ + { -NFS4ERR_PERM, "PERM" }, \ + { -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \ + { -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \ + { -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \ + { -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \ + { -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \ + { -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \ + { -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \ + { -NFS4ERR_REP_TOO_BIG_TO_CACHE, \ + "REP_TOO_BIG_TO_CACHE" }, \ + { -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \ + { -NFS4ERR_RESOURCE, "RESOURCE" }, \ + { -NFS4ERR_RESTOREFH, "RESTOREFH" }, \ + { -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \ + { -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \ + { -NFS4ERR_ROFS, "ROFS" }, \ + { -NFS4ERR_SAME, "SAME" }, \ + { -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \ + { -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \ + { -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \ + { -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \ + { -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \ + { -NFS4ERR_STALE, "STALE" }, \ + { -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \ + { -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \ + { -NFS4ERR_SYMLINK, "SYMLINK" }, \ + { -NFS4ERR_TOOSMALL, "TOOSMALL" }, \ + { -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \ + { -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \ + { -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \ + { -NFS4ERR_WRONGSEC, "WRONGSEC" }, \ + { -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \ + { -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \ + { -NFS4ERR_XDEV, "XDEV" }) + +#define show_open_flags(flags) \ + __print_flags(flags, "|", \ + { O_CREAT, "O_CREAT" }, \ + { O_EXCL, "O_EXCL" }, \ + { O_TRUNC, "O_TRUNC" }, \ + { O_DIRECT, "O_DIRECT" }) + +#define show_fmode_flags(mode) \ + __print_flags(mode, "|", \ + { ((__force unsigned long)FMODE_READ), "READ" }, \ + { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ + { ((__force unsigned long)FMODE_EXEC), "EXEC" }) + +#define show_nfs_fattr_flags(valid) \ 
+ __print_flags((unsigned long)valid, "|", \ + { NFS_ATTR_FATTR_TYPE, "TYPE" }, \ + { NFS_ATTR_FATTR_MODE, "MODE" }, \ + { NFS_ATTR_FATTR_NLINK, "NLINK" }, \ + { NFS_ATTR_FATTR_OWNER, "OWNER" }, \ + { NFS_ATTR_FATTR_GROUP, "GROUP" }, \ + { NFS_ATTR_FATTR_RDEV, "RDEV" }, \ + { NFS_ATTR_FATTR_SIZE, "SIZE" }, \ + { NFS_ATTR_FATTR_FSID, "FSID" }, \ + { NFS_ATTR_FATTR_FILEID, "FILEID" }, \ + { NFS_ATTR_FATTR_ATIME, "ATIME" }, \ + { NFS_ATTR_FATTR_MTIME, "MTIME" }, \ + { NFS_ATTR_FATTR_CTIME, "CTIME" }, \ + { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \ + { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \ + { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }) + +DECLARE_EVENT_CLASS(nfs4_clientid_event, + TP_PROTO( + const struct nfs_client *clp, + int error + ), + + TP_ARGS(clp, error), + + TP_STRUCT__entry( + __string(dstaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)) + __field(int, error) + ), + + TP_fast_assign( + __entry->error = error; + __assign_str(dstaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); + ), + + TP_printk( + "error=%d (%s) dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + __get_str(dstaddr) + ) +); +#define DEFINE_NFS4_CLIENTID_EVENT(name) \ + DEFINE_EVENT(nfs4_clientid_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + int error \ + ), \ + TP_ARGS(clp, error)) +DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_setclientid_confirm); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_renew_async); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_CLIENTID_EVENT(nfs4_exchange_id); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_create_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_destroy_clientid); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence); +DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete); + +TRACE_EVENT(nfs4_setup_sequence, + TP_PROTO( + const struct nfs4_session *session, + 
const struct nfs4_sequence_args *args + ), + TP_ARGS(session, args), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_used_slotid) + ), + + TP_fast_assign( + const struct nfs4_slot *sa_slot = args->sa_slot; + __entry->session = nfs_session_id_hash(&session->sess_id); + __entry->slot_nr = sa_slot->slot_nr; + __entry->seq_nr = sa_slot->seq_nr; + __entry->highest_used_slotid = + sa_slot->table->highest_used_slotid; + ), + TP_printk( + "session=0x%08x slot_nr=%u seq_nr=%u " + "highest_used_slotid=%u", + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_used_slotid + ) +); + +#define show_nfs4_sequence_status_flags(status) \ + __print_flags((unsigned long)status, "|", \ + { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ + { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \ + "CB_GSS_CONTEXTS_EXPIRING" }, \ + { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \ + "CB_GSS_CONTEXTS_EXPIRED" }, \ + { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \ + "EXPIRED_ALL_STATE_REVOKED" }, \ + { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \ + "EXPIRED_SOME_STATE_REVOKED" }, \ + { SEQ4_STATUS_ADMIN_STATE_REVOKED, \ + "ADMIN_STATE_REVOKED" }, \ + { SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \ + "RECALLABLE_STATE_REVOKED" }, \ + { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \ + { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \ + "RESTART_RECLAIM_NEEDED" }, \ + { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \ + "CB_PATH_DOWN_SESSION" }, \ + { SEQ4_STATUS_BACKCHANNEL_FAULT, \ + "BACKCHANNEL_FAULT" }) + +TRACE_EVENT(nfs4_sequence_done, + TP_PROTO( + const struct nfs4_session *session, + const struct nfs4_sequence_res *res + ), + TP_ARGS(session, res), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_slotid) + __field(unsigned int, target_highest_slotid) + __field(unsigned int, status_flags) + __field(int, error) + ), + + 
TP_fast_assign( + const struct nfs4_slot *sr_slot = res->sr_slot; + __entry->session = nfs_session_id_hash(&session->sess_id); + __entry->slot_nr = sr_slot->slot_nr; + __entry->seq_nr = sr_slot->seq_nr; + __entry->highest_slotid = res->sr_highest_slotid; + __entry->target_highest_slotid = + res->sr_target_highest_slotid; + __entry->error = res->sr_status; + ), + TP_printk( + "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "highest_slotid=%u target_highest_slotid=%u " + "status_flags=%u (%s)", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_slotid, + __entry->target_highest_slotid, + __entry->status_flags, + show_nfs4_sequence_status_flags(__entry->status_flags) + ) +); + +struct cb_sequenceargs; +struct cb_sequenceres; + +TRACE_EVENT(nfs4_cb_sequence, + TP_PROTO( + const struct cb_sequenceargs *args, + const struct cb_sequenceres *res, + __be32 status + ), + TP_ARGS(args, res, status), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_slotid) + __field(unsigned int, cachethis) + __field(int, error) + ), + + TP_fast_assign( + __entry->session = nfs_session_id_hash(&args->csa_sessionid); + __entry->slot_nr = args->csa_slotid; + __entry->seq_nr = args->csa_sequenceid; + __entry->highest_slotid = args->csa_highestslotid; + __entry->cachethis = args->csa_cachethis; + __entry->error = -be32_to_cpu(status); + ), + + TP_printk( + "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "highest_slotid=%u", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_slotid + ) +); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_open_event, + TP_PROTO( + const struct nfs_open_context *ctx, + int flags, + int error + ), + + TP_ARGS(ctx, flags, error), + + TP_STRUCT__entry( + __field(int, error) + 
__field(unsigned int, flags) + __field(unsigned int, fmode) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + const struct nfs4_state *state = ctx->state; + const struct inode *inode = NULL; + + __entry->error = error; + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __entry->dev = ctx->dentry->d_sb->s_dev; + if (!IS_ERR(state)) + inode = state->inode; + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + } else { + __entry->fileid = 0; + __entry->fhandle = 0; + } + __entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode); + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + "error=%d (%s) flags=%d (%s) fmode=%s " + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "name=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->flags, + show_open_flags(__entry->flags), + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS4_OPEN_EVENT(name) \ + DEFINE_EVENT(nfs4_open_event, name, \ + TP_PROTO( \ + const struct nfs_open_context *ctx, \ + int flags, \ + int error \ + ), \ + TP_ARGS(ctx, flags, error)) +DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim); +DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired); +DEFINE_NFS4_OPEN_EVENT(nfs4_open_file); + +TRACE_EVENT(nfs4_close, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs_closeargs *args, + const struct nfs_closeres *res, + int error + ), + + TP_ARGS(state, args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = 
state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)state->state; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " + "fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + __entry->fmode ? show_fmode_flags(__entry->fmode) : + "closed", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define show_lock_cmd(type) \ + __print_symbolic((int)type, \ + { F_GETLK, "GETLK" }, \ + { F_SETLK, "SETLK" }, \ + { F_SETLKW, "SETLKW" }) +#define show_lock_type(type) \ + __print_symbolic((int)type, \ + { F_RDLCK, "RDLCK" }, \ + { F_WRLCK, "WRLCK" }, \ + { F_UNLCK, "UNLCK" }) + +DECLARE_EVENT_CLASS(nfs4_lock_event, + TP_PROTO( + const struct file_lock *request, + const struct nfs4_state *state, + int cmd, + int error + ), + + TP_ARGS(request, state, cmd, error), + + TP_STRUCT__entry( + __field(int, error) + __field(int, cmd) + __field(char, type) + __field(loff_t, start) + __field(loff_t, end) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->cmd = cmd; + __entry->type = request->fl_type; + __entry->start = request->fl_start; + __entry->end = request->fl_end; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + ), + + TP_printk( + "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + show_lock_cmd(__entry->cmd), + show_lock_type(__entry->type), + (long long)__entry->start, + (long long)__entry->end, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define 
DEFINE_NFS4_LOCK_EVENT(name) \ + DEFINE_EVENT(nfs4_lock_event, name, \ + TP_PROTO( \ + const struct file_lock *request, \ + const struct nfs4_state *state, \ + int cmd, \ + int error \ + ), \ + TP_ARGS(request, state, cmd, error)) +DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock); +DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock); +DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim); +DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired); +DEFINE_NFS4_LOCK_EVENT(nfs4_unlock); + +DECLARE_EVENT_CLASS(nfs4_set_delegation_event, + TP_PROTO( + const struct inode *inode, + fmode_t fmode + ), + + TP_ARGS(inode, fmode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)fmode; + ), + + TP_printk( + "fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x", + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); +#define DEFINE_NFS4_SET_DELEGATION_EVENT(name) \ + DEFINE_EVENT(nfs4_set_delegation_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + fmode_t fmode \ + ), \ + TP_ARGS(inode, fmode)) +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation); +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation); + +TRACE_EVENT(nfs4_delegreturn_exit, + TP_PROTO( + const struct nfs4_delegreturnargs *args, + const struct nfs4_delegreturnres *res, + int error + ), + + TP_ARGS(args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = res->server->s_dev; + __entry->fhandle = nfs_fhandle_hash(args->fhandle); + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) dev=%02x:%02x fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), 
MINOR(__entry->dev), + __entry->fhandle + ) +); + +#ifdef CONFIG_NFS_V4_1 +DECLARE_EVENT_CLASS(nfs4_test_stateid_event, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs4_lock_state *lsp, + int error + ), + + TP_ARGS(state, lsp, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define DEFINE_NFS4_TEST_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_test_stateid_event, name, \ + TP_PROTO( \ + const struct nfs4_state *state, \ + const struct nfs4_lock_state *lsp, \ + int error \ + ), \ + TP_ARGS(state, lsp, error)) +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_delegation_stateid); +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_open_stateid); +DEFINE_NFS4_TEST_STATEID_EVENT(nfs4_test_lock_stateid); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_lookup_event, + TP_PROTO( + const struct inode *dir, + const struct qstr *name, + int error + ), + + TP_ARGS(dir, name, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, dir) + __string(name, name->name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, name->name); + ), + + TP_printk( + "error=%d (%s) name=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS4_LOOKUP_EVENT(name) \ + 
DEFINE_EVENT(nfs4_lookup_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct qstr *name, \ + int error \ + ), \ + TP_ARGS(dir, name, error)) + +DEFINE_NFS4_LOOKUP_EVENT(nfs4_lookup); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_symlink); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_mkdir); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_mknod); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_remove); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_get_fs_locations); +DEFINE_NFS4_LOOKUP_EVENT(nfs4_secinfo); + +TRACE_EVENT(nfs4_rename, + TP_PROTO( + const struct inode *olddir, + const struct qstr *oldname, + const struct inode *newdir, + const struct qstr *newname, + int error + ), + + TP_ARGS(olddir, oldname, newdir, newname, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, olddir) + __string(oldname, oldname->name) + __field(u64, newdir) + __string(newname, newname->name) + ), + + TP_fast_assign( + __entry->dev = olddir->i_sb->s_dev; + __entry->olddir = NFS_FILEID(olddir); + __entry->newdir = NFS_FILEID(newdir); + __entry->error = error; + __assign_str(oldname, oldname->name); + __assign_str(newname, newname->name); + ), + + TP_printk( + "error=%d (%s) oldname=%02x:%02x:%llu/%s " + "newname=%02x:%02x:%llu/%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->olddir, + __get_str(oldname), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->newdir, + __get_str(newname) + ) +); + +DECLARE_EVENT_CLASS(nfs4_inode_event, + TP_PROTO( + const struct inode *inode, + int error + ), + + TP_ARGS(inode, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + __entry->error, + 
show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +#define DEFINE_NFS4_INODE_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(inode, error)) + +DEFINE_NFS4_INODE_EVENT(nfs4_setattr); +DEFINE_NFS4_INODE_EVENT(nfs4_access); +DEFINE_NFS4_INODE_EVENT(nfs4_readlink); +DEFINE_NFS4_INODE_EVENT(nfs4_readdir); +DEFINE_NFS4_INODE_EVENT(nfs4_get_acl); +DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); +DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ +DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); +DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn); + +DECLARE_EVENT_CLASS(nfs4_getattr_event, + TP_PROTO( + const struct nfs_server *server, + const struct nfs_fh *fhandle, + const struct nfs_fattr *fattr, + int error + ), + + TP_ARGS(server, fhandle, fattr, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, valid) + __field(int, error) + ), + + TP_fast_assign( + __entry->dev = server->s_dev; + __entry->valid = fattr->valid; + __entry->fhandle = nfs_fhandle_hash(fhandle); + __entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? 
fattr->fileid : 0; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "valid=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_nfs_fattr_flags(__entry->valid) + ) +); + +#define DEFINE_NFS4_GETATTR_EVENT(name) \ + DEFINE_EVENT(nfs4_getattr_event, name, \ + TP_PROTO( \ + const struct nfs_server *server, \ + const struct nfs_fh *fhandle, \ + const struct nfs_fattr *fattr, \ + int error \ + ), \ + TP_ARGS(server, fhandle, fattr, error)) +DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr); +DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root); +DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo); + +DECLARE_EVENT_CLASS(nfs4_idmap_event, + TP_PROTO( + const char *name, + int len, + u32 id, + int error + ), + + TP_ARGS(name, len, id, error), + + TP_STRUCT__entry( + __field(int, error) + __field(u32, id) + __dynamic_array(char, name, len > 0 ? len + 1 : 1) + ), + + TP_fast_assign( + if (len < 0) + len = 0; + __entry->error = error < 0 ? 
error : 0; + __entry->id = id; + memcpy(__get_dynamic_array(name), name, len); + ((char *)__get_dynamic_array(name))[len] = 0; + ), + + TP_printk( + "error=%d id=%u name=%s", + __entry->error, + __entry->id, + __get_str(name) + ) +); +#define DEFINE_NFS4_IDMAP_EVENT(name) \ + DEFINE_EVENT(nfs4_idmap_event, name, \ + TP_PROTO( \ + const char *name, \ + int len, \ + u32 id, \ + int error \ + ), \ + TP_ARGS(name, len, id, error)) +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_name_to_uid); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name); +DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); + +DECLARE_EVENT_CLASS(nfs4_read_event, + TP_PROTO( + const struct nfs_pgio_data *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = data->header->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + __entry->count + ) +); +#define DEFINE_NFS4_READ_EVENT(name) \ + DEFINE_EVENT(nfs4_read_event, name, \ + TP_PROTO( \ + const struct nfs_pgio_data *data, \ + int error \ + ), \ + TP_ARGS(data, error)) +DEFINE_NFS4_READ_EVENT(nfs4_read); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_write_event, + TP_PROTO( + const struct nfs_pgio_data *data, + int error + ), + + TP_ARGS(data, error), + + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = data->header->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + __entry->count + ) +); + +#define DEFINE_NFS4_WRITE_EVENT(name) \ + DEFINE_EVENT(nfs4_write_event, name, \ + TP_PROTO( \ + const struct nfs_pgio_data *data, \ + int error \ + ), \ + TP_ARGS(data, error)) +DEFINE_NFS4_WRITE_EVENT(nfs4_write); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); +#endif /* CONFIG_NFS_V4_1 */ + +DECLARE_EVENT_CLASS(nfs4_commit_event, + TP_PROTO( + const struct nfs_commit_data *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(size_t, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + 
__entry->count + ) +); +#define DEFINE_NFS4_COMMIT_EVENT(name) \ + DEFINE_EVENT(nfs4_commit_event, name, \ + TP_PROTO( \ + const struct nfs_commit_data *data, \ + int error \ + ), \ + TP_ARGS(data, error)) +DEFINE_NFS4_COMMIT_EVENT(nfs4_commit); +#ifdef CONFIG_NFS_V4_1 +DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds); + +#define show_pnfs_iomode(iomode) \ + __print_symbolic(iomode, \ + { IOMODE_READ, "READ" }, \ + { IOMODE_RW, "RW" }, \ + { IOMODE_ANY, "ANY" }) + +TRACE_EVENT(nfs4_layoutget, + TP_PROTO( + const struct nfs_open_context *ctx, + const struct pnfs_layout_range *args, + const struct pnfs_layout_range *res, + int error + ), + + TP_ARGS(ctx, args, res, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u32, iomode) + __field(u64, offset) + __field(u64, count) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = ctx->dentry->d_inode; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->iomode = args->iomode; + __entry->offset = args->offset; + __entry->count = args->length; + __entry->error = error; + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s offset=%llu count=%llu", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->offset, + (unsigned long long)__entry->count + ) +); + +DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); +DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* _TRACE_NFS4_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE nfs4trace +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index db1ed9c46ed..939ae606cfa 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -8,7 +8,7 @@ * * Kendrick Smith <kmsmith@umich.edu> * Andy Adamson <andros@umich.edu> - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -38,20 +38,27 @@ #include <linux/param.h> #include <linux/time.h> #include <linux/mm.h> -#include <linux/slab.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/in.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/kdev_t.h> +#include <linux/module.h> +#include <linux/utsname.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/msg_prot.h> +#include <linux/sunrpc/gss_api.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> + #include "nfs4_fs.h" +#include "internal.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -67,11 +74,11 @@ static int nfs4_stat_to_errno(int); #define NFS4_MAXTAGLEN 0 #endif -/* lock,open owner id: +/* lock,open owner id: * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define open_owner_id_maxsz (1 + 4) -#define lock_owner_id_maxsz (1 + 4) +#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2) +#define lock_owner_id_maxsz (1 + 1 + 4) #define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) @@ -89,15 +96,25 @@ static int nfs4_stat_to_errno(int); #define encode_getfh_maxsz (op_encode_hdr_maxsz) #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ ((3+NFS4_FHSIZE) >> 2)) -#define nfs4_fattr_bitmap_maxsz 3 +#define 
nfs4_fattr_bitmap_maxsz 4 #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */ +#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) +#else +#define nfs4_label_maxsz 0 +#endif +/* We support only one layout type per file system */ +#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ - 3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz)) + 3 + 3 + 3 + nfs4_owner_maxsz + \ + nfs4_group_maxsz + nfs4_label_maxsz + \ + decode_mdsthreshold_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ nfs4_fattr_value_maxsz) #define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) @@ -105,13 +122,18 @@ static int nfs4_stat_to_errno(int); 1 + 2 + 1 + \ nfs4_owner_maxsz + \ nfs4_group_maxsz + \ + nfs4_label_maxsz + \ 4 + 4) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) -#define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) -#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) +#define encode_fsinfo_maxsz (encode_getattr_maxsz) +/* The 5 accounts for the PNFS attributes, and assumes that at most three + * layout types will be returned. 
+ */ +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 4 + 8 + 5) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ @@ -135,7 +157,7 @@ static int nfs4_stat_to_errno(int); #define decode_lookup_maxsz (op_decode_hdr_maxsz) #define encode_share_access_maxsz \ (2) -#define encode_createmode_maxsz (1 + encode_attrs_maxsz) +#define encode_createmode_maxsz (1 + encode_attrs_maxsz + encode_verifier_maxsz) #define encode_opentype_maxsz (1 + encode_createmode_maxsz) #define encode_claim_null_maxsz (1 + nfs4_name_maxsz) #define encode_open_maxsz (op_encode_hdr_maxsz + \ @@ -178,7 +200,8 @@ static int nfs4_stat_to_errno(int); encode_stateid_maxsz + 3) #define decode_read_maxsz (op_decode_hdr_maxsz + 2) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ - 2 + encode_verifier_maxsz + 5) + 2 + encode_verifier_maxsz + 5 + \ + nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ decode_verifier_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) @@ -192,20 +215,27 @@ static int nfs4_stat_to_errno(int); decode_verifier_maxsz) #define encode_remove_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) +#define decode_remove_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz) #define encode_rename_maxsz (op_encode_hdr_maxsz + \ 2 * nfs4_name_maxsz) -#define decode_rename_maxsz (op_decode_hdr_maxsz + 5 + 5) +#define decode_rename_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz + \ + decode_change_info_maxsz) #define encode_link_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) -#define decode_link_maxsz (op_decode_hdr_maxsz + 5) +#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_lockowner_maxsz (7) #define encode_lock_maxsz (op_encode_hdr_maxsz + \ 7 + \ - 1 + encode_stateid_maxsz + 8) + 1 + encode_stateid_maxsz + 1 + \ + encode_lockowner_maxsz) #define decode_lock_denied_maxsz \ (8 + 
decode_lockowner_maxsz) #define decode_lock_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) -#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) +#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ + encode_lockowner_maxsz) #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ @@ -213,6 +243,11 @@ static int nfs4_stat_to_errno(int); 4) #define decode_locku_maxsz (op_decode_hdr_maxsz + \ decode_stateid_maxsz) +#define encode_release_lockowner_maxsz \ + (op_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define decode_release_lockowner_maxsz \ + (op_decode_hdr_maxsz) #define encode_access_maxsz (op_encode_hdr_maxsz + 1) #define decode_access_maxsz (op_decode_hdr_maxsz + 2) #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ @@ -240,57 +275,203 @@ static int nfs4_stat_to_errno(int); (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ (0) +#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) + +#if defined(CONFIG_NFS_V4_1) +#define NFS4_MAX_MACHINE_NAME_LEN (64) +#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \ + sizeof(utsname()->version) + sizeof(utsname()->machine) + 8) + +#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ + encode_verifier_maxsz + \ + 1 /* co_ownerid.len */ + \ + XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + 1 /* flags */ + \ + 1 /* spa_how */ + \ + /* max is SP4_MACH_CRED (for now) */ + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 1 /* implementation id array of size 1 */ + \ + 1 /* nii_domain */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 1 /* nii_name */ + \ + XDR_QUADLEN(IMPL_NAME_LIMIT) + \ + 3 /* nii_date */) +#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ + 2 /* eir_clientid */ + \ + 1 /* eir_sequenceid */ + \ + 1 /* eir_flags */ + \ + 1 /* 
spr_how */ + \ + /* max is SP4_MACH_CRED (for now) */ + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 1 + NFS4_OP_MAP_NUM_WORDS + \ + 2 /* eir_server_owner.so_minor_id */ + \ + /* eir_server_owner.so_major_id<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + /* eir_server_scope<> */ \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \ + 1 /* eir_server_impl_id array length */ + \ + 1 /* nii_domain */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 1 /* nii_name */ + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + 3 /* nii_date */) +#define encode_channel_attrs_maxsz (6 + 1 /* ca_rdma_ird.len (0) */) +#define decode_channel_attrs_maxsz (6 + \ + 1 /* ca_rdma_ird.len */ + \ + 1 /* ca_rdma_ird */) +#define encode_create_session_maxsz (op_encode_hdr_maxsz + \ + 2 /* csa_clientid */ + \ + 1 /* csa_sequence */ + \ + 1 /* csa_flags */ + \ + encode_channel_attrs_maxsz + \ + encode_channel_attrs_maxsz + \ + 1 /* csa_cb_program */ + \ + 1 /* csa_sec_parms.len (1) */ + \ + 1 /* cb_secflavor (AUTH_SYS) */ + \ + 1 /* stamp */ + \ + 1 /* machinename.len */ + \ + XDR_QUADLEN(NFS4_MAX_MACHINE_NAME_LEN) + \ + 1 /* uid */ + \ + 1 /* gid */ + \ + 1 /* gids.len (0) */) +#define decode_create_session_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* csr_sequence */ + \ + 1 /* csr_flags */ + \ + decode_channel_attrs_maxsz + \ + decode_channel_attrs_maxsz) +#define encode_bind_conn_to_session_maxsz (op_encode_hdr_maxsz + \ + /* bctsa_sessid */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* bctsa_dir */ + \ + 1 /* bctsa_use_conn_in_rdma_mode */) +#define decode_bind_conn_to_session_maxsz (op_decode_hdr_maxsz + \ + /* bctsr_sessid */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* bctsr_dir */ + \ + 1 /* bctsr_use_conn_in_rdma_mode */) +#define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4) +#define decode_destroy_session_maxsz (op_decode_hdr_maxsz) +#define encode_destroy_clientid_maxsz (op_encode_hdr_maxsz + 2) +#define decode_destroy_clientid_maxsz (op_decode_hdr_maxsz) 
+#define encode_sequence_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) +#define decode_sequence_maxsz (op_decode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) +#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) +#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ + encode_verifier_maxsz) +#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ + 2 /* nfs_cookie4 gdlr_cookie */ + \ + decode_verifier_maxsz \ + /* verifier4 gdlr_verifier */ + \ + 1 /* gdlr_deviceid_list count */ + \ + XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ + NFS4_DEVICEID4_SIZE) \ + /* gdlr_deviceid_list */ + \ + 1 /* bool gdlr_eof */) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ + 1 /* layout type */ + \ + 1 /* opaque devaddr4 length */ + \ + /* devaddr4 payload is read into page */ \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap */) +#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ + encode_stateid_maxsz) +#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + 1 /* reclaim */ + \ + encode_stateid_maxsz + \ + 1 /* new offset (true) */ + \ + 2 /* last byte written */ + \ + 1 /* nt_timechanged (false) */ + \ + 1 /* layoutupdate4 layout type */ + \ + 1 /* NULL filelayout layoutupdate4 payload */) +#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + 1 /* FIXME: opaque lrf_body always empty at the moment */) +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ + 1 + decode_stateid_maxsz) +#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 
1) +#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz +#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1) +#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_free_stateid_maxsz (op_decode_hdr_maxsz) +#else /* CONFIG_NFS_V4_1 */ +#define encode_sequence_maxsz 0 +#define decode_sequence_maxsz 0 +#endif /* CONFIG_NFS_V4_1 */ + #define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ #define NFS4_dec_compound_sz (1024) /* XXX: large enough? */ #define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_read_maxsz) #define NFS4_dec_read_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_read_maxsz) #define NFS4_enc_readlink_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_readlink_maxsz) #define NFS4_dec_readlink_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_readlink_maxsz) #define NFS4_enc_readdir_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_readdir_maxsz) #define NFS4_dec_readdir_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_readdir_maxsz) #define NFS4_enc_write_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_write_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_write_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_write_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_commit_maxsz + \ - encode_getattr_maxsz) + encode_commit_maxsz) #define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ 
decode_putfh_maxsz + \ - decode_commit_maxsz + \ - decode_getattr_maxsz) + decode_commit_maxsz) #define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_savefh_maxsz + \ encode_open_maxsz + \ + encode_access_maxsz + \ encode_getfh_maxsz + \ - encode_getattr_maxsz + \ - encode_restorefh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_savefh_maxsz + \ decode_open_maxsz + \ + decode_access_maxsz + \ decode_getfh_maxsz + \ - decode_getattr_maxsz + \ - decode_restorefh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_open_confirm_sz \ (compound_encode_hdr_maxsz + \ @@ -301,43 +482,55 @@ static int nfs4_stat_to_errno(int); decode_putfh_maxsz + \ decode_open_confirm_maxsz) #define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_open_maxsz + \ + encode_access_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ + decode_access_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_open_downgrade_sz \ (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_open_downgrade_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_open_downgrade_sz \ (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_downgrade_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_close_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_close_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_setattr_maxsz + \ 
encode_getattr_maxsz) #define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_setattr_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_fsinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_fsinfo_maxsz) #define NFS4_dec_fsinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_fsinfo_maxsz) #define NFS4_enc_renew_sz (compound_encode_hdr_maxsz + \ @@ -350,519 +543,733 @@ static int nfs4_stat_to_errno(int); decode_setclientid_maxsz) #define NFS4_enc_setclientid_confirm_sz \ (compound_encode_hdr_maxsz + \ - encode_setclientid_confirm_maxsz + \ - encode_putrootfh_maxsz + \ - encode_fsinfo_maxsz) + encode_setclientid_confirm_maxsz) #define NFS4_dec_setclientid_confirm_sz \ (compound_decode_hdr_maxsz + \ - decode_setclientid_confirm_maxsz + \ - decode_putrootfh_maxsz + \ - decode_fsinfo_maxsz) + decode_setclientid_confirm_maxsz) #define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_lock_maxsz) #define NFS4_dec_lock_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_lock_maxsz) #define NFS4_enc_lockt_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_lockt_maxsz) #define NFS4_dec_lockt_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_lockt_maxsz) #define NFS4_enc_locku_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_locku_maxsz) #define NFS4_dec_locku_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_locku_maxsz) +#define NFS4_enc_release_lockowner_sz \ + (compound_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define NFS4_dec_release_lockowner_sz \ + (compound_decode_hdr_maxsz + \ + decode_lockowner_maxsz) #define NFS4_enc_access_sz (compound_encode_hdr_maxsz 
+ \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_access_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_access_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_getattr_maxsz) + encode_getattr_maxsz + \ + encode_renew_maxsz) #define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_getattr_maxsz) + decode_getattr_maxsz + \ + decode_renew_maxsz) #define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_lookup_maxsz + \ encode_getattr_maxsz + \ encode_getfh_maxsz) #define NFS4_dec_lookup_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_lookup_maxsz + \ decode_getattr_maxsz + \ decode_getfh_maxsz) #define NFS4_enc_lookup_root_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putrootfh_maxsz + \ encode_getattr_maxsz + \ encode_getfh_maxsz) #define NFS4_dec_lookup_root_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putrootfh_maxsz + \ decode_getattr_maxsz + \ decode_getfh_maxsz) #define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_remove_maxsz + \ - encode_getattr_maxsz) + encode_remove_maxsz) #define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ - op_decode_hdr_maxsz + 5 + \ - decode_getattr_maxsz) + decode_remove_maxsz) #define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_rename_maxsz + \ - encode_getattr_maxsz + \ - encode_restorefh_maxsz + \ - encode_getattr_maxsz) + encode_rename_maxsz) #define NFS4_dec_rename_sz 
(compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_rename_maxsz + \ - decode_getattr_maxsz + \ - decode_restorefh_maxsz + \ - decode_getattr_maxsz) + decode_rename_maxsz) #define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ encode_link_maxsz + \ - decode_getattr_maxsz + \ encode_restorefh_maxsz + \ - decode_getattr_maxsz) + encode_getattr_maxsz) #define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ decode_link_maxsz + \ - decode_getattr_maxsz + \ decode_restorefh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_symlink_maxsz + \ encode_getattr_maxsz + \ encode_getfh_maxsz) #define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_symlink_maxsz + \ decode_getattr_maxsz + \ decode_getfh_maxsz) #define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_savefh_maxsz + \ encode_create_maxsz + \ encode_getfh_maxsz + \ - encode_getattr_maxsz + \ - encode_restorefh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_savefh_maxsz + \ decode_create_maxsz + \ decode_getfh_maxsz + \ - decode_getattr_maxsz + \ - decode_restorefh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_pathconf_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_statfs_sz (compound_encode_hdr_maxsz + \ + 
encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_statfs_maxsz) #define NFS4_dec_statfs_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_statfs_maxsz) #define NFS4_enc_server_caps_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_delegreturn_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_delegreturn_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_getacl_maxsz) #define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_getacl_maxsz) #define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_setacl_maxsz) #define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_setacl_maxsz) #define NFS4_enc_fs_locations_sz \ (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_lookup_maxsz + \ - encode_fs_locations_maxsz) + encode_fs_locations_maxsz + \ + encode_renew_maxsz) #define NFS4_dec_fs_locations_sz \ (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_lookup_maxsz + \ - decode_fs_locations_maxsz) - -static struct { - unsigned int mode; - unsigned int nfs2type; -} nfs_type2fmt[] = { - { 0, NFNON }, - { S_IFREG, NFREG }, - { S_IFDIR, NFDIR }, - { S_IFBLK, NFBLK }, - { S_IFCHR, NFCHR }, - { S_IFLNK, NFLNK }, - { S_IFSOCK, NFSOCK }, - { S_IFIFO, NFFIFO }, - { 0, NFNON }, - { 0, NFNON }, + 
decode_fs_locations_maxsz + \ + decode_renew_maxsz) +#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_secinfo_maxsz) +#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_secinfo_maxsz) +#define NFS4_enc_fsid_present_sz \ + (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getfh_maxsz + \ + encode_renew_maxsz) +#define NFS4_dec_fsid_present_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getfh_maxsz + \ + decode_renew_maxsz) +#if defined(CONFIG_NFS_V4_1) +#define NFS4_enc_bind_conn_to_session_sz \ + (compound_encode_hdr_maxsz + \ + encode_bind_conn_to_session_maxsz) +#define NFS4_dec_bind_conn_to_session_sz \ + (compound_decode_hdr_maxsz + \ + decode_bind_conn_to_session_maxsz) +#define NFS4_enc_exchange_id_sz \ + (compound_encode_hdr_maxsz + \ + encode_exchange_id_maxsz) +#define NFS4_dec_exchange_id_sz \ + (compound_decode_hdr_maxsz + \ + decode_exchange_id_maxsz) +#define NFS4_enc_create_session_sz \ + (compound_encode_hdr_maxsz + \ + encode_create_session_maxsz) +#define NFS4_dec_create_session_sz \ + (compound_decode_hdr_maxsz + \ + decode_create_session_maxsz) +#define NFS4_enc_destroy_session_sz (compound_encode_hdr_maxsz + \ + encode_destroy_session_maxsz) +#define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \ + decode_destroy_session_maxsz) +#define NFS4_enc_destroy_clientid_sz (compound_encode_hdr_maxsz + \ + encode_destroy_clientid_maxsz) +#define NFS4_dec_destroy_clientid_sz (compound_decode_hdr_maxsz + \ + decode_destroy_clientid_maxsz) +#define NFS4_enc_sequence_sz \ + (compound_decode_hdr_maxsz + \ + encode_sequence_maxsz) +#define NFS4_dec_sequence_sz \ + (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz) +#define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + 
encode_putrootfh_maxsz + \ + encode_fsinfo_maxsz) +#define NFS4_dec_get_lease_time_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_fsinfo_maxsz) +#define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_reclaim_complete_maxsz) +#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_reclaim_complete_maxsz) +#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getdevicelist_maxsz) +#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getdevicelist_maxsz) +#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_getdeviceinfo_maxsz) +#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutget_maxsz) +#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_layoutcommit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutcommit_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutreturn_maxsz) +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz) +#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \ + 
encode_sequence_maxsz + \ + encode_putrootfh_maxsz +\ + encode_secinfo_no_name_maxsz) +#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_secinfo_no_name_maxsz) +#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_test_stateid_maxsz) +#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_test_stateid_maxsz) +#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_free_stateid_maxsz) +#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_free_stateid_maxsz) + +const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + + encode_sequence_maxsz + + encode_putfh_maxsz + + encode_getattr_maxsz) * + XDR_UNIT); + +const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz) * + XDR_UNIT); + +const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz) * + XDR_UNIT); +EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead); +#endif /* CONFIG_NFS_V4_1 */ + +static const umode_t nfs_type2fmt[] = { + [NF4BAD] = 0, + [NF4REG] = S_IFREG, + [NF4DIR] = S_IFDIR, + [NF4BLK] = S_IFBLK, + [NF4CHR] = S_IFCHR, + [NF4LNK] = S_IFLNK, + [NF4SOCK] = S_IFSOCK, + [NF4FIFO] = S_IFIFO, + [NF4ATTRDIR] = 0, + [NF4NAMEDATTR] = 0, }; struct compound_hdr { int32_t status; uint32_t nops; + __be32 * nops_p; uint32_t taglen; char * tag; + uint32_t replen; /* expected reply words */ + u32 minorversion; }; -/* - * START OF "GENERIC" ENCODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). 
- * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define WRITE32(n) *p++ = htonl(n) -#define WRITE64(n) do { \ - *p++ = htonl((uint32_t)((n) >> 32)); \ - *p++ = htonl((uint32_t)(n)); \ -} while (0) -#define WRITEMEM(ptr,nbytes) do { \ - p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ -} while (0) - -#define RESERVE_SPACE(nbytes) do { \ - p = xdr_reserve_space(xdr, nbytes); \ - BUG_ON(!p); \ -} while (0) +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} + +static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, len); + xdr_encode_opaque_fixed(p, buf, len); +} static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { __be32 *p; - p = xdr_reserve_space(xdr, 4 + len); - BUG_ON(p == NULL); + p = reserve_space(xdr, 4 + len); xdr_encode_opaque(p, str, len); } -static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) +static void encode_uint32(struct xdr_stream *xdr, u32 n) { __be32 *p; - dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); - BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); - WRITE32(hdr->taglen); - WRITEMEM(hdr->tag, hdr->taglen); - WRITE32(NFS4_MINOR_VERSION); - WRITE32(hdr->nops); - return 0; + p = reserve_space(xdr, 4); + *p = cpu_to_be32(n); } -static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +static void encode_uint64(struct xdr_stream *xdr, u64 n) { __be32 *p; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - BUG_ON(p == NULL); - xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 8); + xdr_encode_hyper(p, n); } -static int encode_attrs(struct 
xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) +static void encode_nfs4_seqid(struct xdr_stream *xdr, + const struct nfs_seqid *seqid) +{ + encode_uint32(xdr, seqid->sequence->counter); +} + +static void encode_compound_hdr(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct compound_hdr *hdr) +{ + __be32 *p; + struct rpc_auth *auth = req->rq_cred->cr_auth; + + /* initialize running count of expected bytes in reply. + * NOTE: the replied tag SHOULD be the same is the one sent, + * but this is not required as a MUST for the server to do so. */ + hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; + + WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); + encode_string(xdr, hdr->taglen, hdr->tag); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(hdr->minorversion); + hdr->nops_p = p; + *p = cpu_to_be32(hdr->nops); +} + +static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op, + uint32_t replen, + struct compound_hdr *hdr) +{ + encode_uint32(xdr, op); + hdr->nops++; + hdr->replen += replen; +} + +static void encode_nops(struct compound_hdr *hdr) +{ + WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS); + *hdr->nops_p = htonl(hdr->nops); +} + +static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid) +{ + encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); +} + +static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ + encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); +} + +static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, + const struct nfs4_label *label, + const struct nfs_server *server) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; int owner_namelen = 0; int owner_grouplen = 0; __be32 *p; - __be32 *q; - int len; - uint32_t bmval0 = 0; - uint32_t bmval1 = 0; - int status; + unsigned i; + uint32_t len = 0; + uint32_t bmval_len; + uint32_t bmval[3] = { 0 }; /* * We reserve enough space to write the entire attribute 
buffer at once. * In the worst-case, this would be - * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) - * = 36 bytes, plus any contribution from variable-length fields + * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 40 bytes, plus any contribution from variable-length fields * such as owner/group. */ - len = 16; - - /* Sigh */ - if (iap->ia_valid & ATTR_SIZE) + if (iap->ia_valid & ATTR_SIZE) { + bmval[0] |= FATTR4_WORD0_SIZE; len += 8; - if (iap->ia_valid & ATTR_MODE) + } + if (iap->ia_valid & ATTR_MODE) { + bmval[1] |= FATTR4_WORD1_MODE; len += 4; + } if (iap->ia_valid & ATTR_UID) { - owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); + owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", - iap->ia_uid); + from_kuid(&init_user_ns, iap->ia_uid)); /* XXX */ strcpy(owner_name, "nobody"); owner_namelen = sizeof("nobody") - 1; /* goto out; */ } + bmval[1] |= FATTR4_WORD1_OWNER; len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { - owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); + owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", - iap->ia_gid); + from_kgid(&init_user_ns, iap->ia_gid)); strcpy(owner_group, "nobody"); owner_grouplen = sizeof("nobody") - 1; /* goto out; */ } + bmval[1] |= FATTR4_WORD1_OWNER_GROUP; len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } - if (iap->ia_valid & ATTR_ATIME_SET) + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; len += 16; - else if (iap->ia_valid & ATTR_ATIME) + } else if (iap->ia_valid & ATTR_ATIME) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; len += 4; - if (iap->ia_valid & ATTR_MTIME_SET) + } + if (iap->ia_valid & ATTR_MTIME_SET) { + 
bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 16; - else if (iap->ia_valid & ATTR_MTIME) + } else if (iap->ia_valid & ATTR_MTIME) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; - RESERVE_SPACE(len); - - /* - * We write the bitmap length now, but leave the bitmap and the attribute - * buffer length to be backfilled at the end of this routine. - */ - WRITE32(2); - q = p; - p += 3; - - if (iap->ia_valid & ATTR_SIZE) { - bmval0 |= FATTR4_WORD0_SIZE; - WRITE64(iap->ia_size); } - if (iap->ia_valid & ATTR_MODE) { - bmval1 |= FATTR4_WORD1_MODE; - WRITE32(iap->ia_mode & S_IALLUGO); + if (label) { + len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); + bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } - if (iap->ia_valid & ATTR_UID) { - bmval1 |= FATTR4_WORD1_OWNER; - WRITE32(owner_namelen); - WRITEMEM(owner_name, owner_namelen); - } - if (iap->ia_valid & ATTR_GID) { - bmval1 |= FATTR4_WORD1_OWNER_GROUP; - WRITE32(owner_grouplen); - WRITEMEM(owner_group, owner_grouplen); - } - if (iap->ia_valid & ATTR_ATIME_SET) { - bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); - } - else if (iap->ia_valid & ATTR_ATIME) { - bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); - } - if (iap->ia_valid & ATTR_MTIME_SET) { - bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + + if (bmval[2] != 0) + bmval_len = 3; + else if (bmval[1] != 0) + bmval_len = 2; + else + bmval_len = 1; + + p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len); + + *p++ = cpu_to_be32(bmval_len); + for (i = 0; i < bmval_len; i++) + *p++ = cpu_to_be32(bmval[i]); + *p++ = cpu_to_be32(len); + + if (bmval[0] & FATTR4_WORD0_SIZE) + p = xdr_encode_hyper(p, iap->ia_size); + if (bmval[1] & FATTR4_WORD1_MODE) + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); + if (bmval[1] & 
FATTR4_WORD1_OWNER) + p = xdr_encode_opaque(p, owner_name, owner_namelen); + if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) + p = xdr_encode_opaque(p, owner_group, owner_grouplen); + if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + if (iap->ia_valid & ATTR_ATIME_SET) { + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); + *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + } else + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - else if (iap->ia_valid & ATTR_MTIME) { - bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + if (iap->ia_valid & ATTR_MTIME_SET) { + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + } else + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - - /* - * Now we backfill the bitmap and the attribute buffer length. - */ - if (len != ((char *)p - (char *)q) + 4) { - printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", - len, ((char *)p - (char *)q) + 4); - BUG(); + if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + *p++ = cpu_to_be32(label->lfs); + *p++ = cpu_to_be32(label->pi); + *p++ = cpu_to_be32(label->len); + p = xdr_encode_opaque_fixed(p, label->label, label->len); } - len = (char *)p - (char *)q - 12; - *q++ = htonl(bmval0); - *q++ = htonl(bmval1); - *q++ = htonl(len); - status = 0; /* out: */ - return status; } -static int encode_access(struct xdr_stream *xdr, u32 access) +static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr) { - __be32 *p; - - RESERVE_SPACE(8); - WRITE32(OP_ACCESS); - WRITE32(access); - - return 0; + encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr); + encode_uint32(xdr, access); } -static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) +static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) { - __be32 
*p; - - RESERVE_SPACE(8+NFS4_STATEID_SIZE); - WRITE32(OP_CLOSE); - WRITE32(arg->seqid->sequence->counter); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - - return 0; + encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); + encode_nfs4_seqid(xdr, arg->seqid); + encode_nfs4_stateid(xdr, arg->stateid); } -static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args) +static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr) { __be32 *p; - - RESERVE_SPACE(16); - WRITE32(OP_COMMIT); - WRITE64(args->offset); - WRITE32(args->count); - return 0; + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); } -static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create) +static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) { __be32 *p; - - RESERVE_SPACE(8); - WRITE32(OP_CREATE); - WRITE32(create->ftype); + + encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr); + encode_uint32(xdr, create->ftype); switch (create->ftype) { case NF4LNK: - RESERVE_SPACE(4); - WRITE32(create->u.symlink.len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(create->u.symlink.len); xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: - RESERVE_SPACE(8); - WRITE32(create->u.device.specdata1); - WRITE32(create->u.device.specdata2); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p = cpu_to_be32(create->u.device.specdata2); break; default: break; } - RESERVE_SPACE(4 + create->name->len); - WRITE32(create->name->len); - WRITEMEM(create->name->name, create->name->len); - - return encode_attrs(xdr, create->attrs, create->server); + encode_string(xdr, create->name->len, create->name->name); + encode_attrs(xdr, create->attrs, create->label, 
create->server); } -static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap) +static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) { - __be32 *p; + __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_GETATTR); - WRITE32(1); - WRITE32(bitmap); - return 0; + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bitmap); } -static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1) +static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) { - __be32 *p; + __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_GETATTR); - WRITE32(2); - WRITE32(bm0); - WRITE32(bm1); - return 0; + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); } -static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask) +static void +encode_getattr_three(struct xdr_stream *xdr, + uint32_t bm0, uint32_t bm1, uint32_t bm2, + struct compound_hdr *hdr) { - return encode_getattr_two(xdr, - bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1]); + __be32 *p; + + encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); + if (bm2) { + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bm0); + *p++ = cpu_to_be32(bm1); + *p = cpu_to_be32(bm2); + } else if (bm1) { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); + } else { + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bm0); + } } -static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask) +static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1]); + 
encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], + bitmask[1] & nfs4_fattr_bitmap[1], + bitmask[2] & nfs4_fattr_bitmap[2], + hdr); } -static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask) +static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, + const u32 *open_bitmap, + struct compound_hdr *hdr) { - return encode_getattr_two(xdr, - bitmask[0] & nfs4_fs_locations_bitmap[0], - bitmask[1] & nfs4_fs_locations_bitmap[1]); + encode_getattr_three(xdr, + bitmask[0] & open_bitmap[0], + bitmask[1] & open_bitmap[1], + bitmask[2] & open_bitmap[2], + hdr); } -static int encode_getfh(struct xdr_stream *xdr) +static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - __be32 *p; - - RESERVE_SPACE(4); - WRITE32(OP_GETFH); + encode_getattr_three(xdr, + bitmask[0] & nfs4_fsinfo_bitmap[0], + bitmask[1] & nfs4_fsinfo_bitmap[1], + bitmask[2] & nfs4_fsinfo_bitmap[2], + hdr); +} - return 0; +static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) +{ + encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], + bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); } -static int encode_link(struct xdr_stream *xdr, const struct qstr *name) +static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { - __be32 *p; + encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr); +} - RESERVE_SPACE(8 + name->len); - WRITE32(OP_LINK); - WRITE32(name->len); - WRITEMEM(name->name, name->len); - - return 0; +static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr); + encode_string(xdr, name->len, name->name); } static inline int nfs4_lock_type(struct file_lock *fl, int block) { - if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK) + if (fl->fl_type == F_RDLCK) return block ? NFS4_READW_LT : NFS4_READ_LT; return block ? 
NFS4_WRITEW_LT : NFS4_WRITE_LT; } @@ -874,104 +1281,101 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl) return fl->fl_end - fl->fl_start + 1; } +static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) +{ + __be32 *p; + + p = reserve_space(xdr, 32); + p = xdr_encode_hyper(p, lowner->clientid); + *p++ = cpu_to_be32(20); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + *p++ = cpu_to_be32(lowner->s_dev); + xdr_encode_hyper(p, lowner->id); +} + /* * opcode,type,reclaim,offset,length,new_lock_owner = 32 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 */ -static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) +static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(32); - WRITE32(OP_LOCK); - WRITE32(nfs4_lock_type(args->fl, args->block)); - WRITE32(args->reclaim); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE32(args->new_lock_owner); + encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr); + p = reserve_space(xdr, 28); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); - WRITE32(args->open_seqid->sequence->counter); - WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + encode_nfs4_seqid(xdr, args->open_seqid); + encode_nfs4_stateid(xdr, args->open_stateid); + encode_nfs4_seqid(xdr, args->lock_seqid); + encode_lockowner(xdr, &args->lock_owner); } else { - RESERVE_SPACE(NFS4_STATEID_SIZE+4); - WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); 
- WRITE32(args->lock_seqid->sequence->counter); + encode_nfs4_stateid(xdr, args->lock_stateid); + encode_nfs4_seqid(xdr, args->lock_seqid); } - - return 0; } -static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) +static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(52); - WRITE32(OP_LOCKT); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); - - return 0; + encode_op_hdr(xdr, OP_LOCKT, decode_lockt_maxsz, hdr); + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + encode_lockowner(xdr, &args->lock_owner); } -static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) +static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); - WRITE32(OP_LOCKU); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE32(args->seqid->sequence->counter); - WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - - return 0; + encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); + encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); + encode_nfs4_seqid(xdr, args->seqid); + encode_nfs4_stateid(xdr, args->stateid); + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); } -static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) +static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) { - int len = name->len; - __be32 *p; - - RESERVE_SPACE(8 + 
len); - WRITE32(OP_LOOKUP); - WRITE32(len); - WRITEMEM(name->name, len); + encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr); + encode_lockowner(xdr, lowner); +} - return 0; +static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr); + encode_string(xdr, name->len, name->name); } -static void encode_share_access(struct xdr_stream *xdr, int open_flags) +static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) { __be32 *p; - RESERVE_SPACE(8); - switch (open_flags & (FMODE_READ|FMODE_WRITE)) { - case FMODE_READ: - WRITE32(NFS4_SHARE_ACCESS_READ); - break; - case FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_WRITE); - break; - case FMODE_READ|FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_BOTH); - break; - default: - BUG(); + p = reserve_space(xdr, 8); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); + break; + case FMODE_WRITE: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); + break; + case FMODE_READ|FMODE_WRITE: + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); + break; + default: + *p++ = cpu_to_be32(0); } - WRITE32(0); /* for linux, share_deny = 0 always */ + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -981,30 +1385,41 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, * owner 4 = 32 */ - RESERVE_SPACE(8); - WRITE32(OP_OPEN); - WRITE32(arg->seqid->sequence->counter); - encode_share_access(xdr, arg->open_flags); - RESERVE_SPACE(28); - WRITE64(arg->clientid); - WRITE32(16); - WRITEMEM("open id:", 8); - WRITE64(arg->id); + encode_nfs4_seqid(xdr, arg->seqid); + encode_share_access(xdr, arg->fmode); + p = reserve_space(xdr, 36); + p = xdr_encode_hyper(p, 
arg->clientid); + *p++ = cpu_to_be32(24); + p = xdr_encode_opaque_fixed(p, "open id:", 8); + *p++ = cpu_to_be32(arg->server->s_dev); + *p++ = cpu_to_be32(arg->id.uniquifier); + xdr_encode_hyper(p, arg->id.create_time); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { + struct iattr dummy; __be32 *p; - RESERVE_SPACE(4); - switch(arg->open_flags & O_EXCL) { - case 0: - WRITE32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->server); - break; - default: - WRITE32(NFS4_CREATE_EXCLUSIVE); - encode_nfs4_verifier(xdr, &arg->u.verifier); + p = reserve_space(xdr, 4); + switch(arg->createmode) { + case NFS4_CREATE_UNCHECKED: + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + break; + case NFS4_CREATE_GUARDED: + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + break; + case NFS4_CREATE_EXCLUSIVE: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->label, arg->server); } } @@ -1012,35 +1427,34 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (arg->open_flags & O_CREAT) { - case 0: - WRITE32(NFS4_OPEN_NOCREATE); - break; - default: - BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); - WRITE32(NFS4_OPEN_CREATE); - encode_createmode(xdr, arg); + case 0: + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); + break; + default: + *p = cpu_to_be32(NFS4_OPEN_CREATE); + encode_createmode(xdr, arg); } } -static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) +static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type) { __be32 *p; - RESERVE_SPACE(4); + 
p = reserve_space(xdr, 4); switch (delegation_type) { - case 0: - WRITE32(NFS4_OPEN_DELEGATE_NONE); - break; - case FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_READ); - break; - case FMODE_WRITE|FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_WRITE); - break; - default: - BUG(); + case 0: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); + break; + case FMODE_READ: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); + break; + case FMODE_WRITE|FMODE_READ: + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); + break; + default: + BUG(); } } @@ -1048,17 +1462,17 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_NULL); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); encode_string(xdr, name->len, name->name); } -static inline void encode_claim_previous(struct xdr_stream *xdr, int type) +static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); encode_delegation_type(xdr, type); } @@ -1066,1183 +1480,1661 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + encode_nfs4_stateid(xdr, stateid); encode_string(xdr, name->len, name->name); } -static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg) -{ - encode_openhdr(xdr, arg); - encode_opentype(xdr, arg); - switch (arg->claim) { - case NFS4_OPEN_CLAIM_NULL: - encode_claim_null(xdr, arg->name); - break; - case NFS4_OPEN_CLAIM_PREVIOUS: - encode_claim_previous(xdr, arg->u.delegation_type); - break; - case NFS4_OPEN_CLAIM_DELEGATE_CUR: - encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); - break; - 
default: - BUG(); - } - return 0; -} - -static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) +static inline void encode_claim_fh(struct xdr_stream *xdr) { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_CONFIRM); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); - - return 0; + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_FH); } -static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg) +static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid) { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_DOWNGRADE); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); - encode_share_access(xdr, arg->open_flags); - return 0; + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH); + encode_nfs4_stateid(xdr, stateid); } -static int -encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) +static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) { - int len = fh->size; - __be32 *p; - - RESERVE_SPACE(8 + len); - WRITE32(OP_PUTFH); - WRITE32(len); - WRITEMEM(fh->data, len); - - return 0; + encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr); + encode_openhdr(xdr, arg); + encode_opentype(xdr, arg); + switch (arg->claim) { + case NFS4_OPEN_CLAIM_NULL: + encode_claim_null(xdr, arg->name); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + encode_claim_previous(xdr, arg->u.delegation_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); + break; + case NFS4_OPEN_CLAIM_FH: + encode_claim_fh(xdr); + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + encode_claim_delegate_cur_fh(xdr, &arg->u.delegation); + break; + default: + BUG(); + } } -static int encode_putrootfh(struct xdr_stream *xdr) +static void 
encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) { - __be32 *p; - - RESERVE_SPACE(4); - WRITE32(OP_PUTROOTFH); + encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr); + encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_seqid(xdr, arg->seqid); +} - return 0; +static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); + encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_seqid(xdr, arg->seqid); + encode_share_access(xdr, arg->fmode); } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) +static void +encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr) { - nfs4_stateid stateid; - __be32 *p; + encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr); + encode_string(xdr, fh->size, fh->data); +} - RESERVE_SPACE(NFS4_STATEID_SIZE); - if (ctx->state != NULL) { - nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); - WRITEMEM(stateid.data, NFS4_STATEID_SIZE); - } else - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); +static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) +static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READ); - - encode_stateid(xdr, args->context); + encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->stateid); - RESERVE_SPACE(12); - WRITE64(args->offset); - WRITE32(args->count); - - return 0; + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); } -static int encode_readdir(struct xdr_stream *xdr, const 
struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) +static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { - uint32_t attrs[2] = { - FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, + uint32_t attrs[3] = { + FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; - __be32 *p; + uint32_t dircount = readdir->count >> 1; + __be32 *p, verf[2]; + uint32_t attrlen = 0; + unsigned int i; + + if (readdir->plus) { + attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| + FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| + FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| + FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| + FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; + dircount >>= 1; + } + /* Use mounted_on_fileid only if the server supports it */ + if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) + attrs[0] |= FATTR4_WORD0_FILEID; + for (i = 0; i < ARRAY_SIZE(attrs); i++) { + attrs[i] &= readdir->bitmask[i]; + if (attrs[i] != 0) + attrlen = i+1; + } - RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); - WRITE32(OP_READDIR); - WRITE64(readdir->cookie); - WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); - WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ - WRITE32(readdir->count); - WRITE32(2); - /* Switch to mounted_on_fileid if the server supports it */ - if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - attrs[0] &= ~FATTR4_WORD0_FILEID; - else - attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - WRITE32(attrs[0] & readdir->bitmask[0]); - WRITE32(attrs[1] & readdir->bitmask[1]); - dprintk("%s: cookie = %Lu, verifier = 0x%x%x, bitmap = 0x%x%x\n", - __FUNCTION__, + encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); + encode_uint64(xdr, readdir->cookie); + encode_nfs4_verifier(xdr, 
&readdir->verifier); + p = reserve_space(xdr, 12 + (attrlen << 2)); + *p++ = cpu_to_be32(dircount); + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(attrlen); + for (i = 0; i < attrlen; i++) + *p++ = cpu_to_be32(attrs[i]); + memcpy(verf, readdir->verifier.data, sizeof(verf)); + + dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n", + __func__, (unsigned long long)readdir->cookie, - ((u32 *)readdir->verifier.data)[0], - ((u32 *)readdir->verifier.data)[1], + verf[0], verf[1], attrs[0] & readdir->bitmask[0], - attrs[1] & readdir->bitmask[1]); + attrs[1] & readdir->bitmask[1], + attrs[2] & readdir->bitmask[2]); +} - return 0; +static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr); } -static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req) +static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { - __be32 *p; + encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr); + encode_string(xdr, name->len, name->name); +} - RESERVE_SPACE(4); - WRITE32(OP_READLINK); +static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); +} - return 0; +static void encode_renew(struct xdr_stream *xdr, clientid4 clid, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr); + encode_uint64(xdr, clid); } -static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) +static void +encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr); +} + +static void 
+encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_REMOVE); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); + encode_nfs4_stateid(xdr, &zero_stateid); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->acl_len); + xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); +} - return 0; +static void +encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr); +} + +static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); + encode_nfs4_stateid(xdr, &arg->stateid); + encode_attrs(xdr, arg->iap, arg->label, server); } -static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname) +static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(8 + oldname->len); - WRITE32(OP_RENAME); - WRITE32(oldname->len); - WRITEMEM(oldname->name, oldname->len); - - RESERVE_SPACE(4 + newname->len); - WRITE32(newname->len); - WRITEMEM(newname->name, newname->len); + encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); + encode_nfs4_verifier(xdr, setclientid->sc_verifier); - return 0; + encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_prog); + encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); + encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_cb_ident); } -static int 
encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) +static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM, + decode_setclientid_confirm_maxsz, hdr); + encode_uint64(xdr, arg->clientid); + encode_nfs4_verifier(xdr, &arg->confirm); +} + +static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_RENEW); - WRITE64(client_stateid->cl_clientid); + encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->stateid); - return 0; + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); + + xdr_write_pages(xdr, args->pages, args->pgbase, args->count); } -static int -encode_restorefh(struct xdr_stream *xdr) +static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr); + encode_nfs4_stateid(xdr, stateid); +} + +static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr); + encode_string(xdr, name->len, name->name); +} + +#if defined(CONFIG_NFS_V4_1) +/* NFSv4.1 operations */ +static void encode_bind_conn_to_session(struct xdr_stream *xdr, + struct nfs4_session *session, + struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_RESTOREFH); + encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION, + decode_bind_conn_to_session_maxsz, hdr); + encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH); + *p = 0; /* use_conn_in_rdma_mode = False */ +} - return 0; +static void encode_op_map(struct xdr_stream 
*xdr, struct nfs4_op_map *op_map) +{ + unsigned int i; + encode_uint32(xdr, NFS4_OP_MAP_NUM_WORDS); + for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) + encode_uint32(xdr, op_map->u.words[i]); } -static int -encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) +static void encode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args, + struct compound_hdr *hdr) { __be32 *p; + char impl_name[IMPL_NAME_LIMIT]; + int len = 0; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); - RESERVE_SPACE(2*4); - WRITE32(1); - WRITE32(FATTR4_WORD0_ACL); - if (arg->acl_len % 4) - return -EINVAL; - RESERVE_SPACE(4); - WRITE32(arg->acl_len); - xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); - return 0; + encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); + encode_nfs4_verifier(xdr, args->verifier); + + encode_string(xdr, args->id_len, args->id); + + encode_uint32(xdr, args->flags); + encode_uint32(xdr, args->state_protect.how); + + switch (args->state_protect.how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + encode_op_map(xdr, &args->state_protect.enforce); + encode_op_map(xdr, &args->state_protect.allow); + break; + default: + WARN_ON_ONCE(1); + break; + } + + if (send_implementation_id && + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) + <= sizeof(impl_name) + 1) + len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", + utsname()->sysname, utsname()->release, + utsname()->version, utsname()->machine); + + if (len > 0) { + encode_uint32(xdr, 1); /* implementation id array length=1 */ + + encode_string(xdr, + sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1, + CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN); + encode_string(xdr, len, impl_name); + /* just send zeros for nii_date - the date is in nii_name */ + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, 0); + *p = cpu_to_be32(0); + 
} else + encode_uint32(xdr, 0); /* implementation id array length=0 */ } -static int -encode_savefh(struct xdr_stream *xdr) +static void encode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_args *args, + struct compound_hdr *hdr) { __be32 *p; + char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; + uint32_t len; + struct nfs_client *clp = args->client; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + u32 max_resp_sz_cached; - RESERVE_SPACE(4); - WRITE32(OP_SAVEFH); + /* + * Assumes OPEN is the biggest non-idempotent compound. + * 2 is the verifier. + */ + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + + RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; - return 0; + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); + + encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); + p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12); + p = xdr_encode_hyper(p, clp->cl_clientid); + *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ + + /* Fore Channel */ + *p++ = cpu_to_be32(0); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + /* Back Channel */ + *p++ = cpu_to_be32(0); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + 
*p++ = cpu_to_be32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ + + /* authsys_parms rfc1831 */ + *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ + p = xdr_encode_opaque(p, machine_name, len); + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p = cpu_to_be32(0); /* No more gids */ } -static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server) +static void encode_destroy_session(struct xdr_stream *xdr, + struct nfs4_session *session, + struct compound_hdr *hdr) { - int status; - __be32 *p; - - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); + encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr); + encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); +} - if ((status = encode_attrs(xdr, arg->iap, server))) - return status; +static void encode_destroy_clientid(struct xdr_stream *xdr, + uint64_t clientid, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr); + encode_uint64(xdr, clientid); +} - return 0; +static void encode_reclaim_complete(struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr); + encode_uint32(xdr, args->one_fs); } +#endif /* CONFIG_NFS_V4_1 */ -static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) +static void encode_sequence(struct xdr_stream *xdr, + const struct nfs4_sequence_args *args, + struct compound_hdr *hdr) { +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session; + struct nfs4_slot_table *tp; + struct nfs4_slot *slot = args->sa_slot; __be32 *p; - RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID); - WRITEMEM(setclientid->sc_verifier->data, 
NFS4_VERIFIER_SIZE); + tp = slot->table; + session = tp->session; + if (!session) + return; - encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_prog); - encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); - encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_cb_ident); + encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); - return 0; + /* + * Sessionid + seqid + slotid + max slotid + cache_this + */ + dprintk("%s: sessionid=%u:%u:%u:%u seqid=%d slotid=%d " + "max_slotid=%d cache_this=%d\n", + __func__, + ((u32 *)session->sess_id.data)[0], + ((u32 *)session->sess_id.data)[1], + ((u32 *)session->sess_id.data)[2], + ((u32 *)session->sess_id.data)[3], + slot->seq_nr, slot->slot_nr, + tp->highest_used_slotid, args->sa_cache_this); + p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(slot->slot_nr); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p = cpu_to_be32(args->sa_cache_this); +#endif /* CONFIG_NFS_V4_1 */ +} + +#ifdef CONFIG_NFS_V4_1 +static void +encode_getdevicelist(struct xdr_stream *xdr, + const struct nfs4_getdevicelist_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + nfs4_verifier dummy = { + .data = "dummmmmy", + }; + + encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(args->layoutclass); + *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); + xdr_encode_hyper(p, 0ULL); /* cookie */ + encode_nfs4_verifier(xdr, &dummy); } -static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) +static void +encode_getdeviceinfo(struct xdr_stream *xdr, + const struct nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) { - __be32 *p; - - RESERVE_SPACE(12 + 
NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID_CONFIRM); - WRITE64(client_state->cl_clientid); - WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + __be32 *p; - return 0; + encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); + p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); + p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(args->pdev->layout_type); + *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ + *p++ = cpu_to_be32(0); /* bitmap length 0 */ } -static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args) +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_WRITE); - - encode_stateid(xdr, args->context); + encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr); + p = reserve_space(xdr, 36); + *p++ = cpu_to_be32(0); /* Signal layout available */ + *p++ = cpu_to_be32(args->type); + *p++ = cpu_to_be32(args->range.iomode); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + p = xdr_encode_hyper(p, args->minlength); + encode_nfs4_stateid(xdr, &args->stateid); + encode_uint32(xdr, args->maxcount); + + dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", + __func__, + args->type, + args->range.iomode, + (unsigned long)args->range.offset, + (unsigned long)args->range.length, + args->maxcount); +} - RESERVE_SPACE(16); - WRITE64(args->offset); - WRITE32(args->stable); - WRITE32(args->count); +static int +encode_layoutcommit(struct xdr_stream *xdr, + struct inode *inode, + const struct nfs4_layoutcommit_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; - xdr_write_pages(xdr, args->pages, args->pgbase, args->count); + dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, + NFS_SERVER(args->inode)->pnfs_curr_ld->id); + + encode_op_hdr(xdr, OP_LAYOUTCOMMIT, 
decode_layoutcommit_maxsz, hdr); + p = reserve_space(xdr, 20); + /* Only whole file layouts */ + p = xdr_encode_hyper(p, 0); /* offset */ + p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ + *p = cpu_to_be32(0); /* reclaim */ + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ + *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ + + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) + NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( + NFS_I(inode)->layout, xdr, args); + else + encode_uint32(xdr, 0); /* no layout-type payload */ return 0; } -static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) +static void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(0); /* reclaim. 
always 0 for now */ + *p++ = cpu_to_be32(args->layout_type); + *p++ = cpu_to_be32(IOMODE_ANY); + *p = cpu_to_be32(RETURN_FILE); + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, 0); + p = xdr_encode_hyper(p, NFS4_MAX_UINT64); + spin_lock(&args->inode->i_lock); + encode_nfs4_stateid(xdr, &args->stateid); + spin_unlock(&args->inode->i_lock); + if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { + NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( + NFS_I(args->inode)->layout, xdr, args); + } else + encode_uint32(xdr, 0); +} - WRITE32(OP_DELEGRETURN); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); +static int +encode_secinfo_no_name(struct xdr_stream *xdr, + const struct nfs41_secinfo_no_name_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr); + encode_uint32(xdr, args->style); return 0; +} + +static void encode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr); + encode_uint32(xdr, 1); + encode_nfs4_stateid(xdr, args->stateid); +} +static void encode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->stateid); } +#endif /* CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" ENCODE ROUTINES. 
*/ +static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session = args->sa_slot->table->session; + if (session) + return session->clp->cl_mvops->minor_version; +#endif /* CONFIG_NFS_V4_1 */ + return 0; +} + /* * Encode an ACCESS request */ -static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args) +static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_accessargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status != 0) - goto out; - status = encode_access(&xdr, args->access); - if (status != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_access(xdr, args->access, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode LOOKUP request */ -static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args) +static void nfs4_xdr_enc_lookup(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_lookup_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 4, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) - goto out; - if ((status = encode_lookup(&xdr, args->name)) != 0) - goto out; - if ((status = encode_getfh(&xdr)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + 
encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode LOOKUP_ROOT request */ -static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args) +static void nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_lookup_root_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putrootfh(&xdr)) != 0) - goto out; - if ((status = encode_getfh(&xdr)) == 0) - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode REMOVE request */ -static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs *args) +static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_removeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fh)) != 0) - goto out; - if ((status = encode_remove(&xdr, &args->name)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_remove(xdr, &args->name, &hdr); + encode_nops(&hdr); } /* * Encode RENAME request */ 
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args) +static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs_renameargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 7, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->old_dir)) != 0) - goto out; - if ((status = encode_savefh(&xdr)) != 0) - goto out; - if ((status = encode_putfh(&xdr, args->new_dir)) != 0) - goto out; - if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0) - goto out; - if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) - goto out; - if ((status = encode_restorefh(&xdr)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->old_dir, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->new_dir, &hdr); + encode_rename(xdr, args->old_name, args->new_name, &hdr); + encode_nops(&hdr); } /* * Encode LINK request */ -static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args) +static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_link_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 7, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fh)) != 0) - goto out; - if ((status = encode_savefh(&xdr)) != 0) - goto out; - if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) - goto out; - if ((status = encode_link(&xdr, args->name)) != 0) - goto out; - if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) - goto out; - if 
((status = encode_restorefh(&xdr)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_link(xdr, args->name, &hdr); + encode_restorefh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode CREATE request */ -static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) +static void nfs4_xdr_enc_create(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 7, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) - goto out; - if ((status = encode_savefh(&xdr)) != 0) - goto out; - if ((status = encode_create(&xdr, args)) != 0) - goto out; - if ((status = encode_getfh(&xdr)) != 0) - goto out; - if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) - goto out; - if ((status = encode_restorefh(&xdr)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_create(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode SYMLINK request */ -static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args) +static void nfs4_xdr_enc_symlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_create_arg *args) { - return nfs4_xdr_enc_create(req, p, args); + nfs4_xdr_enc_create(req, xdr, args); } /* * Encode GETATTR request */ -static 
int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args) +static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_getattr_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fh)) == 0) - status = encode_getfattr(&xdr, args->bitmask); - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode a CLOSE request */ -static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) -{ - struct xdr_stream xdr; - struct compound_hdr hdr = { - .nops = 3, - }; - int status; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_close(&xdr, args); - if (status != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; +static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_closeargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_close(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode an OPEN request */ -static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_openargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 7, + .minorversion = 
nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_savefh(&xdr); - if (status) - goto out; - status = encode_open(&xdr, args); - if (status) - goto out; - status = encode_getfh(&xdr); - if (status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); - if (status) - goto out; - status = encode_restorefh(&xdr); - if (status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open(xdr, args, &hdr); + encode_getfh(xdr, &hdr); + if (args->access) + encode_access(xdr, args->access, &hdr); + encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); + encode_nops(&hdr); } /* * Encode an OPEN_CONFIRM request */ -static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args) +static void nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_open_confirmargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_open_confirm(&xdr, args); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_confirm(xdr, args, &hdr); + encode_nops(&hdr); } /* * Encode an OPEN request with no attributes. 
*/ -static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args) +static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_openargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_open(&xdr, args); - if (status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open(xdr, args, &hdr); + if (args->access) + encode_access(xdr, args->access, &hdr); + encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); + encode_nops(&hdr); } /* * Encode an OPEN_DOWNGRADE request */ -static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) +static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_closeargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_open_downgrade(&xdr, args); - if (status != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_open_downgrade(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode a LOCK request */ -static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args 
*args) +static void nfs4_xdr_enc_lock(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lock_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_lock(&xdr, args); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lock(xdr, args, &hdr); + encode_nops(&hdr); } /* * Encode a LOCKT request */ -static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args) +static void nfs4_xdr_enc_lockt(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_lockt_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_lockt(&xdr, args); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_lockt(xdr, args, &hdr); + encode_nops(&hdr); } /* * Encode a LOCKU request */ -static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args) +static void nfs4_xdr_enc_locku(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_locku_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_locku(&xdr, args); -out: - return status; + 
encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_locku(xdr, args, &hdr); + encode_nops(&hdr); +} + +static void nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_release_lockowner_args *args) +{ + struct compound_hdr hdr = { + .minorversion = 0, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_release_lockowner(xdr, &args->lock_owner, &hdr); + encode_nops(&hdr); } /* * Encode a READLINK request */ -static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args) +static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readlink *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - unsigned int replen; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_readlink(&xdr, args, req); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readlink(xdr, args, req, &hdr); - /* set up reply kvec - * toplevel_status + taglen + rescount + OP_PUTFH + status - * + OP_READLINK + status + string length = 8 - */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->pglen); - -out: - return status; + encode_nops(&hdr); } /* * Encode a READDIR request */ -static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args) +static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_readdir_arg 
*args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - int replen; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_readdir(&xdr, args, req); + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_readdir(xdr, args, req, &hdr); - /* set up reply kvec - * toplevel_status + taglen + rescount + OP_PUTFH + status - * + OP_READDIR + status + verifer(2) = 9 - */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readdir_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->count); dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", - __FUNCTION__, replen, args->pages, + __func__, hdr.replen << 2, args->pages, args->pgbase, args->count); - -out: - return status; + encode_nops(&hdr); } /* * Encode a READ request */ -static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) +static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_args *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int replen, status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_read(&xdr, args); - if (status) - goto out; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_read(xdr, args, &hdr); - /* set up reply kvec - * toplevel 
status + taglen=0 + rescount + OP_PUTFH + status - * + OP_READ + status + eof + datalen = 9 - */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, args->pgbase, args->count); req->rq_rcv_buf.flags |= XDRBUF_READ; -out: - return status; + encode_nops(&hdr); } /* * Encode an SETATTR request */ -static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) - -{ - struct xdr_stream xdr; - struct compound_hdr hdr = { - .nops = 3, - }; - int status; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_setattr(&xdr, args, args->server); - if(status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; +static void nfs4_xdr_enc_setattr(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setattrargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setattr(xdr, args, args->server, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * Encode a GETACL request */ -static int -nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, - struct nfs_getaclargs *args) +static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_getaclargs *args) { - struct xdr_stream xdr; - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int replen, status; + uint32_t replen; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = 
encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0); - /* set up reply buffer: */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 1; + encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len); -out: - return status; + + encode_nops(&hdr); } /* * Encode a WRITE request */ -static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_pgio_args *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_write(&xdr, args); - if (status) - goto out; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_write(xdr, args, &hdr); req->rq_snd_buf.flags |= XDRBUF_WRITE; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * a COMMIT request */ -static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args) +static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_commitargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, 
args->fh); - if (status) - goto out; - status = encode_commit(&xdr, args); - if (status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_commit(xdr, args, &hdr); + encode_nops(&hdr); } /* * FSINFO request */ -static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args) +static void nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (!status) - status = encode_fsinfo(&xdr, args->bitmask); - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_fsinfo(xdr, args->bitmask, &hdr); + encode_nops(&hdr); } /* * a PATHCONF request */ -static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args) +static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_pathconf_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (!status) - status = encode_getattr_one(&xdr, - args->bitmask[0] & nfs4_pathconf_bitmap[0]); - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], + &hdr); + encode_nops(&hdr); } /* * a STATFS request */ 
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args) +static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + const struct nfs4_statfs_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status == 0) - status = encode_getattr_two(&xdr, - args->bitmask[0] & nfs4_statfs_bitmap[0], - args->bitmask[1] & nfs4_statfs_bitmap[1]); - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], + args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_nops(&hdr); } /* * GETATTR_BITMAP request */ -static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle) +static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, fhandle); - if (status == 0) - status = encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| - FATTR4_WORD0_LINK_SUPPORT| - FATTR4_WORD0_SYMLINK_SUPPORT| - FATTR4_WORD0_ACLSUPPORT); - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| + FATTR4_WORD0_FH_EXPIRE_TYPE| + FATTR4_WORD0_LINK_SUPPORT| + FATTR4_WORD0_SYMLINK_SUPPORT| + FATTR4_WORD0_ACLSUPPORT, &hdr); + encode_nops(&hdr); } /* * a RENEW request */ -static int 
nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) +static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_client *clp) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 1, + .nops = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - return encode_renew(&xdr, clp); + encode_compound_hdr(xdr, req, &hdr); + encode_renew(xdr, clp->cl_clientid, &hdr); + encode_nops(&hdr); } /* * a SETCLIENTID request */ -static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc) +static void nfs4_xdr_enc_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid *sc) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 1, + .nops = 0, }; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - return encode_setclientid(&xdr, sc); + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid(xdr, sc, &hdr); + encode_nops(&hdr); } /* * a SETCLIENTID_CONFIRM request */ -static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) +static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *arg) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_setclientid_confirm(&xdr, clp); - if (!status) - status = encode_putrootfh(&xdr); - if (!status) - status = encode_fsinfo(&xdr, lease_bitmap); - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_setclientid_confirm(xdr, arg, &hdr); + encode_nops(&hdr); } /* * DELEGRETURN request */ -static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args) +static void 
nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfs4_delegreturnargs *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - int status; - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fhandle); - if (status != 0) - goto out; - status = encode_delegreturn(&xdr, args->stateid); - if (status != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fhandle, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_delegreturn(xdr, args->stateid, &hdr); + encode_nops(&hdr); } /* * Encode FS_LOCATIONS request */ -static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args) +static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_arg *args) { - struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; - int replen; - int status; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + if (args->migration) { + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + } else { + encode_putfh(xdr, args->dir_fh, &hdr); + encode_lookup(xdr, args->name, &hdr); + replen = hdr.replen; + encode_fs_locations(xdr, args->bitmask, &hdr); + } - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) - goto out; - if ((status = encode_lookup(&xdr, args->name)) != 0) - goto out; - if 
((status = encode_fs_locations(&xdr, args->bitmask)) != 0) - goto out; - /* set up reply - * toplevel_status + OP_PUTFH + status - * + OP_LOOKUP + status + OP_GETATTR + status = 7 - */ - replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2; - xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page, + /* Set up reply kvec to capture returned fs_locations array. */ + xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page, 0, PAGE_SIZE); -out: - return status; + encode_nops(&hdr); } /* - * START OF "GENERIC" DECODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... + * Encode SECINFO request + */ +static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_secinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_secinfo(xdr, args->name, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FSID_PRESENT request + */ +static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fsid_present_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getfh(xdr, &hdr); + if (args->renew) + encode_renew(xdr, args->clientid, &hdr); + encode_nops(&hdr); +} + +#if defined(CONFIG_NFS_V4_1) +/* + * BIND_CONN_TO_SESSION request + */ +static void 
nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_client *clp) +{ + struct compound_hdr hdr = { + .minorversion = clp->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_bind_conn_to_session(xdr, clp->cl_session, &hdr); + encode_nops(&hdr); +} + +/* + * EXCHANGE_ID request */ -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) -#define READTIME(x) do { \ - p++; \ - (x.tv_sec) = ntohl(*p++); \ - (x.tv_nsec) = ntohl(*p++); \ -} while (0) -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - -#define READ_BUF(nbytes) do { \ - p = xdr_inline_decode(xdr, nbytes); \ - if (unlikely(!p)) { \ - dprintk("nfs: %s: prematurely hit end of receive" \ - " buffer\n", __FUNCTION__); \ - dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ - __FUNCTION__, xdr->p, nbytes, xdr->end); \ - return -EIO; \ - } \ -} while (0) +static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_exchange_id_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_exchange_id(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a CREATE_SESSION request + */ +static void nfs4_xdr_enc_create_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_create_session_args *args) +{ + struct compound_hdr hdr = { + .minorversion = args->client->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_create_session(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a DESTROY_SESSION request + */ +static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_session *session) +{ + struct compound_hdr hdr = { + .minorversion = session->clp->cl_mvops->minor_version, + }; + + 
encode_compound_hdr(xdr, req, &hdr); + encode_destroy_session(xdr, session, &hdr); + encode_nops(&hdr); +} + +/* + * a DESTROY_CLIENTID request + */ +static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_client *clp) +{ + struct compound_hdr hdr = { + .minorversion = clp->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_destroy_clientid(xdr, clp->cl_clientid, &hdr); + encode_nops(&hdr); +} + +/* + * a SEQUENCE request + */ +static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_sequence_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * a GET_LEASE_TIME request + */ +static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_get_lease_time_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->la_seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_fsinfo(xdr, lease_bitmap, &hdr); + encode_nops(&hdr); +} + +/* + * a RECLAIM_COMPLETE request + */ +static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args) + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_reclaim_complete(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode GETDEVICELIST request + */ +static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdevicelist_args *args) +{ + struct compound_hdr hdr = { + .minorversion = 
nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getdevicelist(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode GETDEVICEINFO request + */ +static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(xdr, args, &hdr); + + /* set up reply kvec. Subtract notification bitmap max size (2) + * so that notification bitmap is put in xdr_buf tail */ + xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, + args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen); + + encode_nops(&hdr); +} + +/* + * Encode LAYOUTGET request + */ +static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutget_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->layout.pages, 0, args->layout.pglen); + + encode_nops(&hdr); +} + +/* + * Encode LAYOUTCOMMIT request + */ +static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_args *args) +{ + struct nfs4_layoutcommit_data *data = + container_of(args, struct nfs4_layoutcommit_data, args); + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutcommit(xdr, 
data->args.inode, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode LAYOUTRETURN request + */ +static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutreturn(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode SECINFO_NO_NAME request + */ +static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_secinfo_no_name_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_secinfo_no_name(xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode TEST_STATEID request + */ +static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_test_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FREE_STATEID request + */ +static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_free_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} +#endif /* CONFIG_NFS_V4_1 */ + +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + 
dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { __be32 *p; - READ_BUF(4); - READ32(*len); - READ_BUF(*len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; *string = (char *)p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - READ_BUF(8); - READ32(hdr->status); - READ32(hdr->taglen); - - READ_BUF(hdr->taglen + 4); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); - READ32(hdr->nops); + hdr->nops = be32_to_cpup(p); + if (unlikely(hdr->nops < 1)) + return nfs4_stat_to_errno(hdr->status); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, + int *nfs_retval) { __be32 *p; uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); - if (opnum != expected) { - dprintk("nfs: Server returned operation" - " %d but we issued a request for %d\n", - opnum, expected); - return -EIO; - } - READ32(nfserr); - if (nfserr != NFS_OK) - return -nfs4_stat_to_errno(nfserr); - return 0; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); + if (unlikely(opnum != expected)) + goto out_bad_operation; + nfserr = be32_to_cpup(p); + if (nfserr == NFS_OK) + *nfs_retval = 0; + else + *nfs_retval = 
nfs4_stat_to_errno(nfserr); + return true; +out_bad_operation: + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + *nfs_retval = -EREMOTEIO; + return false; +out_overflow: + print_overflow_msg(__func__, xdr); + *nfs_retval = -EIO; + return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + int retval; + + __decode_op_hdr(xdr, expected, &retval); + return retval; } /* Dummy routine */ @@ -2252,8 +3144,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) unsigned int strlen; char *str; - READ_BUF(12); - return decode_opaque_inline(xdr, &strlen, &str); + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -2261,91 +3156,153 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) uint32_t bmlen; __be32 *p; - READ_BUF(4); - READ32(bmlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); - bitmap[0] = bitmap[1] = 0; - READ_BUF((bmlen << 2)); + bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; if (bmlen > 0) { - READ32(bitmap[0]); - if (bmlen > 1) - READ32(bitmap[1]); + bitmap[0] = be32_to_cpup(p++); + if (bmlen > 1) { + bitmap[1] = be32_to_cpup(p++); + if (bmlen > 2) + bitmap[2] = be32_to_cpup(p); + } } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) +static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep) { __be32 *p; - READ_BUF(4); - READ32(*attrlen); - *savep = xdr->p; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *attrlen = 
be32_to_cpup(p); + *savep = xdr_stream_pos(xdr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) { if (likely(bitmap[0] & FATTR4_WORD0_SUPPORTED_ATTRS)) { - decode_attr_bitmap(xdr, bitmask); + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; } else - bitmask[0] = bitmask[1] = 0; - dprintk("%s: bitmask=0x%x%x\n", __FUNCTION__, bitmask[0], bitmask[1]); + bitmask[0] = bitmask[1] = bitmask[2] = 0; + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, + bitmask[0], bitmask[1], bitmask[2]); return 0; } static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) { __be32 *p; + int ret = 0; *type = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { - READ_BUF(4); - READ32(*type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { - dprintk("%s: bad type %d\n", __FUNCTION__, *type); + dprintk("%s: bad type %d\n", __func__, *type); return -EIO; } bitmap[0] &= ~FATTR4_WORD0_TYPE; + ret = NFS_ATTR_FATTR_TYPE; } - dprintk("%s: type=0%o\n", __FUNCTION__, nfs_type2fmt[*type].nfs2type); + dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_fh_expire_type(struct xdr_stream *xdr, + uint32_t *bitmap, uint32_t *type) +{ + __be32 *p; + + *type = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; + } + dprintk("%s: expire type=0x%x\n", __func__, *type); 
return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) { __be32 *p; + int ret = 0; *change = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { - READ_BUF(8); - READ64(*change); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; + ret = NFS_ATTR_FATTR_CHANGE; } - dprintk("%s: change attribute=%Lu\n", __FUNCTION__, + dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); - return 0; + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) { __be32 *p; + int ret = 0; *size = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { - READ_BUF(8); - READ64(*size); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; + ret = NFS_ATTR_FATTR_SIZE; } - dprintk("%s: file size=%Lu\n", __FUNCTION__, (unsigned long long)*size); - return 0; + dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2356,12 +3313,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } - dprintk("%s: link support=%s\n", __FUNCTION__, *res == 0 ? 
"false" : "true"); + dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2372,32 +3334,44 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } - dprintk("%s: symlink support=%s\n", __FUNCTION__, *res == 0 ? "false" : "true"); + dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) { __be32 *p; + int ret = 0; fsid->major = 0; fsid->minor = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { - READ_BUF(16); - READ64(fsid->major); - READ64(fsid->minor); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &fsid->major); + xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; + ret = NFS_ATTR_FATTR_FSID; } - dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __FUNCTION__, + dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__, (unsigned long long)fsid->major, (unsigned long long)fsid->minor); - return 0; + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2408,60 +3382,135 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) return -EIO; 
if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } - dprintk("%s: file size=%u\n", __FUNCTION__, (unsigned int)*res); + dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) +{ + __be32 *p; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_RDATTR_ERROR - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + *res = -be32_to_cpup(p); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) +{ + __be32 *p; + int len; + + if (fh != NULL) + memset(fh, 0, sizeof(*fh)); + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEHANDLE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE) + return -EIO; + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (fh != NULL) { + memcpy(fh->data, p, len); + fh->size = len; + } + bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) { __be32 *p; - *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; + *res = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + 
goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } - dprintk("%s: ACLs supported=%u\n", __FUNCTION__, (unsigned int)*res); + dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; + int ret = 0; *fileid = 0; if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; + ret = NFS_ATTR_FATTR_FILEID; } - dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid); - return 0; + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; + int ret = 0; *fileid = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; } - dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid); - return 0; + dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2473,12 +3522,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) 
return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } - dprintk("%s: files avail=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2490,12 +3544,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } - dprintk("%s: files free=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2507,12 +3566,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } - dprintk("%s: files total=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname 
*path) @@ -2521,41 +3585,43 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) __be32 *p; int status = 0; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n == 0) goto root_path; - dprintk("path "); - path->ncomponents = 0; - while (path->ncomponents < n) { + dprintk("pathname4: "); + if (n > NFS4_PATHNAME_MAXCOMPONENTS) { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) { struct nfs4_string *component = &path->components[path->ncomponents]; status = decode_opaque_inline(xdr, &component->len, &component->data); if (unlikely(status != 0)) goto out_eio; - if (path->ncomponents != n) - dprintk("/"); - dprintk("%s", component->data); - if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) - path->ncomponents++; - else { - dprintk("cannot parse %d components in path\n", n); - goto out_eio; - } + ifdebug (XDR) + pr_cont("%s%.*s ", + (path->ncomponents != n ? 
"/ " : ""), + component->len, component->data); } out: - dprintk("\n"); return status; root_path: /* a root pathname is sent as a zero component4 */ path->ncomponents = 1; path->components[0].len=0; path->components[0].data=NULL; - dprintk("path /\n"); + dprintk("pathname4: /\n"); goto out; out_eio: dprintk(" status %d", status); status = -EIO; goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -2569,37 +3635,41 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = 0; if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) goto out; - dprintk("%s: fsroot ", __FUNCTION__); + status = -EIO; + /* Ignore borken servers that return unrequested attrs */ + if (unlikely(res == NULL)) + goto out; + dprintk("%s: fsroot:\n", __func__); status = decode_pathname(xdr, &res->fs_path); if (unlikely(status != 0)) goto out; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n <= 0) goto out_eio; - res->nlocations = 0; - while (res->nlocations < n) { + for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; - struct nfs4_fs_location *loc = &res->locations[res->nlocations]; + struct nfs4_fs_location *loc; - READ_BUF(4); - READ32(m); + if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES) + break; + loc = &res->locations[res->nlocations]; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + m = be32_to_cpup(p); - loc->nservers = 0; - dprintk("%s: servers ", __FUNCTION__); - while (loc->nservers < m) { - struct nfs4_string *server = &loc->servers[loc->nservers]; - status = decode_opaque_inline(xdr, &server->len, &server->data); - if (unlikely(status != 0)) - goto out_eio; - dprintk("%s ", server->data); - if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) - loc->nservers++; - else { + 
dprintk("%s: servers:\n", __func__); + for (loc->nservers = 0; loc->nservers < m; loc->nservers++) { + struct nfs4_string *server; + + if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) { unsigned int i; dprintk("%s: using first %u of %u servers " "returned for location %u\n", - __FUNCTION__, + __func__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations); for (i = loc->nservers; i < m; i++) { @@ -2609,17 +3679,25 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(status != 0)) goto out_eio; } + break; } + server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); } status = decode_pathname(xdr, &loc->rootpath); if (unlikely(status != 0)) goto out_eio; - if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) - res->nlocations++; } + if (res->nlocations != 0) + status = NFS_ATTR_FATTR_V4_LOCATIONS; out: - dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status); + dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; +out_overflow: + print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -2634,12 +3712,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } - dprintk("%s: maxfilesize=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -2651,12 +3734,17 @@ static int 
decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { - READ_BUF(4); - READ32(*maxlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } - dprintk("%s: maxlink=%u\n", __FUNCTION__, *maxlink); + dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -2668,12 +3756,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { - READ_BUF(4); - READ32(*maxname); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } - dprintk("%s: maxname=%u\n", __FUNCTION__, *maxname); + dprintk("%s: maxname=%u\n", __func__, *maxname); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2686,15 +3779,20 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { uint64_t maxread; - READ_BUF(8); - READ64(maxread); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; *res = (uint32_t)maxread; bitmap[0] &= ~FATTR4_WORD0_MAXREAD; } - dprintk("%s: maxread=%lu\n", __FUNCTION__, (unsigned long)*res); + dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int 
decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2707,104 +3805,161 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { uint64_t maxwrite; - READ_BUF(8); - READ64(maxwrite); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; *res = (uint32_t)maxwrite; bitmap[0] &= ~FATTR4_WORD0_MAXWRITE; } - dprintk("%s: maxwrite=%lu\n", __FUNCTION__, (unsigned long)*res); + dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode) +static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) { + uint32_t tmp; __be32 *p; + int ret = 0; *mode = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { - READ_BUF(4); - READ32(*mode); - *mode &= ~S_IFMT; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + tmp = be32_to_cpup(p); + *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; + ret = NFS_ATTR_FATTR_MODE; } - dprintk("%s: file mode=0%o\n", __FUNCTION__, (unsigned int)*mode); - return 0; + dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) { __be32 *p; + int ret = 0; *nlink = 1; if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { - READ_BUF(4); - READ32(*nlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; + ret = NFS_ATTR_FATTR_NLINK; } - 
dprintk("%s: nlink=%u\n", __FUNCTION__, (unsigned int)*nlink); - return 0; + dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, + const struct nfs_server *server, kuid_t *uid, + struct nfs4_string *owner_name) { uint32_t len; __be32 *p; + int ret = 0; - *uid = -2; + *uid = make_kuid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0) + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (owner_name != NULL) { + owner_name->data = kmemdup(p, len, GFP_NOWAIT); + if (owner_name->data != NULL) { + owner_name->len = len; + ret = NFS_ATTR_FATTR_OWNER_NAME; + } + } else if (len < XDR_MAX_NETOBJ) { + if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) + ret = NFS_ATTR_FATTR_OWNER; + else dprintk("%s: nfs_map_name_to_uid failed!\n", - __FUNCTION__); + __func__); } else dprintk("%s: name too long (%u)!\n", - __FUNCTION__, len); + __func__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER; } - dprintk("%s: uid=%d\n", __FUNCTION__, (int)*uid); - return 0; + dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid)); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, + const struct nfs_server *server, kgid_t *gid, + struct nfs4_string *group_name) { 
uint32_t len; __be32 *p; + int ret = 0; - *gid = -2; + *gid = make_kgid(&init_user_ns, -2); if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0) + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (group_name != NULL) { + group_name->data = kmemdup(p, len, GFP_NOWAIT); + if (group_name->data != NULL) { + group_name->len = len; + ret = NFS_ATTR_FATTR_GROUP_NAME; + } + } else if (len < XDR_MAX_NETOBJ) { + if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) + ret = NFS_ATTR_FATTR_GROUP; + else dprintk("%s: nfs_map_group_to_gid failed!\n", - __FUNCTION__); + __func__); } else dprintk("%s: name too long (%u)!\n", - __FUNCTION__, len); + __func__, len); bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; } - dprintk("%s: gid=%d\n", __FUNCTION__, (int)*gid); - return 0; + dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid)); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) { uint32_t major = 0, minor = 0; __be32 *p; + int ret = 0; *rdev = MKDEV(0,0); if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U))) @@ -2812,16 +3967,22 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { dev_t tmp; - READ_BUF(8); - READ32(major); - READ32(minor); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + major = be32_to_cpup(p++); + minor = be32_to_cpup(p); tmp = MKDEV(major, minor); if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; bitmap[1] &= ~ FATTR4_WORD1_RAWDEV; + ret = NFS_ATTR_FATTR_RDEV; } - dprintk("%s: 
rdev=(0x%x:0x%x)\n", __FUNCTION__, major, minor); - return 0; + dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2833,12 +3994,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } - dprintk("%s: space avail=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2850,12 +4016,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } - dprintk("%s: space free=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2867,29 +4038,41 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { - READ_BUF(8); - 
READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } - dprintk("%s: space total=%Lu\n", __FUNCTION__, (unsigned long long)*res); + dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) { __be32 *p; + int ret = 0; *used = 0; if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { - READ_BUF(8); - READ64(*used); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; + ret = NFS_ATTR_FATTR_SPACE_USED; } - dprintk("%s: space used=%Lu\n", __FUNCTION__, + dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); - return 0; + return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -2898,12 +4081,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) uint64_t sec; uint32_t nsec; - READ_BUF(12); - READ64(sec); - READ32(nsec); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &sec); + nsec = be32_to_cpup(p); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -2916,9 +4104,11 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_ATIME; bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } - 
dprintk("%s: atime=%ld\n", __FUNCTION__, (long)time->tv_sec); + dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); return status; } @@ -2932,12 +4122,82 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_CTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } - dprintk("%s: ctime=%ld\n", __FUNCTION__, (long)time->tv_sec); + dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); return status; } +static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, + struct timespec *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_DELTA - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_DELTA)) { + status = decode_attr_time(xdr, time); + bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; + } + dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, + (long)time->tv_nsec); + return status; +} + +static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_label *label) +{ + uint32_t pi = 0; + uint32_t lfs = 0; + __u32 len; + __be32 *p; + int status = 0; + + if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + lfs = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pi = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (len < NFS4_MAXLABELLEN) { + if (label) { + memcpy(label->label, p, len); + label->len = len; + label->pi = pi; + label->lfs = lfs; + status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; + } + bitmap[2] &= 
~FATTR4_WORD2_SECURITY_LABEL; + } else + printk(KERN_WARNING "%s: label too long (%u)!\n", + __func__, len); + } + if (label && label->label) + dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, + (char *)label->label, label->len, label->pi, label->lfs); + return status; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) { int status = 0; @@ -2948,21 +4208,23 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) { status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_MTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } - dprintk("%s: mtime=%ld\n", __FUNCTION__, (long)time->tv_sec); + dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); return status; } -static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen) +static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen) { unsigned int attrwords = XDR_QUADLEN(attrlen); - unsigned int nwords = xdr->p - savep; + unsigned int nwords = (xdr_stream_pos(xdr) - savep) >> 2; if (unlikely(attrwords != nwords)) { dprintk("%s: server returned incorrect attribute length: " "%u %c %u\n", - __FUNCTION__, + __func__, attrwords << 2, (attrwords < nwords) ? 
'<' : '>', nwords << 2); @@ -2975,14 +4237,19 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c { __be32 *p; - READ_BUF(20); - READ32(cinfo->atomic); - READ64(cinfo->before); - READ64(cinfo->after); + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + cinfo->atomic = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &cinfo->before); + xdr_decode_hyper(p, &cinfo->after); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) +static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) { __be32 *p; uint32_t supp, acc; @@ -2991,38 +4258,67 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) status = decode_op_hdr(xdr, OP_ACCESS); if (status) return status; - READ_BUF(8); - READ32(supp); - READ32(acc); - access->supported = supp; - access->access = acc; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p); + *supported = supp; + *access = acc; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ int status; status = decode_op_hdr(xdr, OP_CLOSE); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + 
if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } -static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE); +} + +static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifier *verifier) +{ + return decode_opaque_fixed(xdr, verifier->data, NFS4_VERIFIER_SIZE); +} + +static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_COMMIT); - if (status) - return status; - READ_BUF(8); - COPYMEM(res->verf->verifier, 8); - return 0; + if (!status) + status = decode_write_verifier(xdr, &res->verf->verifier); + return status; } static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3036,17 +4332,22 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; if ((status = decode_change_info(xdr, cinfo))) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) { - __be32 *savep; - uint32_t attrlen, - bitmap[2] = {0}; + unsigned int savep; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3057,6 +4358,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_fh_expire_type(xdr, bitmap, + &res->fh_expire_type)) != 0) + goto xdr_error; if ((status = 
decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0) goto xdr_error; if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) @@ -3065,17 +4369,16 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } - + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) { - __be32 *savep; - uint32_t attrlen, - bitmap[2] = {0}; + unsigned int savep; + uint32_t attrlen, bitmap[3] = {0}; int status; - + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto xdr_error; if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) @@ -3098,17 +4401,16 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) status = verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) { - __be32 *savep; - uint32_t attrlen, - bitmap[2] = {0}; + unsigned int savep; + uint32_t attrlen, bitmap[3] = {0}; int status; - + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto xdr_error; if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) @@ -3123,83 +4425,366 @@ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf status = verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } -static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) +static int decode_threshold_hint(struct xdr_stream *xdr, + uint32_t *bitmap, + uint64_t *res, + uint32_t hint_bit) { - __be32 *savep; - uint32_t attrlen, - 
bitmap[2] = {0}, - type; - int status, fmode = 0; - uint64_t fileid; + __be32 *p; - if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + *res = 0; + if (likely(bitmap[0] & hint_bit)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_first_threshold_item4(struct xdr_stream *xdr, + struct nfs4_threshold *res) +{ + __be32 *p; + unsigned int savep; + uint32_t bitmap[3] = {0,}, attrlen; + int status; + + /* layout type */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) { + print_overflow_msg(__func__, xdr); + return -EIO; + } + res->l_type = be32_to_cpup(p); + + /* thi_hintset bitmap */ + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) goto xdr_error; - if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + + /* thi_hintlist length */ + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) + goto xdr_error; + /* thi_hintlist */ + status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz, + THRESHOLD_RD_IO); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz, + THRESHOLD_WR_IO); + if (status < 0) goto xdr_error; - fattr->bitmap[0] = bitmap[0]; - fattr->bitmap[1] = bitmap[1]; + status = verify_attr_len(xdr, savep, attrlen); + res->bm = bitmap[0]; - if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", + __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz, + res->wr_io_sz); +xdr_error: + dprintk("%s ret=%d!\n", __func__, status); + return status; +} + +/* + * Thresholds on pNFS direct I/O vrs MDS I/O + */ +static int 
decode_attr_mdsthreshold(struct xdr_stream *xdr, + uint32_t *bitmap, + struct nfs4_threshold *res) +{ + __be32 *p; + int status = 0; + uint32_t num; + + if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U))) + return -EIO; + if (bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD) { + /* Did the server return an unrequested attribute? */ + if (unlikely(res == NULL)) + return -EREMOTEIO; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + if (num == 0) + return 0; + if (num > 1) + printk(KERN_INFO "%s: Warning: Multiple pNFS layout " + "drivers per filesystem not supported\n", + __func__); + + status = decode_first_threshold_item4(xdr, res); + bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD; + } + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_fattr *fattr, struct nfs_fh *fh, + struct nfs4_fs_locations *fs_loc, struct nfs4_label *label, + const struct nfs_server *server) +{ + int status; + umode_t fmode = 0; + uint32_t type; + int32_t err; + + status = decode_attr_type(xdr, bitmap, &type); + if (status < 0) + goto xdr_error; + fattr->mode = 0; + if (status != 0) { + fattr->mode |= nfs_type2fmt[type]; + fattr->valid |= status; + } + + status = decode_attr_change(xdr, bitmap, &fattr->change_attr); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_size(xdr, bitmap, &fattr->size); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + status = decode_attr_fsid(xdr, bitmap, &fattr->fsid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + + err = 0; + status = decode_attr_error(xdr, bitmap, &err); + if (status < 0) + goto xdr_error; + + status = decode_attr_filehandle(xdr, bitmap, fh); + if (status < 0) goto xdr_error; + status = decode_attr_fileid(xdr, bitmap, &fattr->fileid); + if (status < 0) + goto xdr_error; + fattr->valid |= status; - if ((status 
= decode_attr_type(xdr, bitmap, &type)) != 0) + status = decode_attr_fs_locations(xdr, bitmap, fs_loc); + if (status < 0) goto xdr_error; - fattr->type = nfs_type2fmt[type].nfs2type; - fmode = nfs_type2fmt[type].mode; + fattr->valid |= status; - if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0) + status = decode_attr_mode(xdr, bitmap, &fmode); + if (status < 0) goto xdr_error; - if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) + if (status != 0) { + fattr->mode |= fmode; + fattr->valid |= status; + } + + status = decode_attr_nlink(xdr, bitmap, &fattr->nlink); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) + fattr->valid |= status; + + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) + fattr->valid |= status; + + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name); + if (status < 0) goto xdr_error; - if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, - struct nfs4_fs_locations, - fattr))) != 0) + fattr->valid |= status; + + status = decode_attr_rdev(xdr, bitmap, &fattr->rdev); + if (status < 0) goto xdr_error; - if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) + fattr->valid |= status; + + status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used); + if (status < 0) goto xdr_error; - fattr->mode |= fmode; - if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) + fattr->valid |= status; + + status = decode_attr_time_access(xdr, bitmap, &fattr->atime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) + fattr->valid |= status; + + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_group(xdr, bitmap, 
server->nfs_client, &fattr->gid)) != 0) + fattr->valid |= status; + + status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime); + if (status < 0) goto xdr_error; - if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) + fattr->valid |= status; + + status = decode_attr_mounted_on_fileid(xdr, bitmap, &fattr->mounted_on_fileid); + if (status < 0) goto xdr_error; - if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0) + fattr->valid |= status; + + status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0) + + if (label) { + status = decode_attr_security_label(xdr, bitmap, label); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + } + +xdr_error: + dprintk("%s: xdr returned %d\n", __func__, -status); + return status; +} + +static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, + struct nfs4_label *label, const struct nfs_server *server) +{ + unsigned int savep; + uint32_t attrlen, + bitmap[3] = {0}; + int status; + + status = decode_op_hdr(xdr, OP_GETATTR); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0) + + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) goto xdr_error; - if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) + + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) goto xdr_error; - if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) + + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, + label, server); + if (status < 0) goto xdr_error; - if (fattr->fileid == 0 && fileid != 0) - fattr->fileid = fileid; - if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) - fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; + + status = 
verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d\n", __func__, -status); return status; } +static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs4_label *label, const struct nfs_server *server) +{ + return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server); +} + +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server) +{ + return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server); +} + +/* + * Decode potentially multiple layout types. Currently we only support + * one layout driver per file system. + */ +static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, + uint32_t *layouttype) +{ + __be32 *p; + int num; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + + /* pNFS is not supported by the underlying file system */ + if (num == 0) { + *layouttype = 0; + return 0; + } + if (num > 1) + printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " + "drivers per filesystem not supported\n", __func__); + + /* Decode and set first layout type, move xdr->p past unused types */ + p = xdr_inline_decode(xdr, num * 4); + if (unlikely(!p)) + goto out_overflow; + *layouttype = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * The type of file system exported. + * Note we must ensure that layouttype is set in any non-error case. 
+ */ +static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *layouttype) +{ + int status = 0; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[1]); + if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) + return -EIO; + if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { + status = decode_first_pnfs_layout_type(xdr, layouttype); + bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; + } else + *layouttype = 0; + return status; +} + +/* + * The prefered block size for layout directed io + */ +static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *res) +{ + __be32 *p; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); + *res = 0; + if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) { + print_overflow_msg(__func__, xdr); + return -EIO; + } + *res = be32_to_cpup(p); + bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; + } + return 0; +} static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { - __be32 *savep; - uint32_t attrlen, bitmap[2]; + unsigned int savep; + uint32_t attrlen, bitmap[3]; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3221,10 +4806,19 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) goto xdr_error; fsinfo->wtpref = fsinfo->wtmax; + status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); + if (status != 0) + goto xdr_error; + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status != 0) + goto xdr_error; + status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); + if (status) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: - dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status); + dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } @@ -3241,20 +4835,27 @@ static int decode_getfh(struct xdr_stream *xdr, 
struct nfs_fh *fh) if (status) return status; - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; - READ_BUF(len); - COPYMEM(fh->data, len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) { int status; - + status = decode_op_hdr(xdr, OP_LINK); if (status) return status; @@ -3270,11 +4871,13 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) __be32 *p; uint32_t namelen, type; - READ_BUF(32); - READ64(offset); - READ64(length); - READ32(type); - if (fl != NULL) { + p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ + p = xdr_decode_hyper(p, &length); + type = be32_to_cpup(p++); /* 4 byte read */ + if (fl != NULL) { /* manipulate file lock */ fl->fl_start = (loff_t)offset; fl->fl_end = fl->fl_start + (loff_t)length - 1; if (length == ~(uint64_t)0) @@ -3284,23 +4887,33 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_type = F_RDLCK; fl->fl_pid = 0; } - READ64(clientid); - READ32(namelen); - READ_BUF(namelen); - return -NFS4ERR_DENIED; + p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ + namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ + p = xdr_inline_decode(xdr, namelen); /* variable size field */ + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCK); + if (status == -EIO) + goto out; if (status == 0) { - 
READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; } else if (status == -NFS4ERR_DENIED) - return decode_lock_denied(xdr, NULL); + status = decode_lock_denied(xdr, NULL); + if (res->open_seqid != NULL) + nfs_increment_open_seqid(status, res->open_seqid); + nfs_increment_lock_seqid(status, res->lock_seqid); +out: return status; } @@ -3315,17 +4928,21 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCKU); - if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - } + if (status != -EIO) + nfs_increment_lock_seqid(status, res->seqid); + if (status == 0) + status = decode_stateid(xdr, &res->stateid); return status; } +static int decode_release_lockowner(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); +} + static int decode_lookup(struct xdr_stream *xdr) { return decode_op_hdr(xdr, OP_LOOKUP); @@ -3334,106 +4951,130 @@ static int decode_lookup(struct xdr_stream *xdr) /* This is too sick! 
*/ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) { - __be32 *p; + __be32 *p; uint32_t limit_type, nblocks, blocksize; - READ_BUF(12); - READ32(limit_type); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + limit_type = be32_to_cpup(p++); switch (limit_type) { - case 1: - READ64(*maxsize); - break; - case 2: - READ32(nblocks); - READ32(blocksize); - *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; + case 1: + xdr_decode_hyper(p, maxsize); + break; + case 2: + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); + *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) { - __be32 *p; - uint32_t delegation_type; + __be32 *p; + uint32_t delegation_type; + int status; - READ_BUF(4); - READ32(delegation_type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { res->delegation_type = 0; return 0; } - READ_BUF(NFS4_STATEID_SIZE+4); - COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); - READ32(res->do_recall); + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->do_recall = be32_to_cpup(p); + switch (delegation_type) { - case NFS4_OPEN_DELEGATE_READ: - res->delegation_type = FMODE_READ; - break; - case NFS4_OPEN_DELEGATE_WRITE: - res->delegation_type = FMODE_WRITE|FMODE_READ; - if (decode_space_limit(xdr, &res->maxsize) < 0) + case NFS4_OPEN_DELEGATE_READ: + res->delegation_type = FMODE_READ; + break; + case NFS4_OPEN_DELEGATE_WRITE: + res->delegation_type = FMODE_WRITE|FMODE_READ; + if (decode_space_limit(xdr, &res->maxsize) < 0) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + 
print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) { - __be32 *p; + __be32 *p; uint32_t savewords, bmlen, i; - int status; + int status; - status = decode_op_hdr(xdr, OP_OPEN); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + if (!__decode_op_hdr(xdr, OP_OPEN, &status)) + return status; + nfs_increment_open_seqid(status, res->seqid); + if (status) + return status; + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + return status; - decode_change_info(xdr, &res->cinfo); + decode_change_info(xdr, &res->cinfo); - READ_BUF(8); - READ32(res->rflags); - READ32(bmlen); - if (bmlen > 10) - goto xdr_error; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); + if (bmlen > 10) + goto xdr_error; - READ_BUF(bmlen << 2); + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) - READ32(res->attrset[i]); + res->attrset[i] = be32_to_cpup(p++); for (; i < NFS4_BITMAP_SIZE; i++) res->attrset[i] = 0; return decode_delegation(xdr, res); xdr_error: - dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen); + dprintk("%s: Bitmap too large! 
Length = %u\n", __func__, bmlen); + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) { - __be32 *p; int status; - status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_putfh(struct xdr_stream *xdr) @@ -3446,119 +5087,57 @@ static int decode_putrootfh(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_PUTROOTFH); } -static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs_pgio_res *res) { - struct kvec *iov = req->rq_rcv_buf.head; __be32 *p; - uint32_t count, eof, recvd, hdrlen; + uint32_t count, eof, recvd; int status; status = decode_op_hdr(xdr, OP_READ); if (status) return status; - READ_BUF(8); - READ32(eof); - READ32(count); - hdrlen = (u8 *) p - (u8 *) iov->iov_base; - recvd = req->rq_rcv_buf.len - hdrlen; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + eof = be32_to_cpup(p++); + count = be32_to_cpup(p); + recvd = xdr_read_pages(xdr, count); if (count > recvd) { dprintk("NFS: server cheating in read reply: " "count %u > recvd %u\n", count, recvd); 
count = recvd; eof = 0; } - xdr_read_pages(xdr, count); res->eof = eof; res->count = count; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) { - struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct page *page = *rcvbuf->pages; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; - u32 recvd, pglen = rcvbuf->page_len; - __be32 *end, *entry, *p, *kaddr; - unsigned int nr; int status; + __be32 verf[2]; status = decode_op_hdr(xdr, OP_READDIR); - if (status) + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) return status; - READ_BUF(8); - COPYMEM(readdir->verifier.data, 8); - dprintk("%s: verifier = 0x%x%x\n", - __FUNCTION__, - ((u32 *)readdir->verifier.data)[0], - ((u32 *)readdir->verifier.data)[1]); - - - hdrlen = (char *) p - (char *) iov->iov_base; - recvd = rcvbuf->len - hdrlen; - if (pglen > recvd) - pglen = recvd; - xdr_read_pages(xdr, pglen); - - BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); - kaddr = p = kmap_atomic(page, KM_USER0); - end = p + ((pglen + readdir->pgbase) >> 2); - entry = p; - for (nr = 0; *p++; nr++) { - u32 len, attrlen, xlen; - if (end - p < 3) - goto short_pkt; - dprintk("cookie = %Lu, ", *((unsigned long long *)p)); - p += 2; /* cookie */ - len = ntohl(*p++); /* filename length */ - if (len > NFS4_MAXNAMLEN) { - dprintk("NFS: giant filename in readdir (len 0x%x)\n", - len); - goto err_unmap; - } - xlen = XDR_QUADLEN(len); - if (end - p < xlen + 1) - goto short_pkt; - dprintk("filename = %*s\n", len, (char *)p); - p += xlen; - len = ntohl(*p++); /* bitmap length */ - if (end - p < len + 1) - goto short_pkt; - p += len; - attrlen = XDR_QUADLEN(ntohl(*p++)); - if (end - p < attrlen + 2) - goto short_pkt; - p += attrlen; /* attributes */ - entry = p; - } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; -out: - kunmap_atomic(kaddr, KM_USER0); - 
return 0; -short_pkt: - dprintk("%s: short packet at entry %d\n", __FUNCTION__, nr); - entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } - goto out; -err_unmap: - kunmap_atomic(kaddr, KM_USER0); - return -errno_NFSERR_IO; + memcpy(verf, readdir->verifier.data, sizeof(verf)); + dprintk("%s: verifier = %08x:%08x\n", + __func__, verf[0], verf[1]); + return xdr_read_pages(xdr, xdr->buf->page_len); } static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) { struct xdr_buf *rcvbuf = &req->rq_rcv_buf; - struct kvec *iov = rcvbuf->head; - size_t hdrlen; u32 len, recvd; __be32 *p; - char *kaddr; int status; status = decode_op_hdr(xdr, OP_READLINK); @@ -3566,20 +5145,20 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) return status; /* Convert length of symlink */ - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; } - hdrlen = (char *) xdr->p - (char *) iov->iov_base; - recvd = req->rq_rcv_buf.len - hdrlen; + recvd = xdr_read_pages(xdr, len); if (recvd < len) { dprintk("NFS: server cheating in readlink reply: " "count %u > recvd %u\n", len, recvd); return -EIO; } - xdr_read_pages(xdr, len); /* * The XDR encode routine has set things up so that * the link text will be copied directly into the @@ -3587,10 +5166,11 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) * and and null-terminate the text (the VFS expects * null-termination). 
*/ - kaddr = (char *)kmap_atomic(rcvbuf->pages[0], KM_USER0); - kaddr[len+rcvbuf->page_base] = '\0'; - kunmap_atomic(kaddr, KM_USER0); + xdr_terminate_string(rcvbuf, len); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3632,17 +5212,23 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - size_t *acl_len) + struct nfs_getaclres *res) { - __be32 *savep; + unsigned int savep; uint32_t attrlen, - bitmap[2] = {0}; - struct kvec *iov = req->rq_rcv_buf.head; + bitmap[3] = {0}; int status; + unsigned int pg_offset; - *acl_len = 0; + res->acl_len = 0; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto out; + + xdr_enter_page(xdr, xdr->buf->page_len); + + /* Calculate the offset of the page data */ + pg_offset = xdr->buf->head[0].iov_len; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto out; if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) @@ -3651,21 +5237,20 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { - size_t hdrlen; - u32 recvd; - - /* We ignore &savep and don't do consistency checks on - * the attr length. Let userspace figure it out.... 
*/ - hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; - recvd = req->rq_rcv_buf.len - hdrlen; - if (attrlen > recvd) { - dprintk("NFS: server cheating in getattr" - " acl reply: attrlen %u > recvd %u\n", - attrlen, recvd); - return -EINVAL; + + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset; + res->acl_len = attrlen; + + /* Check for receive buffer overflow */ + if (res->acl_len > (xdr->nwords << 2) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; + dprintk("NFS: acl reply: attrlen %u > page_len %u\n", + attrlen, xdr->nwords << 2); } - xdr_read_pages(xdr, attrlen); - *acl_len = attrlen; } else status = -EOPNOTSUPP; @@ -3679,57 +5264,77 @@ decode_savefh(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_SAVEFH); } -static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res) +static int decode_setattr(struct xdr_stream *xdr) { __be32 *p; uint32_t bmlen; int status; - status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) +static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) { __be32 *p; uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" - " %d\n", opnum); + " %d\n", opnum); return -EIO; } - READ32(nfserr); + 
nfserr = be32_to_cpup(p); if (nfserr == NFS_OK) { - READ_BUF(8 + NFS4_VERIFIER_SIZE); - READ64(clp->cl_clientid); - COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->clientid); + memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; /* skip netid string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; /* skip uaddr string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; return -NFSERR_CLID_INUSE; } else - return -nfs4_stat_to_errno(nfserr); + return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -3737,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); } -static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) { __be32 *p; int status; @@ -3746,11 +5351,15 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) if (status) return status; - READ_BUF(16); - READ32(res->count); - READ32(res->verf->committed); - COPYMEM(res->verf->verifier, 8); - return 0; + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); + return decode_write_verifier(xdr, &res->verf->verifier); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int 
decode_delegreturn(struct xdr_stream *xdr) @@ -3758,53 +5367,724 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } +static int decode_secinfo_gss(struct xdr_stream *xdr, + struct nfs4_secinfo4 *flavor) +{ + u32 oid_len; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + oid_len = be32_to_cpup(p); + if (oid_len > GSS_OID_MAX_LEN) + goto out_err; + + p = xdr_inline_decode(xdr, oid_len); + if (unlikely(!p)) + goto out_overflow; + memcpy(flavor->flavor_info.oid.data, p, oid_len); + flavor->flavor_info.oid.len = oid_len; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + flavor->flavor_info.qop = be32_to_cpup(p++); + flavor->flavor_info.service = be32_to_cpup(p); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_err: + return -EINVAL; +} + +static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + struct nfs4_secinfo4 *sec_flavor; + unsigned int i, num_flavors; + int status; + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + + res->flavors->num_flavors = 0; + num_flavors = be32_to_cpup(p); + + for (i = 0; i < num_flavors; i++) { + sec_flavor = &res->flavors->flavors[i]; + if ((char *)&sec_flavor[1] - (char *)res->flavors > PAGE_SIZE) + break; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sec_flavor->flavor = be32_to_cpup(p); + + if (sec_flavor->flavor == RPC_AUTH_GSS) { + status = decode_secinfo_gss(xdr, sec_flavor); + if (status) + goto out; + } + res->flavors->num_flavors++; + } + + status = 0; +out: + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + int status = decode_op_hdr(xdr, OP_SECINFO); + if (status) + return status; + return decode_secinfo_common(xdr, res); +} + +#if 
defined(CONFIG_NFS_V4_1) +static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + int status = decode_op_hdr(xdr, OP_SECINFO_NO_NAME); + if (status) + return status; + return decode_secinfo_common(xdr, res); +} + +static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) +{ + __be32 *p; + uint32_t bitmap_words; + unsigned int i; + + p = xdr_inline_decode(xdr, 4); + bitmap_words = be32_to_cpup(p++); + if (bitmap_words > NFS4_OP_MAP_NUM_WORDS) + return -EIO; + p = xdr_inline_decode(xdr, 4 * bitmap_words); + for (i = 0; i < bitmap_words; i++) + op_map->u.words[i] = be32_to_cpup(p++); + + return 0; +} + +static int decode_exchange_id(struct xdr_stream *xdr, + struct nfs41_exchange_id_res *res) +{ + __be32 *p; + uint32_t dummy; + char *dummy_str; + int status; + uint32_t impl_id_count; + + status = decode_op_hdr(xdr, OP_EXCHANGE_ID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &res->clientid); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + res->seqid = be32_to_cpup(p++); + res->flags = be32_to_cpup(p++); + + res->state_protect.how = be32_to_cpup(p); + switch (res->state_protect.how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + status = decode_op_map(xdr, &res->state_protect.enforce); + if (status) + return status; + status = decode_op_map(xdr, &res->state_protect.allow); + if (status) + return status; + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + /* server_owner4.so_minor_id */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->server_owner->minor_id); + + /* server_owner4.so_major_id */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->server_owner->major_id, dummy_str, dummy); + 
res->server_owner->major_id_sz = dummy; + + /* server_scope4 */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->server_scope->server_scope, dummy_str, dummy); + res->server_scope->server_scope_sz = dummy; + + /* Implementation Id */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + impl_id_count = be32_to_cpup(p++); + + if (impl_id_count) { + /* nii_domain */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->impl_id->domain, dummy_str, dummy); + + /* nii_name */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->impl_id->name, dummy_str, dummy); + + /* nii_date */ + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->impl_id->date.seconds); + res->impl_id->date.nseconds = be32_to_cpup(p); + + /* if there's more than one entry, ignore the rest */ + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_chan_attrs(struct xdr_stream *xdr, + struct nfs4_channel_attrs *attrs) +{ + __be32 *p; + u32 nr_attrs, val; + + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + val = be32_to_cpup(p++); /* headerpadsz */ + if (val) + return -EINVAL; /* no support for header padding yet */ + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); + if (unlikely(nr_attrs > 1)) { + printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs " + "count %u\n", __func__, nr_attrs); + return -EINVAL; + } + 
if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); +} + +static int decode_bind_conn_to_session(struct xdr_stream *xdr, + struct nfs41_bind_conn_to_session_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION); + if (!status) + status = decode_sessionid(xdr, &res->session->sess_id); + if (unlikely(status)) + return status; + + /* dir flags, rdma mode bool */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + + res->dir = be32_to_cpup(p++); + if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH) + return -EIO; + if (be32_to_cpup(p) == 0) + res->use_conn_in_rdma_mode = false; + else + res->use_conn_in_rdma_mode = true; + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_create_session(struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + __be32 *p; + int status; + struct nfs_client *clp = res->client; + struct nfs4_session *session = clp->cl_session; + + status = decode_op_hdr(xdr, OP_CREATE_SESSION); + if (!status) + status = decode_sessionid(xdr, &session->sess_id); + if (unlikely(status)) + return status; + + /* seqid, flags */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + session->flags = be32_to_cpup(p); + + /* Channel attributes */ + status = decode_chan_attrs(xdr, &session->fc_attrs); + if (!status) + status = decode_chan_attrs(xdr, &session->bc_attrs); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_DESTROY_SESSION); 
+} + +static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_DESTROY_CLIENTID); +} + +static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE); +} +#endif /* CONFIG_NFS_V4_1 */ + +static int decode_sequence(struct xdr_stream *xdr, + struct nfs4_sequence_res *res, + struct rpc_rqst *rqstp) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs4_session *session; + struct nfs4_sessionid id; + u32 dummy; + int status; + __be32 *p; + + if (res->sr_slot == NULL) + return 0; + if (!res->sr_slot->table->session) + return 0; + + status = decode_op_hdr(xdr, OP_SEQUENCE); + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) + goto out_err; + + /* + * If the server returns different values for sessionID, slotID or + * sequence number, the server is looney tunes. + */ + status = -EREMOTEIO; + session = res->sr_slot->table->session; + + if (memcmp(id.data, session->sess_id.data, + NFS4_MAX_SESSIONID_LEN)) { + dprintk("%s Invalid session id\n", __func__); + goto out_err; + } + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + + /* seqid */ + dummy = be32_to_cpup(p++); + if (dummy != res->sr_slot->seq_nr) { + dprintk("%s Invalid sequence number\n", __func__); + goto out_err; + } + /* slot id */ + dummy = be32_to_cpup(p++); + if (dummy != res->sr_slot->slot_nr) { + dprintk("%s Invalid slot id\n", __func__); + goto out_err; + } + /* highest slot id */ + res->sr_highest_slotid = be32_to_cpup(p++); + /* target highest slot id */ + res->sr_target_highest_slotid = be32_to_cpup(p++); + /* result flags */ + res->sr_status_flags = be32_to_cpup(p); + status = 0; +out_err: + res->sr_status = status; + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; +#else /* CONFIG_NFS_V4_1 */ + return 0; +#endif /* CONFIG_NFS_V4_1 */ +} + +#if defined(CONFIG_NFS_V4_1) /* - * Decode OPEN_DOWNGRADE 
response + * TODO: Need to handle case when EOF != true; */ -static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) -{ - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_open_downgrade(&xdr, res); - if (status != 0) +static int decode_getdevicelist(struct xdr_stream *xdr, + struct pnfs_devicelist *res) +{ + __be32 *p; + int status, i; + nfs4_verifier verftemp; + + status = decode_op_hdr(xdr, OP_GETDEVICELIST); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + /* TODO: Skip cookie for now */ + p += 2; + + /* Read verifier */ + p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE); + + res->num_devs = be32_to_cpup(p); + + dprintk("%s: num_dev %d\n", __func__, res->num_devs); + + if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { + printk(KERN_ERR "NFS: %s too many result dev_num %u\n", + __func__, res->num_devs); + return -EIO; + } + + p = xdr_inline_decode(xdr, + res->num_devs * NFS4_DEVICEID4_SIZE + 4); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < res->num_devs; i++) + p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, + NFS4_DEVICEID4_SIZE); + res->eof = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +{ + __be32 *p; + uint32_t len, type; + int status; + + status = decode_op_hdr(xdr, OP_GETDEVICEINFO); + if (status) { + if (status == -ETOOSMALL) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pdev->mincount = be32_to_cpup(p); + dprintk("%s: Min count too small. 
mincnt = %u\n", + __func__, pdev->mincount); + } + return status; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + type = be32_to_cpup(p++); + if (type != pdev->layout_type) { + dprintk("%s: layout mismatch req: %u pdev: %u\n", + __func__, pdev->layout_type, type); + return -EINVAL; + } + /* + * Get the length of the opaque device_addr4. xdr_read_pages places + * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages) + * and places the remaining xdr data in xdr_buf->tail + */ + pdev->mincount = be32_to_cpup(p); + if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) + goto out_overflow; + + /* Parse notification bitmap, verifying that it is zero. */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + if (len) { + uint32_t i; + + p = xdr_inline_decode(xdr, 4 * len); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < len; i++, p++) { + if (be32_to_cpup(p)) { + dprintk("%s: notifications not supported\n", + __func__); + return -EIO; + } + } + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + __be32 *p; + int status; + u32 layout_count; + u32 recvd; + + status = decode_op_hdr(xdr, OP_LAYOUTGET); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->return_on_close = be32_to_cpup(p); + decode_stateid(xdr, &res->stateid); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + layout_count = be32_to_cpup(p); + if (!layout_count) { + dprintk("%s: server responded with empty layout array\n", + __func__); + return -EINVAL; + } + + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &res->range.offset); + p = xdr_decode_hyper(p, &res->range.length); + res->range.iomode = be32_to_cpup(p++); + 
res->type = be32_to_cpup(p++); + res->layoutp->len = be32_to_cpup(p); + + dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", + __func__, + (unsigned long)res->range.offset, + (unsigned long)res->range.length, + res->range.iomode, + res->type, + res->layoutp->len); + + recvd = xdr_read_pages(xdr, res->layoutp->len); + if (res->layoutp->len > recvd) { + dprintk("NFS: server cheating in layoutget reply: " + "layout len %u > recvd %u\n", + res->layoutp->len, recvd); + return -EINVAL; + } + + if (layout_count > 1) { + /* We only handle a length one array at the moment. Any + * further entries are just ignored. Note that this means + * the client may see a response that is less than the + * minimum it requested. + */ + dprintk("%s: server responded with %d layouts, dropping tail\n", + __func__, layout_count); + } + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutreturn(struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTRETURN); + if (status) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->lrs_present = be32_to_cpup(p); + if (res->lrs_present) + status = decode_stateid(xdr, &res->stateid); + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_layoutcommit(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct nfs4_layoutcommit_res *res) +{ + __be32 *p; + __u32 sizechanged; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + res->status = status; + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sizechanged = be32_to_cpup(p); + + if (sizechanged) { + /* throw away new size */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + 
+static int decode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + __be32 *p; + int status; + int num_res; + + status = decode_op_hdr(xdr, OP_TEST_STATEID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num_res = be32_to_cpup(p++); + if (num_res != 1) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->status = be32_to_cpup(p++); + + return status; +out_overflow: + print_overflow_msg(__func__, xdr); out: - return status; + return -EIO; +} + +static int decode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + res->status = decode_op_hdr(xdr, OP_FREE_STATEID); + return res->status; } +#endif /* CONFIG_NFS_V4_1 */ /* * END OF "GENERIC" DECODE ROUTINES. */ /* + * Decode OPEN_DOWNGRADE response + */ +static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_closeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open_downgrade(xdr, res); + if (status != 0) + goto out; + decode_getfattr(xdr, res->fattr, res->server); +out: + return status; +} + +/* * Decode ACCESS response */ -static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) +static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_accessres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) goto out; - 
status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_access(&xdr, res); + status = decode_access(xdr, &res->supported, &res->access); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -3812,22 +6092,28 @@ out: /* * Decode LOOKUP response */ -static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) +static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_lookup_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_lookup(&xdr)) != 0) + status = decode_lookup(xdr); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) != 0) + status = decode_getfh(xdr, res->fh); + if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -3835,19 +6121,26 @@ out: /* * Decode LOOKUP_ROOT response */ -static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res) +static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_lookup_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) + + status = decode_compound_hdr(xdr, &hdr); + if (status) goto out; - if ((status = decode_putrootfh(&xdr)) != 0) + status = decode_sequence(xdr, &res->seq_res, 
rqstp); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) == 0) - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status == 0) + status = decode_getfattr_label(xdr, res->fattr, + res->label, res->server); out: return status; } @@ -3855,20 +6148,22 @@ out: /* * Decode REMOVE response */ -static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_removeres *res) +static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_removeres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) + + status = decode_compound_hdr(xdr, &hdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) goto out; - if ((status = decode_remove(&xdr, &res->cinfo)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - decode_getfattr(&xdr, &res->dir_attr, res->server); + status = decode_remove(xdr, &res->cinfo); out: return status; } @@ -3876,29 +6171,28 @@ out: /* * Decode RENAME response */ -static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res) +static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_renameres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) - goto out; - if ((status = decode_putfh(&xdr)) != 0) - goto out; - if ((status = decode_savefh(&xdr)) != 0) + + status = decode_compound_hdr(xdr, &hdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) goto out; - if ((status = decode_rename(&xdr, &res->old_cinfo, 
&res->new_cinfo)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - /* Current FH is target directory */ - if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) + status = decode_savefh(xdr); + if (status) goto out; - if ((status = decode_restorefh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - decode_getfattr(&xdr, res->old_fattr, res->server); + status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo); out: return status; } @@ -3906,32 +6200,38 @@ out: /* * Decode LINK response */ -static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res) +static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_link_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) + + status = decode_compound_hdr(xdr, &hdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_savefh(&xdr)) != 0) + status = decode_savefh(xdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if ((status = decode_link(&xdr, &res->cinfo)) != 0) + status = decode_link(xdr, &res->cinfo); + if (status) goto out; /* * Note order: OP_LINK leaves the directory as the current * filehandle. 
*/ - if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) - goto out; - if ((status = decode_restorefh(&xdr)) != 0) + status = decode_restorefh(xdr); + if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -3939,28 +6239,28 @@ out: /* * Decode CREATE response */ -static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) +static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) - goto out; - if ((status = decode_putfh(&xdr)) != 0) - goto out; - if ((status = decode_savefh(&xdr)) != 0) + + status = decode_compound_hdr(xdr, &hdr); + if (status) goto out; - if ((status = decode_create(&xdr,&res->dir_cinfo)) != 0) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) goto out; - if ((status = decode_getfh(&xdr, res->fh)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if (decode_getfattr(&xdr, res->fattr, res->server) != 0) + status = decode_create(xdr, &res->dir_cinfo); + if (status) goto out; - if ((status = decode_restorefh(&xdr)) != 0) + status = decode_getfh(xdr, res->fh); + if (status) goto out; - decode_getfattr(&xdr, res->dir_fattr, res->server); + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -3968,72 +6268,72 @@ out: /* * Decode SYMLINK response */ -static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res) +static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_create_res *res) { - return nfs4_xdr_dec_create(rqstp, p, res); + return nfs4_xdr_dec_create(rqstp, xdr, res); } /* * Decode GETATTR response */ -static int nfs4_xdr_dec_getattr(struct 
rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res) +static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs4_getattr_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; - } /* * Encode an SETACL request */ -static int -nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) -{ - struct xdr_stream xdr; - struct compound_hdr hdr = { - .nops = 2, - }; - int status; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_setacl(&xdr, args); -out: - return status; +static void nfs4_xdr_enc_setacl(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs_setaclargs *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setacl(xdr, args, &hdr); + encode_nops(&hdr); } + /* * Decode SETACL response */ static int -nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res) +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_setaclres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + 
status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_setattr(&xdr, res); + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setattr(xdr); out: return status; } @@ -4042,20 +6342,26 @@ out: * Decode GETACL response */ static int -nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len) +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_getaclres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + if (res->acl_scratch != NULL) { + void *p = page_address(res->acl_scratch); + xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); + } + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_getacl(&xdr, rqstp, acl_len); + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getacl(xdr, rqstp, res); out: return status; @@ -4064,20 +6370,22 @@ out: /* * Decode CLOSE response */ -static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) -{ - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_close(&xdr, res); +static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_closeres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_close(xdr, res); if (status != 0) goto out; /* @@ -4086,132 +6394,138 @@ static int 
nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. */ - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(xdr, res->fattr, res->server); out: - return status; + return status; } /* * Decode OPEN response */ -static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) +static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_openres *res) { - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; + struct compound_hdr hdr; + int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_savefh(&xdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_open(&xdr, res); - if (status) - goto out; - if (decode_getfh(&xdr, &res->fh) != 0) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) goto out; - if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) + status = decode_putfh(xdr); + if (status) goto out; - if (decode_restorefh(&xdr) != 0) + status = decode_open(xdr, res); + if (status) + goto out; + status = decode_getfh(xdr, &res->fh); + if (status) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server); + if (res->access_request) + decode_access(xdr, &res->access_supported, &res->access_result); + decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server); out: - return status; + return status; } /* * Decode OPEN_CONFIRM response */ -static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) -{ - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - 
goto out; - status = decode_open_confirm(&xdr, res); +static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_open_confirmres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open_confirm(xdr, res); out: - return status; + return status; } /* * Decode OPEN response */ -static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) -{ - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_open(&xdr, res); - if (status) - goto out; - decode_getfattr(&xdr, res->f_attr, res->server); +static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_openres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_open(xdr, res); + if (status) + goto out; + if (res->access_request) + decode_access(xdr, &res->access_supported, &res->access_result); + decode_getfattr(xdr, res->f_attr, res->server); out: - return status; + return status; } /* * Decode SETATTR response */ -static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) -{ - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_setattr(&xdr, res); - if (status) - goto out; - status = decode_getfattr(&xdr, 
res->fattr, res->server); - if (status == NFS4ERR_DELAY) - status = 0; +static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs_setattrres *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setattr(xdr); + if (status) + goto out; + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: - return status; + return status; } /* * Decode LOCK response */ -static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res) +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lock_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_lock(&xdr, res); + status = decode_lock(xdr, res); out: return status; } @@ -4219,20 +6533,22 @@ out: /* * Decode LOCKT response */ -static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res) +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_lockt_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_lockt(&xdr, res); + status = 
decode_lockt(xdr, res); out: return status; } @@ -4240,41 +6556,58 @@ out: /* * Decode LOCKU response */ -static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res) +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_locku_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_locku(&xdr, res); + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_locku(xdr, res); out: return status; } +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *dummy) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_release_lockowner(xdr); + return status; +} + /* * Decode READLINK response */ -static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res) +static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_readlink_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_readlink(&xdr, rqstp); + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_readlink(xdr, rqstp); out: return status; } @@ -4282,20 +6615,22 @@ out: /* * Decode READDIR response */ -static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res) +static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, struct 
xdr_stream *xdr, + struct nfs4_readdir_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_readdir(&xdr, rqstp, res); + status = decode_readdir(xdr, rqstp, res); out: return status; } @@ -4303,20 +6638,22 @@ out: /* * Decode Read response */ -static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res) +static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_pgio_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_read(&xdr, rqstp, res); + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_read(xdr, rqstp, res); if (!status) status = res->count; out: @@ -4326,23 +6663,26 @@ out: /* * Decode WRITE response */ -static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) +static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_pgio_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (status) goto out; - status = decode_write(&xdr, res); + status = 
decode_write(xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server); if (!status) status = res->count; out: @@ -4352,98 +6692,103 @@ out: /* * Decode COMMIT response */ -static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res) +static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + struct nfs_commitres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; - status = decode_putfh(&xdr); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status) goto out; - status = decode_commit(&xdr, res); + status = decode_putfh(xdr); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + status = decode_commit(xdr, res); out: return status; } /* - * FSINFO request + * Decode FSINFO response */ -static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) +static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_fsinfo_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_putfh(&xdr); + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_fsinfo(&xdr, fsinfo); + status = decode_putfh(xdr); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = decode_fsinfo(xdr, res->fsinfo); return status; } /* - * PATHCONF request + * Decode PATHCONF response */ -static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf) +static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_pathconf_res 
*res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (!status) - status = decode_pathconf(&xdr, pathconf); + status = decode_pathconf(xdr, res->pathconf); return status; } /* - * STATFS request + * Decode STATFS response */ -static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat) +static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, + struct nfs4_statfs_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, req); if (!status) - status = decode_putfh(&xdr); + status = decode_putfh(xdr); if (!status) - status = decode_statfs(&xdr, fsstat); + status = decode_statfs(xdr, res->fsstat); return status; } /* - * GETATTR_BITMAP request + * Decode GETATTR_BITMAP response */ -static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res) +static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_server_caps_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) + status = decode_compound_hdr(xdr, &hdr); + if (status) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_sequence(xdr, &res->seq_res, req); + if (status) goto out; - status = decode_server_caps(&xdr, res); + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_server_caps(xdr, res); out: return status; } @@ -4451,122 
+6796,539 @@ out: /* * Decode RENEW response */ -static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *__unused) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_renew(&xdr); + status = decode_renew(xdr); return status; } /* - * a SETCLIENTID request + * Decode SETCLIENTID response */ -static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, - struct nfs_client *clp) +static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_setclientid_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_setclientid(&xdr, clp); + status = decode_setclientid(xdr, res); + return status; +} + +/* + * Decode SETCLIENTID_CONFIRM response + */ +static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, + struct xdr_stream *xdr) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = decode_setclientid_confirm(xdr); return status; } /* - * a SETCLIENTID_CONFIRM request + * Decode DELEGRETURN response */ -static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo) +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_delegreturnres *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = 
decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server); + if (status != 0) + goto out; + status = decode_delegreturn(xdr); +out: + return status; +} + +/* + * Decode FS_LOCATIONS response + */ +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_fs_locations_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + if (res->migration) { + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, + NULL, res->fs_locations, + NULL, res->fs_locations->server); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); + } else { + status = decode_lookup(xdr); + if (status) + goto out; + xdr_enter_page(xdr, PAGE_SIZE); + status = decode_getfattr_generic(xdr, + &res->fs_locations->fattr, + NULL, res->fs_locations, + NULL, res->fs_locations->server); + } +out: + return status; +} + +/* + * Decode SECINFO response + */ +static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); +out: + return status; +} + +/* + * Decode FSID_PRESENT response + */ +static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_fsid_present_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = 
decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getfh(xdr, res->fh); + if (status) + goto out; + if (res->renew) + status = decode_renew(xdr); +out: + return status; +} + +#if defined(CONFIG_NFS_V4_1) +/* + * Decode BIND_CONN_TO_SESSION response + */ +static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_setclientid_confirm(&xdr); + status = decode_bind_conn_to_session(xdr, res); + return status; +} + +/* + * Decode EXCHANGE_ID response + */ +static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_putrootfh(&xdr); + status = decode_exchange_id(xdr, res); + return status; +} + +/* + * Decode CREATE_SESSION response + */ +static int nfs4_xdr_dec_create_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_create_session_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = decode_fsinfo(&xdr, fsinfo); + status = decode_create_session(xdr, res); + return status; +} + +/* + * Decode DESTROY_SESSION response + */ +static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = decode_destroy_session(xdr, res); return status; } /* - * DELEGRETURN request + * Decode DESTROY_CLIENTID response + */ +static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = 
decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_destroy_clientid(xdr, res); + return status; +} + +/* + * Decode SEQUENCE response */ -static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res) +static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_sequence_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, res, rqstp); + return status; +} + +/* + * Decode GET_LEASE_TIME response + */ +static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_get_lease_time_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->lr_seq_res, rqstp); + if (!status) + status = decode_putrootfh(xdr); + if (!status) + status = decode_fsinfo(xdr, res->lr_fsinfo); + return status; +} + +/* + * Decode RECLAIM_COMPLETE response + */ +static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_reclaim_complete_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (!status) + status = decode_reclaim_complete(xdr, (void *)NULL); + return status; +} + +/* + * Decode GETDEVICELIST response + */ +static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_getdevicelist_res *res) +{ + struct compound_hdr hdr; + int status; + + dprintk("encoding getdevicelist!\n"); + + status = decode_compound_hdr(xdr, &hdr); if (status != 0) goto out; - status = decode_putfh(&xdr); + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - 
status = decode_delegreturn(&xdr); - decode_getfattr(&xdr, res->fattr, res->server); + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_getdevicelist(xdr, res->devlist); out: return status; } /* - * FS_LOCATIONS request + * Decode GETDEVINFO response */ -static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res) +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_getdeviceinfo_res *res) { - struct xdr_stream xdr; struct compound_hdr hdr; int status; - xdr_init_decode(&xdr, &req->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); + status = decode_compound_hdr(xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - if ((status = decode_putfh(&xdr)) != 0) + status = decode_getdeviceinfo(xdr, res->pdev); +out: + return status; +} + +/* + * Decode LAYOUTGET response + */ +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutget_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) goto out; - if ((status = decode_lookup(&xdr)) != 0) + status = decode_putfh(xdr); + if (status) goto out; - xdr_enter_page(&xdr, PAGE_SIZE); - status = decode_getfattr(&xdr, &res->fattr, res->server); + status = decode_layoutget(xdr, rqstp, res); out: return status; } -__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +/* + * Decode LAYOUTRETURN response + */ +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) { - uint32_t bitmap[2] = {0}; - uint32_t len; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, 
rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutreturn(xdr, res); +out: + return status; +} - if (!*p++) { - if (!*p) - return ERR_PTR(-EAGAIN); +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutcommit(xdr, rqstp, res); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server); +out: + return status; +} + +/* + * Decode SECINFO_NO_NAME response + */ +static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_secinfo_no_name(xdr, res); +out: + return status; +} + +/* + * Decode TEST_STATEID response + */ +static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_test_stateid(xdr, res); +out: + return status; +} + +/* + * Decode FREE_STATEID response + */ +static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + 
status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_free_stateid(xdr, res); +out: + return status; +} +#endif /* CONFIG_NFS_V4_1 */ + +/** + * nfs4_decode_dirent - Decode a single NFSv4 directory entry stored in + * the local page cache. + * @xdr: XDR stream where entry resides + * @entry: buffer to fill in with entry data + * @plus: boolean indicating whether this should be a readdirplus entry + * + * Returns zero if successful, otherwise a negative errno value is + * returned. + * + * This function is not invoked during READDIR reply decoding, but + * rather whenever an application invokes the getdents(2) system call + * on a directory already in our cache. + */ +int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, + int plus) +{ + unsigned int savep; + uint32_t bitmap[3] = {0}; + uint32_t len; + __be32 *p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (*p == xdr_zero) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + if (*p == xdr_zero) + return -EAGAIN; entry->eof = 1; - return ERR_PTR(-EBADCOOKIE); + return -EBADCOOKIE; } + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; entry->prev_cookie = entry->cookie; p = xdr_decode_hyper(p, &entry->cookie); - entry->len = ntohl(*p++); + entry->len = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, entry->len); + if (unlikely(!p)) + goto out_overflow; entry->name = (const char *) p; - p += XDR_QUADLEN(entry->len); /* * In case the server doesn't return an inode number, @@ -4574,32 +7336,31 @@ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) * since glibc seems to choke on it...) 
*/ entry->ino = 1; + entry->fattr->valid = 0; - len = ntohl(*p++); /* bitmap length */ - if (len-- > 0) { - bitmap[0] = ntohl(*p++); - if (len-- > 0) { - bitmap[1] = ntohl(*p++); - p += len; - } - } - len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */ - if (len > 0) { - if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) { - bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; - /* Ignore the return value of rdattr_error for now */ - p++; - len--; - } - if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) - xdr_decode_hyper(p, &entry->ino); - else if (bitmap[0] == FATTR4_WORD0_FILEID) - xdr_decode_hyper(p, &entry->ino); - p += len; - } + if (decode_attr_bitmap(xdr, bitmap) < 0) + goto out_overflow; - entry->eof = !p[0] && p[1]; - return p; + if (decode_attr_length(xdr, &len, &savep) < 0) + goto out_overflow; + + if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, + NULL, entry->label, entry->server) < 0) + goto out_overflow; + if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + entry->ino = entry->fattr->mounted_on_fileid; + else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) + entry->ino = entry->fattr->fileid; + + entry->d_type = DT_UNKNOWN; + if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EAGAIN; } /* @@ -4611,42 +7372,35 @@ static struct { int errno; } nfs_errtbl[] = { { NFS4_OK, 0 }, - { NFS4ERR_PERM, EPERM }, - { NFS4ERR_NOENT, ENOENT }, - { NFS4ERR_IO, errno_NFSERR_IO }, - { NFS4ERR_NXIO, ENXIO }, - { NFS4ERR_ACCESS, EACCES }, - { NFS4ERR_EXIST, EEXIST }, - { NFS4ERR_XDEV, EXDEV }, - { NFS4ERR_NOTDIR, ENOTDIR }, - { NFS4ERR_ISDIR, EISDIR }, - { NFS4ERR_INVAL, EINVAL }, - { NFS4ERR_FBIG, EFBIG }, - { NFS4ERR_NOSPC, ENOSPC }, - { NFS4ERR_ROFS, EROFS }, - { NFS4ERR_MLINK, EMLINK }, - { NFS4ERR_NAMETOOLONG, ENAMETOOLONG }, - { NFS4ERR_NOTEMPTY, ENOTEMPTY }, - { NFS4ERR_DQUOT, EDQUOT }, - { 
NFS4ERR_STALE, ESTALE }, - { NFS4ERR_BADHANDLE, EBADHANDLE }, - { NFS4ERR_BADOWNER, EINVAL }, - { NFS4ERR_BADNAME, EINVAL }, - { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, - { NFS4ERR_NOTSUPP, ENOTSUPP }, - { NFS4ERR_TOOSMALL, ETOOSMALL }, - { NFS4ERR_SERVERFAULT, ESERVERFAULT }, - { NFS4ERR_BADTYPE, EBADTYPE }, - { NFS4ERR_LOCKED, EAGAIN }, - { NFS4ERR_RESOURCE, EREMOTEIO }, - { NFS4ERR_SYMLINK, ELOOP }, - { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, - { NFS4ERR_DEADLOCK, EDEADLK }, - { NFS4ERR_WRONGSEC, EPERM }, /* FIXME: this needs - * to be handled by a - * middle-layer. - */ - { -1, EIO } + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -errno_NFSERR_IO}, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -EREMOTEIO }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { -1, -EIO } }; /* @@ -4663,65 +7417,87 @@ nfs4_stat_to_errno(int stat) } if (stat <= 10000 || stat > 10100) { /* The server is looney tunes. */ - return ESERVERFAULT; + return -EREMOTEIO; } /* If we cannot translate the error, the recovery routines should * handle it. * Note: remaining NFSv4 error codes have values > 10000, so should * not conflict with native Linux error codes. 
*/ - return stat; + return -stat; } #define PROC(proc, argtype, restype) \ [NFSPROC4_CLNT_##proc] = { \ .p_proc = NFSPROC4_COMPOUND, \ - .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ - .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ + .p_encode = (kxdreproc_t)nfs4_xdr_##argtype, \ + .p_decode = (kxdrdproc_t)nfs4_xdr_##restype, \ .p_arglen = NFS4_##argtype##_sz, \ .p_replen = NFS4_##restype##_sz, \ .p_statidx = NFSPROC4_CLNT_##proc, \ .p_name = #proc, \ - } +} struct rpc_procinfo nfs4_procedures[] = { - PROC(READ, enc_read, dec_read), - PROC(WRITE, enc_write, dec_write), - PROC(COMMIT, enc_commit, dec_commit), - PROC(OPEN, enc_open, dec_open), - PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), - PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), - PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), - PROC(CLOSE, enc_close, dec_close), - PROC(SETATTR, enc_setattr, dec_setattr), - PROC(FSINFO, enc_fsinfo, dec_fsinfo), - PROC(RENEW, enc_renew, dec_renew), - PROC(SETCLIENTID, enc_setclientid, dec_setclientid), - PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), - PROC(LOCK, enc_lock, dec_lock), - PROC(LOCKT, enc_lockt, dec_lockt), - PROC(LOCKU, enc_locku, dec_locku), - PROC(ACCESS, enc_access, dec_access), - PROC(GETATTR, enc_getattr, dec_getattr), - PROC(LOOKUP, enc_lookup, dec_lookup), - PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), - PROC(REMOVE, enc_remove, dec_remove), - PROC(RENAME, enc_rename, dec_rename), - PROC(LINK, enc_link, dec_link), - PROC(SYMLINK, enc_symlink, dec_symlink), - PROC(CREATE, enc_create, dec_create), - PROC(PATHCONF, enc_pathconf, dec_pathconf), - PROC(STATFS, enc_statfs, dec_statfs), - PROC(READLINK, enc_readlink, dec_readlink), - PROC(READDIR, enc_readdir, dec_readdir), - PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), - PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), - PROC(GETACL, enc_getacl, dec_getacl), - PROC(SETACL, enc_setacl, dec_setacl), - PROC(FS_LOCATIONS, 
enc_fs_locations, dec_fs_locations), + PROC(READ, enc_read, dec_read), + PROC(WRITE, enc_write, dec_write), + PROC(COMMIT, enc_commit, dec_commit), + PROC(OPEN, enc_open, dec_open), + PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), + PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), + PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), + PROC(CLOSE, enc_close, dec_close), + PROC(SETATTR, enc_setattr, dec_setattr), + PROC(FSINFO, enc_fsinfo, dec_fsinfo), + PROC(RENEW, enc_renew, dec_renew), + PROC(SETCLIENTID, enc_setclientid, dec_setclientid), + PROC(SETCLIENTID_CONFIRM, enc_setclientid_confirm, dec_setclientid_confirm), + PROC(LOCK, enc_lock, dec_lock), + PROC(LOCKT, enc_lockt, dec_lockt), + PROC(LOCKU, enc_locku, dec_locku), + PROC(ACCESS, enc_access, dec_access), + PROC(GETATTR, enc_getattr, dec_getattr), + PROC(LOOKUP, enc_lookup, dec_lookup), + PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), + PROC(REMOVE, enc_remove, dec_remove), + PROC(RENAME, enc_rename, dec_rename), + PROC(LINK, enc_link, dec_link), + PROC(SYMLINK, enc_symlink, dec_symlink), + PROC(CREATE, enc_create, dec_create), + PROC(PATHCONF, enc_pathconf, dec_pathconf), + PROC(STATFS, enc_statfs, dec_statfs), + PROC(READLINK, enc_readlink, dec_readlink), + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), + PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(SECINFO, enc_secinfo, dec_secinfo), + PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), +#if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), + PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC(SEQUENCE, enc_sequence, dec_sequence), + 
PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), + PROC(BIND_CONN_TO_SESSION, + enc_bind_conn_to_session, dec_bind_conn_to_session), + PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), +#endif /* CONFIG_NFS_V4_1 */ }; -struct rpc_version nfs_version4 = { +const struct rpc_version nfs_version4 = { .number = 4, .nrprocs = ARRAY_SIZE(nfs4_procedures), .procs = nfs4_procedures diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 531379d3682..cd3c910d2d1 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -1,13 +1,12 @@ /* - * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $ - * * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de> * * Allow an NFS filesystem to be mounted as root. The way this works is: * (1) Use the IP autoconfig mechanism to set local IP addresses and routes. - * (2) Handle RPC negotiation with the system which replied to RARP or - * was reported as a boot server by BOOTP or manually. - * (3) The actual mounting is done later, when init() is running. + * (2) Construct the device string and the options string using DHCP + * option 17 and/or kernel command line options. + * (3) When mount_root() sets up the root file system, pass these strings + * to the NFS client's regular mount interface via sys_mount(). * * * Changes: @@ -67,456 +66,244 @@ * Hua Qin : Support for mounting root file system via * NFS over TCP. 
* Fabian Frederick: Option parser rebuilt (using parser lib) -*/ + * Chuck Lever : Use super.c's text-based mount option parsing + * Chuck Lever : Add "nfsrootdebug". + */ #include <linux/types.h> #include <linux/string.h> -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/fs.h> #include <linux/init.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/xprtsock.h> #include <linux/nfs.h> #include <linux/nfs_fs.h> -#include <linux/nfs_mount.h> -#include <linux/in.h> -#include <linux/major.h> #include <linux/utsname.h> -#include <linux/inet.h> #include <linux/root_dev.h> #include <net/ipconfig.h> -#include <linux/parser.h> -/* Define this to allow debugging output */ -#undef NFSROOT_DEBUG +#include "internal.h" + #define NFSDBG_FACILITY NFSDBG_ROOT /* Default path we try to mount. "%s" gets replaced by our IP address */ #define NFS_ROOT "/tftpboot/%s" +/* Default NFSROOT mount options. */ +#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" + /* Parameters passed from the kernel command line */ -static char nfs_root_name[256] __initdata = ""; +static char nfs_root_parms[256] __initdata = ""; + +/* Text-based mount options passed to super.c */ +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; /* Address of NFS server */ -static __be32 servaddr __initdata = 0; +static __be32 servaddr __initdata = htonl(INADDR_NONE); /* Name of directory to mount */ -static char nfs_path[NFS_MAXPATHLEN] __initdata = { 0, }; - -/* NFS-related data */ -static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ -static int nfs_port __initdata = 0; /* Port to connect to for NFS */ -static int mount_port __initdata = 0; /* Mount daemon port number */ - - -/*************************************************************************** - - Parsing of options - - ***************************************************************************/ - -enum { - /* Options that take integer arguments */ - Opt_port, Opt_rsize, Opt_wsize, 
Opt_timeo, Opt_retrans, Opt_acregmin, - Opt_acregmax, Opt_acdirmin, Opt_acdirmax, - /* Options that take no arguments */ - Opt_soft, Opt_hard, Opt_intr, - Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, - Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp, - Opt_acl, Opt_noacl, - /* Error token */ - Opt_err -}; - -static match_table_t __initdata tokens = { - {Opt_port, "port=%u"}, - {Opt_rsize, "rsize=%u"}, - {Opt_wsize, "wsize=%u"}, - {Opt_timeo, "timeo=%u"}, - {Opt_retrans, "retrans=%u"}, - {Opt_acregmin, "acregmin=%u"}, - {Opt_acregmax, "acregmax=%u"}, - {Opt_acdirmin, "acdirmin=%u"}, - {Opt_acdirmax, "acdirmax=%u"}, - {Opt_soft, "soft"}, - {Opt_hard, "hard"}, - {Opt_intr, "intr"}, - {Opt_nointr, "nointr"}, - {Opt_posix, "posix"}, - {Opt_noposix, "noposix"}, - {Opt_cto, "cto"}, - {Opt_nocto, "nocto"}, - {Opt_ac, "ac"}, - {Opt_noac, "noac"}, - {Opt_lock, "lock"}, - {Opt_nolock, "nolock"}, - {Opt_v2, "nfsvers=2"}, - {Opt_v2, "v2"}, - {Opt_v3, "nfsvers=3"}, - {Opt_v3, "v3"}, - {Opt_udp, "proto=udp"}, - {Opt_udp, "udp"}, - {Opt_tcp, "proto=tcp"}, - {Opt_tcp, "tcp"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_err, NULL} - -}; - -/* - * Parse option string. - */ - -static int __init root_nfs_parse(char *name, char *buf) -{ +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = ""; - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!name) - return 1; - - /* Set the NFS remote path */ - p = strsep(&name, ","); - if (p[0] != '\0' && strcmp(p, "default") != 0) - strlcpy(buf, p, NFS_MAXPATHLEN); - - while ((p = strsep (&name, ",")) != NULL) { - int token; - if (!*p) - continue; - token = match_token(p, tokens, args); - - /* %u tokens only. Beware if you add new tokens! 
*/ - if (token < Opt_soft && match_int(&args[0], &option)) - return 0; - switch (token) { - case Opt_port: - nfs_port = option; - break; - case Opt_rsize: - nfs_data.rsize = option; - break; - case Opt_wsize: - nfs_data.wsize = option; - break; - case Opt_timeo: - nfs_data.timeo = option; - break; - case Opt_retrans: - nfs_data.retrans = option; - break; - case Opt_acregmin: - nfs_data.acregmin = option; - break; - case Opt_acregmax: - nfs_data.acregmax = option; - break; - case Opt_acdirmin: - nfs_data.acdirmin = option; - break; - case Opt_acdirmax: - nfs_data.acdirmax = option; - break; - case Opt_soft: - nfs_data.flags |= NFS_MOUNT_SOFT; - break; - case Opt_hard: - nfs_data.flags &= ~NFS_MOUNT_SOFT; - break; - case Opt_intr: - case Opt_nointr: - break; - case Opt_posix: - nfs_data.flags |= NFS_MOUNT_POSIX; - break; - case Opt_noposix: - nfs_data.flags &= ~NFS_MOUNT_POSIX; - break; - case Opt_cto: - nfs_data.flags &= ~NFS_MOUNT_NOCTO; - break; - case Opt_nocto: - nfs_data.flags |= NFS_MOUNT_NOCTO; - break; - case Opt_ac: - nfs_data.flags &= ~NFS_MOUNT_NOAC; - break; - case Opt_noac: - nfs_data.flags |= NFS_MOUNT_NOAC; - break; - case Opt_lock: - nfs_data.flags &= ~NFS_MOUNT_NONLM; - break; - case Opt_nolock: - nfs_data.flags |= NFS_MOUNT_NONLM; - break; - case Opt_v2: - nfs_data.flags &= ~NFS_MOUNT_VER3; - break; - case Opt_v3: - nfs_data.flags |= NFS_MOUNT_VER3; - break; - case Opt_udp: - nfs_data.flags &= ~NFS_MOUNT_TCP; - break; - case Opt_tcp: - nfs_data.flags |= NFS_MOUNT_TCP; - break; - case Opt_acl: - nfs_data.flags &= ~NFS_MOUNT_NOACL; - break; - case Opt_noacl: - nfs_data.flags |= NFS_MOUNT_NOACL; - break; - default: - printk(KERN_WARNING "Root-NFS: unknown " - "option: %s\n", p); - return 0; - } - } - - return 1; -} +/* server:export path string passed to super.c */ +static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; +#ifdef NFS_DEBUG /* - * Prepare the NFS data structure and parse all options. 
+ * When the "nfsrootdebug" kernel command line option is specified, + * enable debugging messages for NFSROOT. */ -static int __init root_nfs_name(char *name) +static int __init nfs_root_debug(char *__unused) { - static char buf[NFS_MAXPATHLEN] __initdata; - char *cp; - - /* Set some default values */ - memset(&nfs_data, 0, sizeof(nfs_data)); - nfs_port = -1; - nfs_data.version = NFS_MOUNT_VERSION; - nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ - nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; - nfs_data.acregmin = 3; - nfs_data.acregmax = 60; - nfs_data.acdirmin = 30; - nfs_data.acdirmax = 60; - strcpy(buf, NFS_ROOT); - - /* Process options received from the remote server */ - root_nfs_parse(root_server_path, buf); - - /* Override them by options set on kernel command-line */ - root_nfs_parse(name, buf); - - cp = utsname()->nodename; - if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { - printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); - return -1; - } - sprintf(nfs_path, buf, cp); - + nfs_debug |= NFSDBG_ROOT | NFSDBG_MOUNT; return 1; } - -/* - * Get NFS server address. - */ -static int __init root_nfs_addr(void) -{ - if ((servaddr = root_server_addr) == htonl(INADDR_NONE)) { - printk(KERN_ERR "Root-NFS: No NFS server available, giving up.\n"); - return -1; - } - - snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), - "%u.%u.%u.%u", NIPQUAD(servaddr)); - return 0; -} - -/* - * Tell the user what's going on. 
- */ -#ifdef NFSROOT_DEBUG -static void __init root_nfs_print(void) -{ - printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", - nfs_path, nfs_data.hostname); - printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n", - nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans); - printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n", - nfs_data.acregmin, nfs_data.acregmax, - nfs_data.acdirmin, nfs_data.acdirmax); - printk(KERN_NOTICE "Root-NFS: nfsd port = %d, mountd port = %d, flags = %08x\n", - nfs_port, mount_port, nfs_data.flags); -} -#endif - - -static int __init root_nfs_init(void) -{ -#ifdef NFSROOT_DEBUG - nfs_debug |= NFSDBG_ROOT; +__setup("nfsrootdebug", nfs_root_debug); #endif - /* - * Decode the root directory path name and NFS options from - * the kernel command line. This has to go here in order to - * be able to use the client IP address for the remote root - * directory (necessary for pure RARP booting). - */ - if (root_nfs_name(nfs_root_name) < 0 || - root_nfs_addr() < 0) - return -1; - -#ifdef NFSROOT_DEBUG - root_nfs_print(); -#endif - - return 0; -} - - /* * Parse NFS server and directory information passed on the kernel * command line. + * + * nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>] + * + * If there is a "%s" token in the <root-dir> string, it is replaced + * by the ASCII-representation of the client's IP address. 
*/ static int __init nfs_root_setup(char *line) { ROOT_DEV = Root_NFS; + if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { - strlcpy(nfs_root_name, line, sizeof(nfs_root_name)); + strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); } else { - int n = strlen(line) + sizeof(NFS_ROOT) - 1; - if (n >= sizeof(nfs_root_name)) - line[sizeof(nfs_root_name) - sizeof(NFS_ROOT) - 2] = '\0'; - sprintf(nfs_root_name, NFS_ROOT, line); + size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; + if (n >= sizeof(nfs_root_parms)) + line[sizeof(nfs_root_parms) - sizeof(NFS_ROOT) - 2] = '\0'; + sprintf(nfs_root_parms, NFS_ROOT, line); } - root_server_addr = root_nfs_parse_addr(nfs_root_name); + + /* + * Extract the IP address of the NFS server containing our + * root file system, if one was specified. + * + * Note: root_nfs_parse_addr() removes the server-ip from + * nfs_root_parms, if it exists. + */ + root_server_addr = root_nfs_parse_addr(nfs_root_parms); + return 1; } __setup("nfsroot=", nfs_root_setup); -/*************************************************************************** +static int __init root_nfs_copy(char *dest, const char *src, + const size_t destlen) +{ + if (strlcpy(dest, src, destlen) > destlen) + return -1; + return 0; +} - Routines to actually mount the root directory +static int __init root_nfs_cat(char *dest, const char *src, + const size_t destlen) +{ + size_t len = strlen(dest); - ***************************************************************************/ + if (len && dest[len - 1] != ',') + if (strlcat(dest, ",", destlen) > destlen) + return -1; -/* - * Construct sockaddr_in from address and port number. - */ -static inline void -set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) -{ - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = addr; - sin->sin_port = port; + if (strlcat(dest, src, destlen) > destlen) + return -1; + return 0; } /* - * Query server portmapper for the port of a daemon program. 
+ * Parse out root export path and mount options from + * passed-in string @incoming. + * + * Copy the export path into @exppath. */ -static int __init root_nfs_getport(int program, int version, int proto) +static int __init root_nfs_parse_options(char *incoming, char *exppath, + const size_t exppathlen) { - struct sockaddr_in sin; + char *p; - printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n", - program, version, NIPQUAD(servaddr)); - set_sockaddr(&sin, servaddr, 0); - return rpcb_getport_sync(&sin, program, version, proto); -} + /* + * Set the NFS remote path + */ + p = strsep(&incoming, ","); + if (*p != '\0' && strcmp(p, "default") != 0) + if (root_nfs_copy(exppath, p, exppathlen)) + return -1; + /* + * @incoming now points to the rest of the string; if it + * contains something, append it to our root options buffer + */ + if (incoming != NULL && *incoming != '\0') + if (root_nfs_cat(nfs_root_options, incoming, + sizeof(nfs_root_options))) + return -1; + return 0; +} /* - * Use portmapper to find mountd and nfsd port numbers if not overriden - * by the user. Use defaults if portmapper is not available. - * XXX: Is there any nfs server with no portmapper? + * Decode the export directory path name and NFS options from + * the kernel command line. This has to be done late in order to + * use a dynamically acquired client IP address for the remote + * root directory path. + * + * Returns zero if successful; otherwise -1 is returned. 
*/ -static int __init root_nfs_ports(void) +static int __init root_nfs_data(char *cmdline) { - int port; - int nfsd_ver, mountd_ver; - int nfsd_port, mountd_port; - int proto; - - if (nfs_data.flags & NFS_MOUNT_VER3) { - nfsd_ver = NFS3_VERSION; - mountd_ver = NFS_MNT3_VERSION; - nfsd_port = NFS_PORT; - mountd_port = NFS_MNT_PORT; - } else { - nfsd_ver = NFS2_VERSION; - mountd_ver = NFS_MNT_VERSION; - nfsd_port = NFS_PORT; - mountd_port = NFS_MNT_PORT; + char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + int len, retval = -1; + char *tmp = NULL; + const size_t tmplen = sizeof(nfs_export_path); + + tmp = kzalloc(tmplen, GFP_KERNEL); + if (tmp == NULL) + goto out_nomem; + strcpy(tmp, NFS_ROOT); + + if (root_server_path[0] != '\0') { + dprintk("Root-NFS: DHCPv4 option 17: %s\n", + root_server_path); + if (root_nfs_parse_options(root_server_path, tmp, tmplen)) + goto out_optionstoolong; } - proto = (nfs_data.flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; - - if (nfs_port < 0) { - if ((port = root_nfs_getport(NFS_PROGRAM, nfsd_ver, proto)) < 0) { - printk(KERN_ERR "Root-NFS: Unable to get nfsd port " - "number from server, using default\n"); - port = nfsd_port; - } - nfs_port = port; - dprintk("Root-NFS: Portmapper on server returned %d " - "as nfsd port\n", port); + if (cmdline[0] != '\0') { + dprintk("Root-NFS: nfsroot=%s\n", cmdline); + if (root_nfs_parse_options(cmdline, tmp, tmplen)) + goto out_optionstoolong; } - if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { - printk(KERN_ERR "Root-NFS: Unable to get mountd port " - "number from server, using default\n"); - port = mountd_port; - } - mount_port = port; - dprintk("Root-NFS: mountd port is %d\n", port); + /* + * Append mandatory options for nfsroot so they override + * what has come before + */ + snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4", + &servaddr); + if (root_nfs_cat(nfs_root_options, mand_options, + sizeof(nfs_root_options))) + goto 
out_optionstoolong; - return 0; + /* + * Set up nfs_root_device. For NFS mounts, this looks like + * + * server:/path + * + * At this point, utsname()->nodename contains our local + * IP address or hostname, set by ipconfig. If "%s" exists + * in tmp, substitute the nodename, then shovel the whole + * mess into nfs_root_device. + */ + len = snprintf(nfs_export_path, sizeof(nfs_export_path), + tmp, utsname()->nodename); + if (len > (int)sizeof(nfs_export_path)) + goto out_devnametoolong; + len = snprintf(nfs_root_device, sizeof(nfs_root_device), + "%pI4:%s", &servaddr, nfs_export_path); + if (len > (int)sizeof(nfs_root_device)) + goto out_devnametoolong; + + retval = 0; + +out: + kfree(tmp); + return retval; +out_nomem: + printk(KERN_ERR "Root-NFS: could not allocate memory\n"); + goto out; +out_optionstoolong: + printk(KERN_ERR "Root-NFS: mount options string too long\n"); + goto out; +out_devnametoolong: + printk(KERN_ERR "Root-NFS: root device name too long.\n"); + goto out; } - -/* - * Get a file handle from the server for the directory which is to be - * mounted. +/** + * nfs_root_data - Return prepared 'data' for NFSROOT mount + * @root_device: OUT: address of string containing NFSROOT device + * @root_data: OUT: address of string containing NFSROOT mount options + * + * Returns zero and sets @root_device and @root_data if successful, + * otherwise -1 is returned. */ -static int __init root_nfs_get_handle(void) +int __init nfs_root_data(char **root_device, char **root_data) { - struct nfs_fh fh; - struct sockaddr_in sin; - int status; - int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? - XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP; - int version = (nfs_data.flags & NFS_MOUNT_VER3) ? 
- NFS_MNT3_VERSION : NFS_MNT_VERSION; - - set_sockaddr(&sin, servaddr, htons(mount_port)); - status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, - nfs_path, version, protocol, &fh); - if (status < 0) - printk(KERN_ERR "Root-NFS: Server returned error %d " - "while mounting %s\n", status, nfs_path); - else { - nfs_data.root.size = fh.size; - memcpy(nfs_data.root.data, fh.data, fh.size); + servaddr = root_server_addr; + if (servaddr == htonl(INADDR_NONE)) { + printk(KERN_ERR "Root-NFS: no NFS server address\n"); + return -1; } - return status; -} + if (root_nfs_data(nfs_root_parms) < 0) + return -1; -/* - * Get the NFS port numbers and file handle, and return the prepared 'data' - * argument for mount() if everything went OK. Return NULL otherwise. - */ -void * __init nfs_root_data(void) -{ - if (root_nfs_init() < 0 - || root_nfs_ports() < 0 - || root_nfs_get_handle() < 0) - return NULL; - set_sockaddr((struct sockaddr_in *) &nfs_data.addr, servaddr, htons(nfs_port)); - return (void*)&nfs_data; + *root_device = nfs_root_device; + *root_data = nfs_root_options; + return 0; } diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c new file mode 100644 index 00000000000..4eb0aead69b --- /dev/null +++ b/fs/nfs/nfstrace.c @@ -0,0 +1,9 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/nfs_fs.h> +#include <linux/namei.h> +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include "nfstrace.h" diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h new file mode 100644 index 00000000000..59f838cdc00 --- /dev/null +++ b/fs/nfs/nfstrace.h @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfs + +#if !defined(_TRACE_NFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NFS_H + +#include <linux/tracepoint.h> + +#define nfs_show_file_type(ftype) \ + __print_symbolic(ftype, \ + { DT_UNKNOWN, "UNKNOWN" }, \ + { DT_FIFO, "FIFO" }, \ + { 
DT_CHR, "CHR" }, \ + { DT_DIR, "DIR" }, \ + { DT_BLK, "BLK" }, \ + { DT_REG, "REG" }, \ + { DT_LNK, "LNK" }, \ + { DT_SOCK, "SOCK" }, \ + { DT_WHT, "WHT" }) + +#define nfs_show_cache_validity(v) \ + __print_flags(v, "|", \ + { NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \ + { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \ + { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ + { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ + { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ + { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ + { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ + { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }) + +#define nfs_show_nfsi_flags(v) \ + __print_flags(v, "|", \ + { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ + { 1 << NFS_INO_STALE, "STALE" }, \ + { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ + { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ + { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ + { 1 << NFS_INO_COMMIT, "COMMIT" }, \ + { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ + { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) + +DECLARE_EVENT_CLASS(nfs_inode_event, + TP_PROTO( + const struct inode *inode + ), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode->i_version; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (unsigned long long)__entry->version + ) +); + +DECLARE_EVENT_CLASS(nfs_inode_event_done, + TP_PROTO( + const struct inode *inode, + int error + ), + + TP_ARGS(inode, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(unsigned char, type) + __field(u64, fileid) + 
__field(u64, version) + __field(loff_t, size) + __field(unsigned long, nfsi_flags) + __field(unsigned long, cache_validity) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->error = error; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->type = nfs_umode_to_dtype(inode->i_mode); + __entry->version = inode->i_version; + __entry->size = i_size_read(inode); + __entry->nfsi_flags = nfsi->flags; + __entry->cache_validity = nfsi->cache_validity; + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "type=%u (%s) version=%llu size=%lld " + "cache_validity=%lu (%s) nfs_flags=%ld (%s)", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->type, + nfs_show_file_type(__entry->type), + (unsigned long long)__entry->version, + (long long)__entry->size, + __entry->cache_validity, + nfs_show_cache_validity(__entry->cache_validity), + __entry->nfsi_flags, + nfs_show_nfsi_flags(__entry->nfsi_flags) + ) +); + +#define DEFINE_NFS_INODE_EVENT(name) \ + DEFINE_EVENT(nfs_inode_event, name, \ + TP_PROTO( \ + const struct inode *inode \ + ), \ + TP_ARGS(inode)) +#define DEFINE_NFS_INODE_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_inode_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(inode, error)) +DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_revalidate_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_invalidate_mapping_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_invalidate_mapping_exit); +DEFINE_NFS_INODE_EVENT(nfs_getattr_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit); +DEFINE_NFS_INODE_EVENT(nfs_setattr_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit); 
+DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit); +DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit); +DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); +DEFINE_NFS_INODE_EVENT(nfs_access_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit); + +#define show_lookup_flags(flags) \ + __print_flags((unsigned long)flags, "|", \ + { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \ + { LOOKUP_DIRECTORY, "DIRECTORY" }, \ + { LOOKUP_OPEN, "OPEN" }, \ + { LOOKUP_CREATE, "CREATE" }, \ + { LOOKUP_EXCL, "EXCL" }) + +DECLARE_EVENT_CLASS(nfs_lookup_event, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags + ), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->flags, + show_lookup_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_LOOKUP_EVENT(name) \ + DEFINE_EVENT(nfs_lookup_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry, \ + unsigned int flags \ + ), \ + TP_ARGS(dir, dentry, flags)) + +DECLARE_EVENT_CLASS(nfs_lookup_event_done, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags, + int error + ), + + TP_ARGS(dir, dentry, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + 
__entry->error = error; + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_lookup_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_LOOKUP_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_lookup_event_done, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry, \ + unsigned int flags, \ + int error \ + ), \ + TP_ARGS(dir, dentry, flags, error)) + +DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit); +DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit); + +#define show_open_flags(flags) \ + __print_flags((unsigned long)flags, "|", \ + { O_CREAT, "O_CREAT" }, \ + { O_EXCL, "O_EXCL" }, \ + { O_TRUNC, "O_TRUNC" }, \ + { O_APPEND, "O_APPEND" }, \ + { O_DSYNC, "O_DSYNC" }, \ + { O_DIRECT, "O_DIRECT" }, \ + { O_DIRECTORY, "O_DIRECTORY" }) + +#define show_fmode_flags(mode) \ + __print_flags(mode, "|", \ + { ((__force unsigned long)FMODE_READ), "READ" }, \ + { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ + { ((__force unsigned long)FMODE_EXEC), "EXEC" }) + +TRACE_EVENT(nfs_atomic_open_enter, + TP_PROTO( + const struct inode *dir, + const struct nfs_open_context *ctx, + unsigned int flags + ), + + TP_ARGS(dir, ctx, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __field(unsigned int, fmode) + __field(dev_t, dev) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s", + __entry->flags, + show_open_flags(__entry->flags), + 
show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_atomic_open_exit, + TP_PROTO( + const struct inode *dir, + const struct nfs_open_context *ctx, + unsigned int flags, + int error + ), + + TP_ARGS(dir, ctx, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) + __field(unsigned int, fmode) + __field(dev_t, dev) + __field(u64, dir) + __string(name, ctx->dentry->d_name.name) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __entry->fmode = (__force unsigned int)ctx->mode; + __assign_str(name, ctx->dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) fmode=%s " + "name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_open_flags(__entry->flags), + show_fmode_flags(__entry->fmode), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_create_enter, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags + ), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->flags, + show_open_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_create_exit, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + unsigned int flags, + int error + ), + + TP_ARGS(dir, dentry, flags, error), + + TP_STRUCT__entry( + __field(int, error) + __field(unsigned int, flags) 
+ __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->flags = flags; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", + __entry->error, + __entry->flags, + show_open_flags(__entry->flags), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +DECLARE_EVENT_CLASS(nfs_directory_event, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry + ), + + TP_ARGS(dir, dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_DIRECTORY_EVENT(name) \ + DEFINE_EVENT(nfs_directory_event, name, \ + TP_PROTO( \ + const struct inode *dir, \ + const struct dentry *dentry \ + ), \ + TP_ARGS(dir, dentry)) + +DECLARE_EVENT_CLASS(nfs_directory_event_done, + TP_PROTO( + const struct inode *dir, + const struct dentry *dentry, + int error + ), + + TP_ARGS(dir, dentry, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +#define DEFINE_NFS_DIRECTORY_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_directory_event_done, name, \ + TP_PROTO( 
\ + const struct inode *dir, \ + const struct dentry *dentry, \ + int error \ + ), \ + TP_ARGS(dir, dentry, error)) + +DEFINE_NFS_DIRECTORY_EVENT(nfs_mknod_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mknod_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_mkdir_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_mkdir_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_rmdir_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_rmdir_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_remove_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_remove_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_unlink_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_unlink_exit); +DEFINE_NFS_DIRECTORY_EVENT(nfs_symlink_enter); +DEFINE_NFS_DIRECTORY_EVENT_DONE(nfs_symlink_exit); + +TRACE_EVENT(nfs_link_enter, + TP_PROTO( + const struct inode *inode, + const struct inode *dir, + const struct dentry *dentry + ), + + TP_ARGS(inode, dir, dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->dir = NFS_FILEID(dir); + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->fileid, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +TRACE_EVENT(nfs_link_exit, + TP_PROTO( + const struct inode *inode, + const struct inode *dir, + const struct dentry *dentry, + int error + ), + + TP_ARGS(inode, dir, dentry, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u64, fileid) + __field(u64, dir) + __string(name, dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + __assign_str(name, dentry->d_name.name); + ), + + TP_printk( + "error=%d 
fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->fileid, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); + +DECLARE_EVENT_CLASS(nfs_rename_event, + TP_PROTO( + const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry + ), + + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, old_dir) + __field(u64, new_dir) + __string(old_name, old_dentry->d_name.name) + __string(new_name, new_dentry->d_name.name) + ), + + TP_fast_assign( + __entry->dev = old_dir->i_sb->s_dev; + __entry->old_dir = NFS_FILEID(old_dir); + __entry->new_dir = NFS_FILEID(new_dir); + __assign_str(old_name, old_dentry->d_name.name); + __assign_str(new_name, new_dentry->d_name.name); + ), + + TP_printk( + "old_name=%02x:%02x:%llu/%s new_name=%02x:%02x:%llu/%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->old_dir, + __get_str(old_name), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->new_dir, + __get_str(new_name) + ) +); +#define DEFINE_NFS_RENAME_EVENT(name) \ + DEFINE_EVENT(nfs_rename_event, name, \ + TP_PROTO( \ + const struct inode *old_dir, \ + const struct dentry *old_dentry, \ + const struct inode *new_dir, \ + const struct dentry *new_dentry \ + ), \ + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry)) + +DECLARE_EVENT_CLASS(nfs_rename_event_done, + TP_PROTO( + const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, + int error + ), + + TP_ARGS(old_dir, old_dentry, new_dir, new_dentry, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, old_dir) + __string(old_name, old_dentry->d_name.name) + __field(u64, new_dir) + __string(new_name, new_dentry->d_name.name) + ), + + 
TP_fast_assign( + __entry->dev = old_dir->i_sb->s_dev; + __entry->old_dir = NFS_FILEID(old_dir); + __entry->new_dir = NFS_FILEID(new_dir); + __entry->error = error; + __assign_str(old_name, old_dentry->d_name.name); + __assign_str(new_name, new_dentry->d_name.name); + ), + + TP_printk( + "error=%d old_name=%02x:%02x:%llu/%s " + "new_name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->old_dir, + __get_str(old_name), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->new_dir, + __get_str(new_name) + ) +); +#define DEFINE_NFS_RENAME_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_rename_event_done, name, \ + TP_PROTO( \ + const struct inode *old_dir, \ + const struct dentry *old_dentry, \ + const struct inode *new_dir, \ + const struct dentry *new_dentry, \ + int error \ + ), \ + TP_ARGS(old_dir, old_dentry, new_dir, \ + new_dentry, error)) + +DEFINE_NFS_RENAME_EVENT(nfs_rename_enter); +DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit); + +DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename); + +TRACE_EVENT(nfs_sillyrename_unlink, + TP_PROTO( + const struct nfs_unlinkdata *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, error) + __field(u64, dir) + __dynamic_array(char, name, data->args.name.len + 1) + ), + + TP_fast_assign( + struct inode *dir = data->dir; + size_t len = data->args.name.len; + __entry->dev = dir->i_sb->s_dev; + __entry->dir = NFS_FILEID(dir); + __entry->error = error; + memcpy(__get_dynamic_array(name), + data->args.name.name, len); + ((char *)__get_dynamic_array(name))[len] = 0; + ), + + TP_printk( + "error=%d name=%02x:%02x:%llu/%s", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->dir, + __get_str(name) + ) +); +#endif /* _TRACE_NFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE nfstrace +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild new file mode 100644 index 00000000000..ed30ea072bb --- /dev/null +++ b/fs/nfs/objlayout/Kbuild @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module +# +objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c new file mode 100644 index 00000000000..611320753db --- /dev/null +++ b/fs/nfs/objlayout/objio_osd.c @@ -0,0 +1,706 @@ +/* + * pNFS Objects layout implementation over open-osd initiator library + * + * Copyright (C) 2009 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy <bhalevy@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <scsi/osd_ore.h> + +#include "objlayout.h" +#include "../internal.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +struct objio_dev_ent { + struct nfs4_deviceid_node id_node; + struct ore_dev od; +}; + +static void +objio_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); + + dprintk("%s: free od=%p\n", __func__, de->od.od); + osduld_put_device(de->od.od); + kfree(de); +} + +static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de; + + d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); + if (!d) + return NULL; + + de = container_of(d, struct objio_dev_ent, id_node); + return de; +} + +static struct objio_dev_ent * +_dev_list_add(const struct nfs_server *nfss, + const struct nfs4_deviceid *d_id, struct osd_dev *od, + gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d; + struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); + struct objio_dev_ent *n; + + if (!de) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + return NULL; + } + + dprintk("%s: Adding od=%p\n", 
__func__, od); + nfs4_init_deviceid_node(&de->id_node, + nfss->pnfs_curr_ld, + nfss->nfs_client, + d_id); + de->od.od = od; + + d = nfs4_insert_deviceid_node(&de->id_node); + n = container_of(d, struct objio_dev_ent, id_node); + if (n != de) { + dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); + objio_free_deviceid_node(&de->id_node); + de = n; + } + + return de; +} + +struct objio_segment { + struct pnfs_layout_segment lseg; + + struct ore_layout layout; + struct ore_components oc; +}; + +static inline struct objio_segment * +OBJIO_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, struct objio_segment, lseg); +} + +struct objio_state { + /* Generic layer */ + struct objlayout_io_res oir; + + bool sync; + /*FIXME: Support for extra_bytes at ore_get_rw_state() */ + struct ore_io_state *ios; +}; + +/* Send and wait for a get_device_info of devices in the layout, + then look them up with the osd_initiator library */ +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, + gfp_t gfp_flags) +{ + struct pnfs_osd_deviceaddr *deviceaddr; + struct objio_dev_ent *ode; + struct osd_dev *od; + struct osd_dev_info odi; + bool retry_flag = true; + int err; + + ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); + if (ode) { + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + return 0; + } + + err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); + if (unlikely(err)) { + dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", + __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); + return err; + } + + odi.systemid_len = deviceaddr->oda_systemid.len; + if (odi.systemid_len > sizeof(odi.systemid)) { + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", + __func__, sizeof(odi.systemid)); + err = -EINVAL; + goto out; + } else if (odi.systemid_len) + memcpy(odi.systemid, deviceaddr->oda_systemid.data, + odi.systemid_len); 
+ odi.osdname_len = deviceaddr->oda_osdname.len; + odi.osdname = (u8 *)deviceaddr->oda_osdname.data; + + if (!odi.osdname_len && !odi.systemid_len) { + dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", + __func__); + err = -ENODEV; + goto out; + } + +retry_lookup: + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + err = PTR_ERR(od); + dprintk("%s: osduld_info_lookup => %d\n", __func__, err); + if (err == -ENODEV && retry_flag) { + err = objlayout_autologin(deviceaddr); + if (likely(!err)) { + retry_flag = false; + goto retry_lookup; + } + } + goto out; + } + + ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, + gfp_flags); + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + dprintk("Adding new dev_id(%llx:%llx)\n", + _DEVID_LO(d_id), _DEVID_HI(d_id)); +out: + objlayout_put_deviceinfo(deviceaddr); + return err; +} + +static void copy_single_comp(struct ore_components *oc, unsigned c, + struct pnfs_osd_object_cred *src_comp) +{ + struct ore_comp *ocomp = &oc->comps[c]; + + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); + + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; + ocomp->obj.id = src_comp->oc_object_id.oid_object_id; + + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); +} + +static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, + struct objio_segment **pseg) +{ +/* This is the in memory structure of the objio_segment + * + * struct __alloc_objio_segment { + * struct objio_segment olseg; + * struct ore_dev *ods[numdevs]; + * struct ore_comp comps[numdevs]; + * } *aolseg; + * NOTE: The code as above compiles and runs perfectly. It is elegant, + * type safe and compact. At some Past time Linus has decided he does not + * like variable length arrays, For the sake of this principal we uglify + * the code as below. 
+ */ + struct objio_segment *lseg; + size_t lseg_size = sizeof(*lseg) + + numdevs * sizeof(lseg->oc.ods[0]) + + numdevs * sizeof(*lseg->oc.comps); + + lseg = kzalloc(lseg_size, gfp_flags); + if (unlikely(!lseg)) { + dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, + numdevs, lseg_size); + return -ENOMEM; + } + + lseg->oc.numdevs = numdevs; + lseg->oc.single_comp = EC_MULTPLE_COMPS; + lseg->oc.ods = (void *)(lseg + 1); + lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); + + *pseg = lseg; + return 0; +} + +int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags) +{ + struct objio_segment *objio_seg; + struct pnfs_osd_xdr_decode_layout_iter iter; + struct pnfs_osd_layout layout; + struct pnfs_osd_object_cred src_comp; + unsigned cur_comp; + int err; + + err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); + if (unlikely(err)) + return err; + + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); + if (unlikely(err)) + return err; + + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; + objio_seg->layout.group_width = layout.olo_map.odm_group_width; + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; + + err = ore_verify_layout(layout.olo_map.odm_num_comps, + &objio_seg->layout); + if (unlikely(err)) + goto err; + + objio_seg->oc.first_dev = layout.olo_comps_index; + cur_comp = 0; + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); + err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, + &src_comp.oc_object_id.oid_device_id, + gfp_flags); + if (err) + goto err; + ++cur_comp; + } + /* pnfs_osd_xdr_decode_layout_comp returns false on error */ + if (unlikely(err)) + goto err; 
+ + *outp = &objio_seg->lseg; + return 0; + +err: + kfree(objio_seg); + dprintk("%s: Error: return %d\n", __func__, err); + *outp = NULL; + return err; +} + +void objio_free_lseg(struct pnfs_layout_segment *lseg) +{ + int i; + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + + for (i = 0; i < objio_seg->oc.numdevs; i++) { + struct ore_dev *od = objio_seg->oc.ods[i]; + struct objio_dev_ent *ode; + + if (!od) + break; + ode = container_of(od, typeof(*ode), od); + nfs4_put_deviceid_node(&ode->id_node); + } + kfree(objio_seg); +} + +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, + struct objio_state **outp) +{ + struct objio_segment *objio_seg = OBJIO_LSEG(lseg); + struct ore_io_state *ios; + int ret; + struct __alloc_objio_state { + struct objio_state objios; + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; + } *aos; + + aos = kzalloc(sizeof(*aos), gfp_flags); + if (unlikely(!aos)) + return -ENOMEM; + + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, + aos->ioerrs, rpcdata, pnfs_layout_type); + + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, + offset, count, &ios); + if (unlikely(ret)) { + kfree(aos); + return ret; + } + + ios->pages = pages; + ios->pgbase = pgbase; + ios->private = aos; + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + + aos->objios.sync = 0; + aos->objios.ios = ios; + *outp = &aos->objios; + return 0; +} + +void objio_free_result(struct objlayout_io_res *oir) +{ + struct objio_state *objios = container_of(oir, struct objio_state, oir); + + ore_put_io_state(objios->ios); + kfree(objios); +} + +static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) +{ + switch (oep) { + case OSD_ERR_PRI_NO_ERROR: + return (enum pnfs_osd_errno)0; + + case OSD_ERR_PRI_CLEAR_PAGES: + BUG_ON(1); + 
return 0; + + case OSD_ERR_PRI_RESOURCE: + return PNFS_OSD_ERR_RESOURCE; + case OSD_ERR_PRI_BAD_CRED: + return PNFS_OSD_ERR_BAD_CRED; + case OSD_ERR_PRI_NO_ACCESS: + return PNFS_OSD_ERR_NO_ACCESS; + case OSD_ERR_PRI_UNREACHABLE: + return PNFS_OSD_ERR_UNREACHABLE; + case OSD_ERR_PRI_NOT_FOUND: + return PNFS_OSD_ERR_NOT_FOUND; + case OSD_ERR_PRI_NO_SPACE: + return PNFS_OSD_ERR_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case OSD_ERR_PRI_EIO: + return PNFS_OSD_ERR_EIO; + } +} + +static void __on_dev_error(struct ore_io_state *ios, + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len) +{ + struct objio_state *objios = ios->private; + struct pnfs_osd_objid pooid; + struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); + /* FIXME: what to do with more-then-one-group layouts. We need to + * translate from ore_io_state index to oc->comps index + */ + unsigned comp = dev_index; + + pooid.oid_device_id = ode->id_node.deviceid; + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; + pooid.oid_object_id = ios->oc->comps[comp].obj.id; + + objlayout_io_set_result(&objios->oir, comp, + &pooid, osd_pri_2_pnfs_err(oep), + dev_offset, dev_len, !ios->reading); +} + +/* + * read + */ +static void _read_done(struct ore_io_state *ios, void *private) +{ + struct objio_state *objios = private; + ssize_t status; + int ret = ore_check_io(ios, &__on_dev_error); + + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ + + if (likely(!ret)) + status = ios->length; + else + status = ret; + + objlayout_read_done(&objios->oir, status, objios->sync); +} + +int objio_read_pagelist(struct nfs_pgio_data *rdata) +{ + struct nfs_pgio_header *hdr = rdata->header; + struct objio_state *objios; + int ret; + + ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, + hdr->lseg, rdata->args.pages, rdata->args.pgbase, + rdata->args.offset, rdata->args.count, rdata, + GFP_KERNEL, &objios); + if (unlikely(ret)) + 
return ret; + + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + rdata->args.offset, rdata->args.count); + ret = ore_read(objios->ios); + if (unlikely(ret)) + objio_free_result(&objios->oir); + return ret; +} + +/* + * write + */ +static void _write_done(struct ore_io_state *ios, void *private) +{ + struct objio_state *objios = private; + ssize_t status; + int ret = ore_check_io(ios, &__on_dev_error); + + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ + + if (likely(!ret)) { + /* FIXME: should be based on the OSD's persistence model + * See OSD2r05 Section 4.13 Data persistence model */ + objios->oir.committed = NFS_FILE_SYNC; + status = ios->length; + } else { + status = ret; + } + + objlayout_write_done(&objios->oir, status, objios->sync); +} + +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct objio_state *objios = priv; + struct nfs_pgio_data *wdata = objios->oir.rpcdata; + struct address_space *mapping = wdata->header->inode->i_mapping; + pgoff_t index = offset / PAGE_SIZE; + struct page *page; + loff_t i_size = i_size_read(wdata->header->inode); + + if (offset >= i_size) { + *uptodate = true; + dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); + return ZERO_PAGE(0); + } + + page = find_get_page(mapping, index); + if (!page) { + page = find_or_create_page(mapping, index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s: grab_cache_page Failed index=0x%lx\n", + __func__, index); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); + return page; +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + dprintk("%s: index=0x%lx\n", __func__, + (page == ZERO_PAGE(0)) ? 
-1UL : page->index); + if (ZERO_PAGE(0) != page) + page_cache_release(page); + return; +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + +int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) +{ + struct nfs_pgio_header *hdr = wdata->header; + struct objio_state *objios; + int ret; + + ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, + hdr->lseg, wdata->args.pages, wdata->args.pgbase, + wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, + &objios); + if (unlikely(ret)) + return ret; + + objios->sync = 0 != (how & FLUSH_SYNC); + objios->ios->r4w = &_r4w_op; + + if (!objios->sync) + objios->ios->done = _write_done; + + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + wdata->args.offset, wdata->args.count); + ret = ore_write(objios->ios); + if (unlikely(ret)) { + objio_free_result(&objios->oir); + return ret; + } + + if (objios->sync) + _write_done(objios->ios, objios); + + return 0; +} + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
+ */ +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) +{ + unsigned int size; + + size = pnfs_generic_pg_test(pgio, prev, req); + + if (!size || pgio->pg_count + req->wb_bytes > + (unsigned long)pgio->pg_layout_private) + return 0; + + return min(size, req->wb_bytes); +} + +static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + pnfs_generic_pg_init_read(pgio, req); + if (unlikely(pgio->pg_lseg == NULL)) + return; /* Not pNFS */ + + pgio->pg_layout_private = (void *) + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; +} + +static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, + unsigned long *stripe_end) +{ + u32 stripe_off; + unsigned stripe_size; + + if (layout->raid_algorithm == PNFS_OSD_RAID_0) + return true; + + stripe_size = layout->stripe_unit * + (layout->group_width - layout->parity); + + div_u64_rem(offset, stripe_size, &stripe_off); + if (!stripe_off) + return true; + + *stripe_end = stripe_size - stripe_off; + return false; +} + +static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + unsigned long stripe_end = 0; + u64 wb_size; + + if (pgio->pg_dreq == NULL) + wb_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); + if (unlikely(pgio->pg_lseg == NULL)) + return; /* Not pNFS */ + + if (req->wb_offset || + !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE, + &OBJIO_LSEG(pgio->pg_lseg)->layout, + &stripe_end)) { + pgio->pg_layout_private = (void *)stripe_end; + } else { + pgio->pg_layout_private = (void *) + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; + } +} + +static const struct nfs_pageio_ops objio_pg_read_ops = { + .pg_init = objio_init_read, + .pg_test = objio_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops objio_pg_write_ops = { + .pg_init = 
objio_init_write, + .pg_test = objio_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + +static struct pnfs_layoutdriver_type objlayout_type = { + .id = LAYOUT_OSD2_OBJECTS, + .name = "LAYOUT_OSD2_OBJECTS", + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, + + .owner = THIS_MODULE, + .alloc_layout_hdr = objlayout_alloc_layout_hdr, + .free_layout_hdr = objlayout_free_layout_hdr, + + .alloc_lseg = objlayout_alloc_lseg, + .free_lseg = objlayout_free_lseg, + + .read_pagelist = objlayout_read_pagelist, + .write_pagelist = objlayout_write_pagelist, + .pg_read_ops = &objio_pg_read_ops, + .pg_write_ops = &objio_pg_write_ops, + + .free_deviceid_node = objio_free_deviceid_node, + + .encode_layoutcommit = objlayout_encode_layoutcommit, + .encode_layoutreturn = objlayout_encode_layoutreturn, +}; + +MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); +MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>"); +MODULE_LICENSE("GPL"); + +static int __init +objlayout_init(void) +{ + int ret = pnfs_register_layoutdriver(&objlayout_type); + + if (ret) + printk(KERN_INFO + "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n", + __func__, ret); + else + printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n", + __func__); + return ret; +} + +static void __exit +objlayout_exit(void) +{ + pnfs_unregister_layoutdriver(&objlayout_type); + printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n", + __func__); +} + +MODULE_ALIAS("nfs-layouttype4-2"); + +module_init(objlayout_init); +module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c new file mode 100644 index 00000000000..765d3f54e98 --- /dev/null +++ b/fs/nfs/objlayout/objlayout.c @@ -0,0 +1,779 @@ +/* + * pNFS Objects layout driver high level definitions + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. 
+ * + * Benny Halevy <bhalevy@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/kmod.h> +#include <linux/moduleparam.h> +#include <linux/ratelimit.h> +#include <scsi/osd_initiator.h> +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* + * Create a objlayout layout structure for the given inode and return it. + */ +struct pnfs_layout_hdr * +objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct objlayout *objlay; + + objlay = kzalloc(sizeof(struct objlayout), gfp_flags); + if (!objlay) + return NULL; + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); + dprintk("%s: Return %p\n", __func__, objlay); + return &objlay->pnfs_layout; +} + +/* + * Free an objlayout layout structure + */ +void +objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct objlayout *objlay = OBJLAYOUT(lo); + + dprintk("%s: objlay %p\n", __func__, objlay); + + WARN_ON(!list_empty(&objlay->err_list)); + kfree(objlay); +} + +/* + * Unmarshall layout and store it in pnfslay. + */ +struct pnfs_layout_segment * +objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + int status = -ENOMEM; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + struct pnfs_layout_segment *lseg; + + dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); + + scratch = alloc_page(gfp_flags); + if (!scratch) + goto err_nofree; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); + if (unlikely(status)) { + dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, + status); + goto err; + } + + __free_page(scratch); + + dprintk("%s: Return %p\n", __func__, lseg); + return lseg; + +err: + __free_page(scratch); +err_nofree: + dprintk("%s: Err Return=>%d\n", __func__, status); 
+ return ERR_PTR(status); +} + +/* + * Free a layout segement + */ +void +objlayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s: freeing layout segment %p\n", __func__, lseg); + + if (unlikely(!lseg)) + return; + + objio_free_lseg(lseg); +} + +/* + * I/O Operations + */ +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, + struct page ***p_pages, unsigned *p_pgbase, + u64 offset, unsigned long count) +{ + u64 lseg_end_offset; + + BUG_ON(offset < lseg->pls_range.offset); + lseg_end_offset = end_offset(lseg->pls_range.offset, + lseg->pls_range.length); + BUG_ON(offset >= lseg_end_offset); + WARN_ON(offset + count > lseg_end_offset); + + if (*p_pgbase > PAGE_SIZE) { + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); + *p_pages += *p_pgbase >> PAGE_SHIFT; + *p_pgbase &= ~PAGE_MASK; + } +} + +/* + * I/O done common code + */ +static void +objlayout_iodone(struct objlayout_io_res *oir) +{ + if (likely(oir->status >= 0)) { + objio_free_result(oir); + } else { + struct objlayout *objlay = oir->objlay; + + spin_lock(&objlay->lock); + objlay->delta_space_valid = OBJ_DSU_INVALID; + list_add(&objlay->err_list, &oir->err_list); + spin_unlock(&objlay->lock); + } +} + +/* + * objlayout_io_set_result - Set an osd_error code on a specific osd comp. + * + * The @index component IO failed (error returned from target). Register + * the error for later reporting at layout-return. 
+ */ +void +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, + struct pnfs_osd_objid *pooid, int osd_error, + u64 offset, u64 length, bool is_write) +{ + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; + + BUG_ON(index >= oir->num_comps); + if (osd_error) { + ioerr->oer_component = *pooid; + ioerr->oer_comp_offset = offset; + ioerr->oer_comp_length = length; + ioerr->oer_iswrite = is_write; + ioerr->oer_errno = osd_error; + + dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " + "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, index, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + } else { + /* User need not call if no error is reported */ + ioerr->oer_errno = 0; + } +} + +/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). 
+ * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_read_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_pgio_data *rdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_pgio_data, task); + + pnfs_ld_read_done(rdata); +} + +void +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) +{ + struct nfs_pgio_data *rdata = oir->rpcdata; + + oir->status = rdata->task.tk_status = status; + if (status >= 0) + rdata->res.count = status; + else + rdata->header->pnfs_error = status; + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, + status, rdata->res.eof, sync); + + if (sync) + pnfs_ld_read_done(rdata); + else { + INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); + schedule_work(&rdata->task.u.tk_work); + } +} + +/* + * Perform sync or async reads. 
+ */ +enum pnfs_try_status +objlayout_read_pagelist(struct nfs_pgio_data *rdata) +{ + struct nfs_pgio_header *hdr = rdata->header; + struct inode *inode = hdr->inode; + loff_t offset = rdata->args.offset; + size_t count = rdata->args.count; + int err; + loff_t eof; + + eof = i_size_read(inode); + if (unlikely(offset + count > eof)) { + if (offset >= eof) { + err = 0; + rdata->res.count = 0; + rdata->res.eof = 1; + /*FIXME: do we need to call pnfs_ld_read_done() */ + goto out; + } + count = eof - offset; + } + + rdata->res.eof = (offset + count) >= eof; + _fix_verify_io_params(hdr->lseg, &rdata->args.pages, + &rdata->args.pgbase, + rdata->args.offset, rdata->args.count); + + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + __func__, inode->i_ino, offset, count, rdata->res.eof); + + err = objio_read_pagelist(rdata); + out: + if (unlikely(err)) { + hdr->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } + return PNFS_ATTEMPTED; +} + +/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
+ * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_write_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_pgio_data *wdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_pgio_data, task); + + pnfs_ld_write_done(wdata); +} + +void +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) +{ + struct nfs_pgio_data *wdata = oir->rpcdata; + + oir->status = wdata->task.tk_status = status; + if (status >= 0) { + wdata->res.count = status; + wdata->verf.committed = oir->committed; + } else { + wdata->header->pnfs_error = status; + } + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, + status, wdata->verf.committed, sync); + + if (sync) + pnfs_ld_write_done(wdata); + else { + INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); + schedule_work(&wdata->task.u.tk_work); + } +} + +/* + * Perform sync or async writes. 
+ */ +enum pnfs_try_status +objlayout_write_pagelist(struct nfs_pgio_data *wdata, + int how) +{ + struct nfs_pgio_header *hdr = wdata->header; + int err; + + _fix_verify_io_params(hdr->lseg, &wdata->args.pages, + &wdata->args.pgbase, + wdata->args.offset, wdata->args.count); + + err = objio_write_pagelist(wdata, how); + if (unlikely(err)) { + hdr->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } + return PNFS_ATTEMPTED; +} + +void +objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct pnfs_osd_layoutupdate lou; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + + spin_lock(&objlay->lock); + lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); + lou.dsu_delta = objlay->delta_space_used; + objlay->delta_space_used = 0; + objlay->delta_space_valid = OBJ_DSU_INIT; + lou.olu_ioerr_flag = !list_empty(&objlay->err_list); + spin_unlock(&objlay->lock); + + start = xdr_reserve_space(xdr, 4); + + BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + + dprintk("%s: Return delta_space_used %lld err %d\n", __func__, + lou.dsu_delta, lou.olu_ioerr_flag); +} + +static int +err_prio(u32 oer_errno) +{ + switch (oer_errno) { + case 0: + return 0; + + case PNFS_OSD_ERR_RESOURCE: + return OSD_ERR_PRI_RESOURCE; + case PNFS_OSD_ERR_BAD_CRED: + return OSD_ERR_PRI_BAD_CRED; + case PNFS_OSD_ERR_NO_ACCESS: + return OSD_ERR_PRI_NO_ACCESS; + case PNFS_OSD_ERR_UNREACHABLE: + return OSD_ERR_PRI_UNREACHABLE; + case PNFS_OSD_ERR_NOT_FOUND: + return OSD_ERR_PRI_NOT_FOUND; + case PNFS_OSD_ERR_NO_SPACE: + return OSD_ERR_PRI_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case PNFS_OSD_ERR_EIO: + return OSD_ERR_PRI_EIO; + } +} + +static void +merge_ioerr(struct pnfs_osd_ioerr *dest_err, + const struct pnfs_osd_ioerr *src_err) +{ + 
u64 dest_end, src_end; + + if (!dest_err->oer_errno) { + *dest_err = *src_err; + /* accumulated device must be blank */ + memset(&dest_err->oer_component.oid_device_id, 0, + sizeof(dest_err->oer_component.oid_device_id)); + + return; + } + + if (dest_err->oer_component.oid_partition_id != + src_err->oer_component.oid_partition_id) + dest_err->oer_component.oid_partition_id = 0; + + if (dest_err->oer_component.oid_object_id != + src_err->oer_component.oid_object_id) + dest_err->oer_component.oid_object_id = 0; + + if (dest_err->oer_comp_offset > src_err->oer_comp_offset) + dest_err->oer_comp_offset = src_err->oer_comp_offset; + + dest_end = end_offset(dest_err->oer_comp_offset, + dest_err->oer_comp_length); + src_end = end_offset(src_err->oer_comp_offset, + src_err->oer_comp_length); + if (dest_end < src_end) + dest_end = src_end; + + dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; + + if ((src_err->oer_iswrite == dest_err->oer_iswrite) && + (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { + dest_err->oer_errno = src_err->oer_errno; + } else if (src_err->oer_iswrite) { + dest_err->oer_iswrite = true; + dest_err->oer_errno = src_err->oer_errno; + } +} + +static void +encode_accumulated_error(struct objlayout *objlay, __be32 *p) +{ + struct objlayout_io_res *oir, *tmp; + struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; + + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { + unsigned i; + + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " + "is_write=%d dev(%llx:%llx) par=0x%llx " + "obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + 
ioerr->oer_comp_length); + + merge_ioerr(&accumulated_err, ioerr); + } + list_del(&oir->err_list); + objio_free_result(oir); + } + + pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); +} + +void +objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct objlayout_io_res *oir, *tmp; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + spin_lock(&objlay->lock); + + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { + __be32 *last_xdr = NULL, *p; + unsigned i; + int res = 0; + + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + dprintk("%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + p = pnfs_osd_xdr_ioerr_reserve_space(xdr); + if (unlikely(!p)) { + res = -E2BIG; + break; /* accumulated_error */ + } + + last_xdr = p; + pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); + } + + /* TODO: use xdr_write_pages */ + if (unlikely(res)) { + /* no space for even one error descriptor */ + BUG_ON(!last_xdr); + + /* we've encountered a situation with lots and lots of + * errors and no space to encode them all. Use the last + * available slot to report the union of all the + * remaining errors. 
+ */ + encode_accumulated_error(objlay, last_xdr); + goto loop_done; + } + list_del(&oir->err_list); + objio_free_result(oir); + } +loop_done: + spin_unlock(&objlay->lock); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + + +/* + * Get Device Info API for io engines + */ +struct objlayout_deviceinfo { + struct page *page; + struct pnfs_osd_deviceaddr da; /* This must be last */ +}; + +/* Initialize and call nfs_getdeviceinfo, then decode and return a + * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() + * should be called. + */ +int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags) +{ + struct objlayout_deviceinfo *odi; + struct pnfs_device pd; + struct page *page, **pages; + u32 *p; + int err; + + page = alloc_page(gfp_flags); + if (!page) + return -ENOMEM; + + pages = &page; + pd.pages = pages; + + memcpy(&pd.dev_id, d_id, sizeof(*d_id)); + pd.layout_type = LAYOUT_OSD2_OBJECTS; + pd.pages = &page; + pd.pgbase = 0; + pd.pglen = PAGE_SIZE; + pd.mincount = 0; + pd.maxcount = PAGE_SIZE; + + err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd, + pnfslay->plh_lc_cred); + dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); + if (err) + goto err_out; + + p = page_address(page); + odi = kzalloc(sizeof(*odi), gfp_flags); + if (!odi) { + err = -ENOMEM; + goto err_out; + } + pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); + odi->page = page; + *deviceaddr = &odi->da; + return 0; + +err_out: + __free_page(page); + return err; +} + +void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) +{ + struct objlayout_deviceinfo *odi = container_of(deviceaddr, + struct objlayout_deviceinfo, + da); + + __free_page(odi->page); + kfree(odi); +} + +enum { + OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, + OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, + 
OSD_LOGIN_UPCALL_PATHLEN = 256 +}; + +static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; + +module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), + 0600); +MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); + +struct __auto_login { + char uri[OBJLAYOUT_MAX_URI_LEN]; + char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; + char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; +}; + +static int __objlayout_upcall(struct __auto_login *login) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[8]; + int ret; + + if (unlikely(!osd_login_prog[0])) { + dprintk("%s: osd_login_prog is disabled\n", __func__); + return -EACCES; + } + + dprintk("%s uri: %s\n", __func__, login->uri); + dprintk("%s osdname %s\n", __func__, login->osdname); + dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); + + argv[0] = (char *)osd_login_prog; + argv[1] = "-u"; + argv[2] = login->uri; + argv[3] = "-o"; + argv[4] = login->osdname; + argv[5] = "-s"; + argv[6] = login->systemid_hex; + argv[7] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the objlayoutdriver.osd_login_prog module parameter once + * the problem has been fixed. 
+ */ + if (ret == -ENOENT || ret == -EACCES) { + printk(KERN_ERR "PNFS-OBJ: %s was not found please set " + "objlayoutdriver.osd_login_prog kernel parameter!\n", + osd_login_prog); + osd_login_prog[0] = '\0'; + } + dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); + + return ret; +} + +/* + * Copy at most max_len - 1 bytes of @s into @dest, warning (rate limited) + * when the string had to be truncated. + * Assume dest is all zeros, which is what provides the terminator. + */ +static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, + char *dest, int max_len, + const char *var_name) +{ + if (!s.len) + return; + + if (s.len >= max_len) { + pr_warn_ratelimited( + "objlayout_autologin: %s: s.len(%d) >= max_len(%d)\n", + var_name, s.len, max_len); + s.len = max_len - 1; /* space for null terminator */ + } + + memcpy(dest, s.data, s.len); +} + +/* + * Hex-encode up to OSD_SYSTEMID_LEN bytes of @s into @sysid, warning + * (rate limited) when the length is not exactly OSD_SYSTEMID_LEN. + * Assume sysid is all zeros + */ +static void _sysid_2_hex(struct nfs4_string s, + char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) +{ + int i; + char *cur; + + if (!s.len) + return; + + if (s.len != OSD_SYSTEMID_LEN) { + pr_warn_ratelimited( + "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN\n", + s.len); + if (s.len > OSD_SYSTEMID_LEN) + s.len = OSD_SYSTEMID_LEN; + } + + cur = sysid; + for (i = 0; i < s.len; i++) + cur = hex_byte_pack(cur, s.data[i]); +} + +/* + * Build the uri/osdname/systemid arguments from @deviceaddr and run the + * osd_login upcall. Returns -ENODEV when no network address is present + * or when the upcall returns a positive value, otherwise the upcall's + * return value. + */ +int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) +{ + int rc; + struct __auto_login login; + + if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) + return -ENODEV; + + memset(&login, 0, sizeof(login)); + __copy_nfsS_and_zero_terminate( + deviceaddr->oda_targetaddr.ota_netaddr.r_addr, + login.uri, sizeof(login.uri), "URI"); + + __copy_nfsS_and_zero_terminate( + deviceaddr->oda_osdname, + login.osdname, sizeof(login.osdname), "OSDNAME"); + + _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); + + rc = __objlayout_upcall(&login); + if (rc > 0) /* script returns positive values */ + rc = -ENODEV; + + return rc; +} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h new file mode 100644 index 00000000000..01e041029a6 --- /dev/null +++ b/fs/nfs/objlayout/objlayout.h @@
-0,0 +1,189 @@ +/* + * Data types and function declerations for interfacing with the + * pNFS standard object layout driver. + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. + * + * Benny Halevy <bhalevy@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _OBJLAYOUT_H +#define _OBJLAYOUT_H + +#include <linux/nfs_fs.h> +#include <linux/pnfs_osd_xdr.h> +#include "../pnfs.h" + +/* + * per-inode layout + */ +struct objlayout { + struct pnfs_layout_hdr pnfs_layout; + + /* for layout_commit */ + enum osd_delta_space_valid_enum { + OBJ_DSU_INIT = 0, + OBJ_DSU_VALID, + OBJ_DSU_INVALID, + } delta_space_valid; + s64 delta_space_used; /* consumed by write ops */ + + /* for layout_return */ + spinlock_t lock; + struct list_head err_list; +}; + +static inline struct objlayout * +OBJLAYOUT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct objlayout, pnfs_layout); +} + +/* + * per-I/O operation state + * embedded in objects provider io_state data structure + */ +struct objlayout_io_res { + struct objlayout *objlay; + + void *rpcdata; + int status; /* res */ + int committed; /* res */ + + /* Error reporting (layout_return) */ + struct list_head err_list; + unsigned num_comps; + /* Pointer to array of error descriptors of size num_comps. + * It should contain as many entries as devices in the osd_layout + * that participate in the I/O. It is up to the io_engine to allocate + * needed space and set num_comps. 
+ */ + struct pnfs_osd_ioerr *ioerrs; +}; + +static inline +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, + struct pnfs_osd_ioerr *ioerrs, void *rpcdata, + struct pnfs_layout_hdr *pnfs_layout_type) +{ + oir->objlay = OBJLAYOUT(pnfs_layout_type); + oir->rpcdata = rpcdata; + INIT_LIST_HEAD(&oir->err_list); + oir->num_comps = num_comps; + oir->ioerrs = ioerrs; +} + +/* + * Raid engine I/O API + */ +extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_range *range, + struct xdr_stream *xdr, + gfp_t gfp_flags); +extern void objio_free_lseg(struct pnfs_layout_segment *lseg); + +/* objio_free_result will free these @oir structs received from + * objlayout_{read,write}_done + */ +extern void objio_free_result(struct objlayout_io_res *oir); + +extern int objio_read_pagelist(struct nfs_pgio_data *rdata); +extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); + +/* + * callback API + */ +extern void objlayout_io_set_result(struct objlayout_io_res *oir, + unsigned index, struct pnfs_osd_objid *pooid, + int osd_error, u64 offset, u64 length, bool is_write); + +static inline void +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) +{ + /* If one of the I/Os errored out and the delta_space_used was + * invalid we render the complete report as invalid. Protocol mandate + * the DSU be accurate or not reported. 
+ */ + spin_lock(&objlay->lock); + if (objlay->delta_space_valid != OBJ_DSU_INVALID) { + objlay->delta_space_valid = OBJ_DSU_VALID; + objlay->delta_space_used += space_used; + } + spin_unlock(&objlay->lock); +} + +extern void objlayout_read_done(struct objlayout_io_res *oir, + ssize_t status, bool sync); +extern void objlayout_write_done(struct objlayout_io_res *oir, + ssize_t status, bool sync); + +extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, + struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, + gfp_t gfp_flags); +extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); + +/* + * exported generic objects function vectors + */ + +extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); +extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); + +extern struct pnfs_layout_segment *objlayout_alloc_lseg( + struct pnfs_layout_hdr *, + struct nfs4_layoutget_res *, + gfp_t gfp_flags); +extern void objlayout_free_lseg(struct pnfs_layout_segment *); + +extern enum pnfs_try_status objlayout_read_pagelist( + struct nfs_pgio_data *); + +extern enum pnfs_try_status objlayout_write_pagelist( + struct nfs_pgio_data *, + int how); + +extern void objlayout_encode_layoutcommit( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutcommit_args *); + +extern void objlayout_encode_layoutreturn( + struct pnfs_layout_hdr *, + struct xdr_stream *, + const struct nfs4_layoutreturn_args *); + +extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); + +#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c new file mode 100644 index 00000000000..b3918f7ac34 --- /dev/null +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -0,0 +1,415 @@ +/* + * Object-Based pNFS Layout XDR layer + * + * Copyright (C) 2007 Panasas Inc. [year of first publication] + * All rights reserved. 
+ * + * Benny Halevy <bhalevy@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <linux/pnfs_osd_xdr.h> + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * The following implementation is based on RFC5664 + */ + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static __be32 * +_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) +{ + p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, + sizeof(objid->oid_device_id.data)); + + p = xdr_decode_hyper(p, &objid->oid_partition_id); + p = xdr_decode_hyper(p, &objid->oid_object_id); + return p; +} +/* + * struct pnfs_osd_opaque_cred { + * u32 cred_len; + * void *cred; + * }; // xdr size [variable] + * The return pointers are from the xdr buffer + */ +static int +_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 1); + + if (!p) + return -EINVAL; + + opaque_cred->cred_len = be32_to_cpu(*p++); + + p = xdr_inline_decode(xdr, opaque_cred->cred_len); + if (!p) + return -EINVAL; + + opaque_cred->cred = p; + return 0; +} + +/* + * struct pnfs_osd_object_cred { + * struct pnfs_osd_objid oc_object_id; + * u32 oc_osd_version; + * u32 oc_cap_key_sec; + * struct pnfs_osd_opaque_cred oc_cap_key + * struct pnfs_osd_opaque_cred oc_cap; + * }; // xdr size 32 + 4 + 4 + [variable] + [variable] + */ +static int +_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, + struct xdr_stream *xdr) +{ + __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); + int ret; + + if (!p) + return -EIO; + + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p); + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); + if (unlikely(ret)) + return ret; + + ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); + return ret; +} + +/* + * struct pnfs_osd_data_map { + * u32 odm_num_comps; + * u64 odm_stripe_unit; + * u32 
odm_group_width; + * u32 odm_group_depth; + * u32 odm_mirror_cnt; + * u32 odm_raid_algorithm; + * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 + */ +static inline int +_osd_data_map_xdr_sz(void) +{ + return 4 + 8 + 4 + 4 + 4 + 4; +} + +static __be32 * +_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) +{ + data_map->odm_num_comps = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); + data_map->odm_group_width = be32_to_cpup(p++); + data_map->odm_group_depth = be32_to_cpup(p++); + data_map->odm_mirror_cnt = be32_to_cpup(p++); + data_map->odm_raid_algorithm = be32_to_cpup(p++); + dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " + "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", + __func__, + data_map->odm_num_comps, + (unsigned long long)data_map->odm_stripe_unit, + data_map->odm_group_width, + data_map->odm_group_depth, + data_map->odm_mirror_cnt, + data_map->odm_raid_algorithm); + return p; +} + +int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) +{ + __be32 *p; + + memset(iter, 0, sizeof(*iter)); + + p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); + if (unlikely(!p)) + return -EINVAL; + + p = _osd_xdr_decode_data_map(p, &layout->olo_map); + layout->olo_comps_index = be32_to_cpup(p++); + layout->olo_num_comps = be32_to_cpup(p++); + dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, + layout->olo_comps_index, layout->olo_num_comps); + + iter->total_comps = layout->olo_num_comps; + return 0; +} + +bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, + struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, + int *err) +{ + BUG_ON(iter->decoded_comps > iter->total_comps); + if (iter->decoded_comps == iter->total_comps) + return false; + + *err = _osd_xdr_decode_object_cred(comp, xdr); + if (unlikely(*err)) { + dprintk("%s: 
_osd_xdr_decode_object_cred=>%d decoded_comps=%d " + "total_comps=%d\n", __func__, *err, + iter->decoded_comps, iter->total_comps); + return false; /* stop the loop */ + } + dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " + "key_len=%u cap_len=%u\n", + __func__, + _DEVID_LO(&comp->oc_object_id.oid_device_id), + _DEVID_HI(&comp->oc_object_id.oid_device_id), + comp->oc_object_id.oid_partition_id, + comp->oc_object_id.oid_object_id, + comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); + + iter->decoded_comps++; + return true; +} + +/* + * Get Device Information Decoding + * + * Note: since Device Information is currently done synchronously, all + * variable strings fields are left inside the rpc buffer and are only + * pointed to by the pnfs_osd_deviceaddr members. So the read buffer + * should not be freed while the returned information is in use. + */ +/* + *struct nfs4_string { + * unsigned int len; + * char *data; + *}; // size [variable] + * NOTE: Returned string points to inside the XDR buffer + */ +static __be32 * +__read_u8_opaque(__be32 *p, struct nfs4_string *str) +{ + str->len = be32_to_cpup(p++); + str->data = (char *)p; + + p += XDR_QUADLEN(str->len); + return p; +} + +/* + * struct pnfs_osd_targetid { + * u32 oti_type; + * struct nfs4_string oti_scsi_device_id; + * };// size 4 + [variable] + */ +static __be32 * +__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) +{ + u32 oti_type; + + oti_type = be32_to_cpup(p++); + targetid->oti_type = oti_type; + + switch (oti_type) { + case OBJ_TARGET_SCSI_NAME: + case OBJ_TARGET_SCSI_DEVICE_ID: + p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); + } + + return p; +} + +/* + * struct pnfs_osd_net_addr { + * struct nfs4_string r_netid; + * struct nfs4_string r_addr; + * }; + */ +static __be32 * +__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) +{ + p = __read_u8_opaque(p, &netaddr->r_netid); + p = __read_u8_opaque(p, &netaddr->r_addr); + + return p; +} + +/* + * struct 
pnfs_osd_targetaddr { + * u32 ota_available; + * struct pnfs_osd_net_addr ota_netaddr; + * }; + */ +static __be32 * +__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) +{ + u32 ota_available; + + ota_available = be32_to_cpup(p++); + targetaddr->ota_available = ota_available; + + if (ota_available) + p = __read_net_addr(p, &targetaddr->ota_netaddr); + + + return p; +} + +/* + * struct pnfs_osd_deviceaddr { + * struct pnfs_osd_targetid oda_targetid; + * struct pnfs_osd_targetaddr oda_targetaddr; + * u8 oda_lun[8]; + * struct nfs4_string oda_systemid; + * struct pnfs_osd_object_cred oda_root_obj_cred; + * struct nfs4_string oda_osdname; + * }; + */ + +/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does + * not have an xdr_stream + */ +static __be32 * +__read_opaque_cred(__be32 *p, + struct pnfs_osd_opaque_cred *opaque_cred) +{ + opaque_cred->cred_len = be32_to_cpu(*p++); + opaque_cred->cred = p; + return p + XDR_QUADLEN(opaque_cred->cred_len); +} + +static __be32 * +__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) +{ + p = _osd_xdr_decode_objid(p, &comp->oc_object_id); + comp->oc_osd_version = be32_to_cpup(p++); + comp->oc_cap_key_sec = be32_to_cpup(p++); + + p = __read_opaque_cred(p, &comp->oc_cap_key); + p = __read_opaque_cred(p, &comp->oc_cap); + return p; +} + +void pnfs_osd_xdr_decode_deviceaddr( + struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) +{ + p = __read_targetid(p, &deviceaddr->oda_targetid); + + p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); + + p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, + sizeof(deviceaddr->oda_lun)); + + p = __read_u8_opaque(p, &deviceaddr->oda_systemid); + + p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); + + p = __read_u8_opaque(p, &deviceaddr->oda_osdname); + + /* libosd likes this terminated in dbg. 
It's last, so no problems */ + deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; +} + +/* + * struct pnfs_osd_layoutupdate { + * u32 dsu_valid; + * s64 dsu_delta; + * u32 olu_ioerr_flag; + * }; xdr size 4 + 8 + 4 + */ +int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, + struct pnfs_osd_layoutupdate *lou) +{ + __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); + + if (!p) + return -E2BIG; + + *p++ = cpu_to_be32(lou->dsu_valid); + if (lou->dsu_valid) + p = xdr_encode_hyper(p, lou->dsu_delta); + *p++ = cpu_to_be32(lou->olu_ioerr_flag); + return 0; +} + +/* + * struct pnfs_osd_objid { + * struct nfs4_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; // xdr size 32 bytes + */ +static inline __be32 * +pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) +{ + p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, + sizeof(object_id->oid_device_id.data)); + p = xdr_encode_hyper(p, object_id->oid_partition_id); + p = xdr_encode_hyper(p, object_id->oid_object_id); + + return p; +} + +/* + * struct pnfs_osd_ioerr { + * struct pnfs_osd_objid oer_component; + * u64 oer_comp_offset; + * u64 oer_comp_length; + * u32 oer_iswrite; + * u32 oer_errno; + * }; // xdr size 32 + 24 bytes + */ +void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) +{ + p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); + p = xdr_encode_hyper(p, ioerr->oer_comp_offset); + p = xdr_encode_hyper(p, ioerr->oer_comp_length); + *p++ = cpu_to_be32(ioerr->oer_iswrite); + *p = cpu_to_be32(ioerr->oer_errno); +} + +__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 32 + 24); + if (unlikely(!p)) + dprintk("%s: out of xdr space\n", __func__); + + return p; +} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7f079209d70..17fab89f635 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -13,25 +13,71 @@ #include <linux/file.h> #include 
<linux/sched.h> #include <linux/sunrpc/clnt.h> +#include <linux/nfs.h> #include <linux/nfs3.h> #include <linux/nfs4.h> #include <linux/nfs_page.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> +#include <linux/export.h> #include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PAGECACHE static struct kmem_cache *nfs_page_cachep; +static const struct rpc_call_ops nfs_pgio_common_ops; + +static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +{ + p->npages = pagecount; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + if (!p->pagevec) + p->npages = 0; + } + return p->pagevec != NULL; +} + +void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, + void (*release)(struct nfs_pgio_header *hdr)) +{ + hdr->req = nfs_list_entry(desc->pg_list.next); + hdr->inode = desc->pg_inode; + hdr->cred = hdr->req->wb_context->cred; + hdr->io_start = req_offset(hdr->req); + hdr->good_bytes = desc->pg_count; + hdr->dreq = desc->pg_dreq; + hdr->layout_private = desc->pg_layout_private; + hdr->release = release; + hdr->completion_ops = desc->pg_completion_ops; + if (hdr->completion_ops->init_hdr) + hdr->completion_ops->init_hdr(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgheader_init); + +void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) +{ + spin_lock(&hdr->lock); + if (pos < hdr->io_start + hdr->good_bytes) { + set_bit(NFS_IOHDR_ERROR, &hdr->flags); + clear_bit(NFS_IOHDR_EOF, &hdr->flags); + hdr->good_bytes = pos - hdr->io_start; + hdr->error = error; + } + spin_unlock(&hdr->lock); +} static inline struct nfs_page * nfs_page_alloc(void) { - struct nfs_page *p; - p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); - if (p) { - memset(p, 0, sizeof(*p)); + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); + if (p) INIT_LIST_HEAD(&p->wb_list); - } return p; } @@ -41,11 
+87,217 @@ nfs_page_free(struct nfs_page *p) kmem_cache_free(nfs_page_cachep, p); } +static void +nfs_iocounter_inc(struct nfs_io_counter *c) +{ + atomic_inc(&c->io_count); +} + +static void +nfs_iocounter_dec(struct nfs_io_counter *c) +{ + if (atomic_dec_and_test(&c->io_count)) { + clear_bit(NFS_IO_INPROGRESS, &c->flags); + smp_mb__after_atomic(); + wake_up_bit(&c->flags, NFS_IO_INPROGRESS); + } +} + +static int +__nfs_iocounter_wait(struct nfs_io_counter *c) +{ + wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); + DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); + int ret = 0; + + do { + prepare_to_wait(wq, &q.wait, TASK_KILLABLE); + set_bit(NFS_IO_INPROGRESS, &c->flags); + if (atomic_read(&c->io_count) == 0) + break; + ret = nfs_wait_bit_killable(&c->flags); + } while (atomic_read(&c->io_count) != 0); + finish_wait(wq, &q.wait); + return ret; +} + +/** + * nfs_iocounter_wait - wait for i/o to complete + * @c: nfs_io_counter to use + * + * returns -ERESTARTSYS if interrupted by a fatal signal. + * Otherwise returns 0 once the io_count hits 0. 
+ */ +int +nfs_iocounter_wait(struct nfs_io_counter *c) +{ + if (atomic_read(&c->io_count) == 0) + return 0; + return __nfs_iocounter_wait(c); +} + +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req - request in group that is to be locked + * + * this lock must be held if modifying the page group list + */ +void +nfs_page_group_lock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req - request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + smp_mb__before_atomic(); + clear_bit(PG_HEADLOCK, &head->wb_flags); + smp_mb__after_atomic(); + wake_up_bit(&head->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_sync_on_bit_locked + * + * must be called with page group lock held + */ +static bool +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +{ + struct nfs_page *head = req->wb_head; + struct nfs_page *tmp; + + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); + + tmp = req->wb_this_page; + while (tmp != req) { + if (!test_bit(bit, &tmp->wb_flags)) + return false; + tmp = tmp->wb_this_page; + } + + /* true! 
reset all bits */ + tmp = req; + do { + clear_bit(bit, &tmp->wb_flags); + tmp = tmp->wb_this_page; + } while (tmp != req); + + return true; +} + +/* + * nfs_page_group_sync_on_bit - set bit on current request, but only + * return true if the bit is set for all requests in page group + * @req - request in page group + * @bit - PG_* bit that is used to sync page group + */ +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) +{ + bool ret; + + nfs_page_group_lock(req); + ret = nfs_page_group_sync_on_bit_locked(req, bit); + nfs_page_group_unlock(req); + + return ret; +} + +/* + * nfs_page_group_init - Initialize the page group linkage for @req + * @req - a new nfs request + * @prev - the previous request in page group, or NULL if @req is the first + * or only request in the group (the head). + */ +static inline void +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) +{ + WARN_ON_ONCE(prev == req); + + if (!prev) { + /* a head request */ + req->wb_head = req; + req->wb_this_page = req; + } else { + /* a subrequest */ + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); + req->wb_head = prev->wb_head; + req->wb_this_page = prev->wb_this_page; + prev->wb_this_page = req; + + /* All subrequests take a ref on the head request until + * nfs_page_group_destroy is called */ + kref_get(&req->wb_head->wb_kref); + + /* grab extra ref if head request has extra ref from + * the write/commit path to handle handoff between write + * and commit lists */ + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { + set_bit(PG_INODE_REF, &req->wb_flags); + kref_get(&req->wb_kref); + } + } +} + +/* + * nfs_page_group_destroy - sync the destruction of page groups + * @req - request that no longer needs the page group + * + * releases the page group reference from each member once all + * members have called this function. 
+ */ +static void +nfs_page_group_destroy(struct kref *kref) +{ + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *tmp, *next; + + /* subrequests must release the ref on the head request */ + if (req->wb_head != req) + nfs_release_request(req->wb_head); + + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) + return; + + tmp = req; + do { + next = tmp->wb_this_page; + /* unlink and free */ + tmp->wb_this_page = tmp; + tmp->wb_head = tmp; + nfs_free_request(tmp); + tmp = next; + } while (tmp != req); +} + /** * nfs_create_request - Create an NFS read/write request. - * @file: file descriptor to use - * @inode: inode to which the request is attached + * @ctx: open context to use * @page: page to write + * @last: last nfs request created for this page group or NULL if head * @offset: starting offset within the page for the write * @count: number of bytes to read/write * @@ -54,38 +306,41 @@ nfs_page_free(struct nfs_page *p) * User should ensure it is safe to sleep in this function. 
*/ struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, - struct page *page, - unsigned int offset, unsigned int count) +nfs_create_request(struct nfs_open_context *ctx, struct page *page, + struct nfs_page *last, unsigned int offset, + unsigned int count) { struct nfs_page *req; + struct nfs_lock_context *l_ctx; - for (;;) { - /* try to allocate the request struct */ - req = nfs_page_alloc(); - if (req != NULL) - break; + if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) + return ERR_PTR(-EBADF); + /* try to allocate the request struct */ + req = nfs_page_alloc(); + if (req == NULL) + return ERR_PTR(-ENOMEM); - if (fatal_signal_pending(current)) - return ERR_PTR(-ERESTARTSYS); - yield(); + /* get lock context early so we can deal with alloc failures */ + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) { + nfs_page_free(req); + return ERR_CAST(l_ctx); } + req->wb_lock_context = l_ctx; + nfs_iocounter_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. 
*/ req->wb_page = page; - atomic_set(&req->wb_complete, 0); - req->wb_index = page->index; + req->wb_index = page_file_index(page); page_cache_get(page); - BUG_ON(PagePrivate(page)); - BUG_ON(!PageLocked(page)); - BUG_ON(page->mapping->host != inode); req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); + nfs_page_group_init(req, last); return req; } @@ -99,92 +354,75 @@ void nfs_unlock_request(struct nfs_page *req) printk(KERN_ERR "NFS: Invalid unlock attempted\n"); BUG(); } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(PG_BUSY, &req->wb_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&req->wb_flags, PG_BUSY); - nfs_release_request(req); } /** - * nfs_set_page_tag_locked - Tag a request as locked + * nfs_unlock_and_release_request - Unlock request and release the nfs_page * @req: */ -int nfs_set_page_tag_locked(struct nfs_page *req) +void nfs_unlock_and_release_request(struct nfs_page *req) { - struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode); - - if (!nfs_lock_request_dontget(req)) - return 0; - if (req->wb_page != NULL) - radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - return 1; -} - -/** - * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers - */ -void nfs_clear_page_tag_locked(struct nfs_page *req) -{ - struct inode *inode = req->wb_context->path.dentry->d_inode; - struct nfs_inode *nfsi = NFS_I(inode); - - if (req->wb_page != NULL) { - spin_lock(&inode->i_lock); - radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - nfs_unlock_request(req); - spin_unlock(&inode->i_lock); - } else - nfs_unlock_request(req); + nfs_unlock_request(req); + nfs_release_request(req); } -/** +/* * nfs_clear_request - Free up all resources allocated to the request * @req: * - * Release page resources associated with a write request after it - * has 
completed. + * Release page and open context resources associated with a read/write + * request after it has completed. */ -void nfs_clear_request(struct nfs_page *req) +static void nfs_clear_request(struct nfs_page *req) { struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; + struct nfs_lock_context *l_ctx = req->wb_lock_context; + if (page != NULL) { page_cache_release(page); req->wb_page = NULL; } + if (l_ctx != NULL) { + nfs_iocounter_dec(&l_ctx->io_count); + nfs_put_lock_context(l_ctx); + req->wb_lock_context = NULL; + } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } } - /** * nfs_release_request - Release the count on an NFS read/write request * @req: request to release * * Note: Should never be called with the spinlock held! */ -static void nfs_free_request(struct kref *kref) +void nfs_free_request(struct nfs_page *req) { - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + WARN_ON_ONCE(req->wb_this_page != req); + + /* extra debug: make sure no sync bits are still set */ + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags)); - /* Release struct file or cached credential */ + /* Release struct file and open context */ nfs_clear_request(req); - put_nfs_open_context(req->wb_context); nfs_page_free(req); } void nfs_release_request(struct nfs_page *req) { - kref_put(&req->wb_kref, nfs_free_request); -} - -static int nfs_wait_bit_killable(void *word) -{ - int ret = 0; - - if (fatal_signal_pending(current)) - ret = -ERESTARTSYS; - else - schedule(); - return ret; + kref_put(&req->wb_kref, nfs_page_group_destroy); } /** @@ -197,15 +435,253 @@ static int nfs_wait_bit_killable(void *word) int nfs_wait_on_request(struct nfs_page *req) { + return 
wait_on_bit(&req->wb_flags, PG_BUSY, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_generic_pg_test - determine if requests can be coalesced + * @desc: pointer to descriptor + * @prev: previous request in desc, or NULL + * @req: this request + * + * Returns zero if @req can be coalesced into @desc, otherwise it returns + * the size of the request. + */ +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, + struct nfs_page *prev, struct nfs_page *req) +{ + if (desc->pg_count > desc->pg_bsize) { + /* should never happen */ + WARN_ON_ONCE(1); + return 0; + } + + return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); +} +EXPORT_SYMBOL_GPL(nfs_generic_pg_test); + +static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) +{ + return container_of(hdr, struct nfs_rw_header, header); +} + +/** + * nfs_rw_header_alloc - Allocate a header for a read or write + * @ops: Read or write function vector + */ +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) +{ + struct nfs_rw_header *header = ops->rw_alloc_header(); + + if (header) { + struct nfs_pgio_header *hdr = &header->header; + + INIT_LIST_HEAD(&hdr->pages); + spin_lock_init(&hdr->lock); + atomic_set(&hdr->refcnt, 0); + hdr->rw_ops = ops; + } + return header; +} +EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); + +/* + * nfs_rw_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_rw_header_free(struct nfs_pgio_header *hdr) +{ + hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); +} +EXPORT_SYMBOL_GPL(nfs_rw_header_free); + +/** + * nfs_pgio_data_alloc - Allocate pageio data + * @hdr: The header making a request + * @pagecount: Number of pages to create + */ +static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, + unsigned int pagecount) +{ + struct nfs_pgio_data *data, *prealloc; + + prealloc = &NFS_RW_HEADER(hdr)->rpc_data; + if (prealloc->header == NULL) + data = prealloc; + else + 
data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + if (nfs_pgarray_set(&data->pages, pagecount)) { + data->header = hdr; + atomic_inc(&hdr->refcnt); + } else { + if (data != prealloc) + kfree(data); + data = NULL; + } +out: + return data; +} + +/** + * nfs_pgio_data_release - Properly free pageio data + * @data: The data to release + */ +void nfs_pgio_data_release(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); + + put_nfs_open_context(data->args.context); + if (data->pages.pagevec != data->pages.page_array) + kfree(data->pages.pagevec); + if (data == &pageio_header->rpc_data) { + data->header = NULL; + data = NULL; + } + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(data); +} +EXPORT_SYMBOL_GPL(nfs_pgio_data_release); + +/** + * nfs_pgio_rpcsetup - Set up arguments for a pageio call + * @data: The pageio data + * @count: Number of bytes to read + * @offset: Initial offset + * @how: How to commit data (writes only) + * @cinfo: Commit information for the call (writes only) + */ +static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, + unsigned int count, unsigned int offset, + int how, struct nfs_commit_info *cinfo) +{ + struct nfs_page *req = data->header->req; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. 
*/ + + data->args.fh = NFS_FH(data->header->inode); + data->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + data->mds_offset = data->args.offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pages.pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + case 0: + break; + case FLUSH_COND_STABLE: + if (nfs_reqs_to_commit(cinfo)) + break; + default: + data->args.stable = NFS_FILE_SYNC; + } + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); +} + +/** + * nfs_pgio_prepare - Prepare pageio data to go over the wire + * @task: The current task + * @calldata: pageio data to prepare + */ +static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + int err; + err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); + if (err) + rpc_exit(task, err); +} + +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, + const struct rpc_call_ops *call_ops, int how, int flags) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->header->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | flags, + }; int ret = 0; - if (!test_bit(PG_BUSY, &req->wb_flags)) + data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); + + dprintk("NFS: %5u initiated pgio call " + "(req %s/%llu, %u bytes @ offset %llu)\n", + data->task.tk_pid, + data->header->inode->i_sb->s_id, + (unsigned long 
long)NFS_FILEID(data->header->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); goto out; - ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY, - nfs_wait_bit_killable, TASK_KILLABLE); + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); out: return ret; } +EXPORT_SYMBOL_GPL(nfs_initiate_pgio); + +/** + * nfs_pgio_error - Clean up from a pageio error + * @desc: IO descriptor + * @hdr: pageio header + */ +static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + set_bit(NFS_IOHDR_REDO, &hdr->flags); + nfs_pgio_data_release(hdr->data); + hdr->data = NULL; + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; +} + +/** + * nfs_pgio_release - Release pageio data + * @calldata: The pageio data to release + */ +static void nfs_pgio_release(void *calldata) +{ + struct nfs_pgio_data *data = calldata; + if (data->header->rw_ops->rw_release) + data->header->rw_ops->rw_release(data); + nfs_pgio_data_release(data); +} /** * nfs_pageio_init - initialise a page io descriptor @@ -217,7 +693,9 @@ out: */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), + const struct nfs_pageio_ops *pg_ops, + const struct nfs_pgio_completion_ops *compl_ops, + const struct nfs_rw_ops *rw_ops, size_t bsize, int io_flags) { @@ -226,10 +704,119 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_count = 0; desc->pg_bsize = bsize; desc->pg_base = 0; + desc->pg_moreio = 0; + desc->pg_recoalesce = 0; desc->pg_inode = inode; - desc->pg_doio = doio; + desc->pg_ops = pg_ops; + desc->pg_completion_ops = compl_ops; + desc->pg_rw_ops = rw_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; + desc->pg_lseg = NULL; + desc->pg_dreq = 
NULL; + desc->pg_layout_private = NULL; +} +EXPORT_SYMBOL_GPL(nfs_pageio_init); + +/** + * nfs_pgio_result - Basic pageio error handling + * @task: The task that ran + * @calldata: Pageio data to check + */ +static void nfs_pgio_result(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + struct inode *inode = data->header->inode; + + dprintk("NFS: %s: %5u, (status %d)\n", __func__, + task->tk_pid, task->tk_status); + + if (data->header->rw_ops->rw_done(task, data, inode) != 0) + return; + if (task->tk_status < 0) + nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); + else + data->header->rw_ops->rw_result(task, data); +} + +/* + * Create an RPC task for the given read or write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. + */ +int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + struct nfs_page *req; + struct page **pages; + struct nfs_pgio_data *data; + struct list_head *head = &desc->pg_list; + struct nfs_commit_info cinfo; + + data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) + return nfs_pgio_error(desc, hdr); + + nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); + pages = data->pages.pagevec; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &hdr->pages); + *pages++ = req->wb_page; + } + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + /* Set up the argument struct */ + nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); + hdr->data = data; + desc->pg_rpc_callops = &nfs_pgio_common_ops; + return 0; +} 
+EXPORT_SYMBOL_GPL(nfs_generic_pgio); + +static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) +{ + struct nfs_rw_header *rw_hdr; + struct nfs_pgio_header *hdr; + int ret; + + rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); + if (!rw_hdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; + } + hdr = &rw_hdr->header; + nfs_pgheader_init(desc, hdr, nfs_rw_header_free); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_pgio(desc, hdr); + if (ret == 0) + ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), + hdr->data, desc->pg_rpc_callops, + desc->pg_ioflags, 0); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + return ret; +} + +static bool nfs_match_open_context(const struct nfs_open_context *ctx1, + const struct nfs_open_context *ctx2) +{ + return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + +static bool nfs_match_lock_context(const struct nfs_lock_context *l1, + const struct nfs_lock_context *l2) +{ + return l1->lockowner.l_owner == l2->lockowner.l_owner + && l1->lockowner.l_pid == l2->lockowner.l_pid; } /** @@ -243,22 +830,27 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, * * Return 'true' if this is the case, else return 'false'. 
*/ -static int nfs_can_coalesce_requests(struct nfs_page *prev, - struct nfs_page *req) +static bool nfs_can_coalesce_requests(struct nfs_page *prev, + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) { - if (req->wb_context->cred != prev->wb_context->cred) - return 0; - if (req->wb_context->lockowner != prev->wb_context->lockowner) - return 0; - if (req->wb_context->state != prev->wb_context->state) - return 0; - if (req->wb_index != (prev->wb_index + 1)) - return 0; - if (req->wb_pgbase != 0) - return 0; - if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) - return 0; - return 1; + size_t size; + + if (prev) { + if (!nfs_match_open_context(req->wb_context, prev->wb_context)) + return false; + if (req->wb_context->dentry->d_inode->i_flock != NULL && + !nfs_match_lock_context(req->wb_lock_context, + prev->wb_lock_context)) + return false; + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; + } + size = pgio->pg_ops->pg_test(pgio, prev, req); + WARN_ON_ONCE(size > req->wb_bytes); + if (size && size < req->wb_bytes) + req->wb_bytes = size; + return size > 0; } /** @@ -272,31 +864,19 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { - size_t newlen = req->wb_bytes; - + struct nfs_page *prev = NULL; if (desc->pg_count != 0) { - struct nfs_page *prev; - - /* - * FIXME: ideally we should be able to coalesce all requests - * that are not block boundary aligned, but currently this - * is problematic for the case of bsize < PAGE_CACHE_SIZE, - * since nfs_flush_multi and nfs_pagein_multi assume you - * can have only one struct nfs_page. 
- */ - if (desc->pg_bsize < PAGE_SIZE) - return 0; - newlen += desc->pg_count; - if (newlen > desc->pg_bsize) - return 0; prev = nfs_list_entry(desc->pg_list.prev); - if (!nfs_can_coalesce_requests(prev, req)) - return 0; - } else + } else { + if (desc->pg_ops->pg_init) + desc->pg_ops->pg_init(desc, req); desc->pg_base = req->wb_pgbase; + } + if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; nfs_list_remove_request(req); nfs_list_add_request(req, &desc->pg_list); - desc->pg_count = newlen; + desc->pg_count += req->wb_bytes; return 1; } @@ -306,12 +886,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { - int error = desc->pg_doio(desc->pg_inode, - &desc->pg_list, - nfs_page_array_len(desc->pg_base, - desc->pg_count), - desc->pg_count, - desc->pg_ioflags); + int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; else @@ -328,28 +903,133 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) * @desc: destination io descriptor * @req: request * + * This may split a request into subrequests which are all part of the + * same page group. + * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. 
*/ -int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { - while (!nfs_pageio_do_add_request(desc, req)) { - nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - } + struct nfs_page *subreq; + unsigned int bytes_left = 0; + unsigned int offset, pgbase; + + nfs_page_group_lock(req); + + subreq = req; + bytes_left = subreq->wb_bytes; + offset = subreq->wb_offset; + pgbase = subreq->wb_pgbase; + + do { + if (!nfs_pageio_do_add_request(desc, subreq)) { + /* make sure pg_test call(s) did nothing */ + WARN_ON_ONCE(subreq->wb_bytes != bytes_left); + WARN_ON_ONCE(subreq->wb_offset != offset); + WARN_ON_ONCE(subreq->wb_pgbase != pgbase); + + nfs_page_group_unlock(req); + desc->pg_moreio = 1; + nfs_pageio_doio(desc); + if (desc->pg_error < 0) + return 0; + if (desc->pg_recoalesce) + return 0; + /* retry add_request for this subreq */ + nfs_page_group_lock(req); + continue; + } + + /* check for buggy pg_test call(s) */ + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); + WARN_ON_ONCE(subreq->wb_bytes > bytes_left); + WARN_ON_ONCE(subreq->wb_bytes == 0); + + bytes_left -= subreq->wb_bytes; + offset += subreq->wb_bytes; + pgbase += subreq->wb_bytes; + + if (bytes_left) { + subreq = nfs_create_request(req->wb_context, + req->wb_page, + subreq, pgbase, bytes_left); + if (IS_ERR(subreq)) + goto err_ptr; + nfs_lock_request(subreq); + subreq->wb_offset = offset; + subreq->wb_index = req->wb_index; + } + } while (bytes_left > 0); + + nfs_page_group_unlock(req); return 1; +err_ptr: + desc->pg_error = PTR_ERR(subreq); + nfs_page_group_unlock(req); + return 0; } +static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + + do { + list_splice_init(&desc->pg_list, &head); + desc->pg_bytes_written -= desc->pg_count; + desc->pg_count = 0; + desc->pg_base = 0; + desc->pg_recoalesce = 0; + desc->pg_moreio = 0; + + while 
(!list_empty(&head)) { + struct nfs_page *req; + + req = list_first_entry(&head, struct nfs_page, wb_list); + nfs_list_remove_request(req); + if (__nfs_pageio_add_request(desc, req)) + continue; + if (desc->pg_error < 0) + return 0; + break; + } + } while (desc->pg_recoalesce); + return 1; +} + +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + int ret; + + do { + ret = __nfs_pageio_add_request(desc, req); + if (ret) + break; + if (desc->pg_error < 0) + break; + ret = nfs_do_recoalesce(desc); + } while (ret); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_pageio_add_request); + /** * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor * @desc: pointer to io descriptor */ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) { - nfs_pageio_doio(desc); + for (;;) { + nfs_pageio_doio(desc); + if (!desc->pg_recoalesce) + break; + if (!nfs_do_recoalesce(desc)) + break; + } } +EXPORT_SYMBOL_GPL(nfs_pageio_complete); /** * nfs_pageio_cond_complete - Conditional I/O completion @@ -367,68 +1047,8 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) if (!list_empty(&desc->pg_list)) { struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); if (index != prev->wb_index + 1) - nfs_pageio_doio(desc); - } -} - -#define NFS_SCAN_MAXENTRIES 16 -/** - * nfs_scan_list - Scan a list for matching requests - * @nfsi: NFS inode - * @dst: Destination list - * @idx_start: lower bound of page->index to scan - * @npages: idx_start + npages sets the upper bound to scan. - * @tag: tag to scan for - * - * Moves elements from one of the inode request lists. - * If the number of requests is set to 0, the entire address_space - * starting at index idx_start, is scanned. - * The requests are *not* checked to ensure that they form a contiguous set. 
- * You must be holding the inode's i_lock when calling this function - */ -int nfs_scan_list(struct nfs_inode *nfsi, - struct list_head *dst, pgoff_t idx_start, - unsigned int npages, int tag) -{ - struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; - struct nfs_page *req; - pgoff_t idx_end; - int found, i; - int res; - - res = 0; - if (npages == 0) - idx_end = ~0; - else - idx_end = idx_start + npages - 1; - - for (;;) { - found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, - (void **)&pgvec[0], idx_start, - NFS_SCAN_MAXENTRIES, tag); - if (found <= 0) - break; - for (i = 0; i < found; i++) { - req = pgvec[i]; - if (req->wb_index > idx_end) - goto out; - idx_start = req->wb_index + 1; - if (nfs_set_page_tag_locked(req)) { - kref_get(&req->wb_kref); - nfs_list_remove_request(req); - radix_tree_tag_clear(&nfsi->nfs_page_tree, - req->wb_index, tag); - nfs_list_add_request(req, dst); - res++; - if (res == INT_MAX) - goto out; - } - } - /* for latency reduction */ - cond_resched_lock(&nfsi->vfs_inode.i_lock); + nfs_pageio_complete(desc); } -out: - return res; } int __init nfs_init_nfspagecache(void) @@ -448,3 +1068,13 @@ void nfs_destroy_nfspagecache(void) kmem_cache_destroy(nfs_page_cachep); } +static const struct rpc_call_ops nfs_pgio_common_ops = { + .rpc_call_prepare = nfs_pgio_prepare, + .rpc_call_done = nfs_pgio_result, + .rpc_release = nfs_pgio_release, +}; + +const struct nfs_pageio_ops nfs_pgio_rw_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_pgios, +}; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c new file mode 100644 index 00000000000..6fdcd233d6f --- /dev/null +++ b/fs/nfs/pnfs.c @@ -0,0 +1,1948 @@ +/* + * pNFS functions to call and manage layout drivers. 
+ * + * Copyright (c) 2002 [year of first publication] + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/module.h> +#include "internal.h" +#include "pnfs.h" +#include "iostat.h" +#include "nfs4trace.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS +#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) + +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. 
+ */ +static DEFINE_SPINLOCK(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static LIST_HEAD(pnfs_modules_tbl); + +/* Return the registered pnfs layout driver module matching given id */ +static struct pnfs_layoutdriver_type * +find_pnfs_driver_locked(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) + if (local->id == id) + goto out; + local = NULL; +out: + dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); + return local; +} + +static struct pnfs_layoutdriver_type * +find_pnfs_driver(u32 id) +{ + struct pnfs_layoutdriver_type *local; + + spin_lock(&pnfs_spinlock); + local = find_pnfs_driver_locked(id); + if (local != NULL && !try_module_get(local->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + local = NULL; + } + spin_unlock(&pnfs_spinlock); + return local; +} + +void +unset_pnfs_layoutdriver(struct nfs_server *nfss) +{ + if (nfss->pnfs_curr_ld) { + if (nfss->pnfs_curr_ld->clear_layoutdriver) + nfss->pnfs_curr_ld->clear_layoutdriver(nfss); + /* Decrement the MDS count. Purge the deviceid cache if zero */ + if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count)) + nfs4_deviceid_purge_client(nfss->nfs_client); + module_put(nfss->pnfs_curr_ld->owner); + } + nfss->pnfs_curr_ld = NULL; +} + +/* + * Try to set the server's pnfs module to the pnfs layout type specified by id. + * Currently only one pNFS layout driver per filesystem is supported. + * + * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 
+ */ +void +set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, + u32 id) +{ + struct pnfs_layoutdriver_type *ld_type = NULL; + + if (id == 0) + goto out_no_driver; + if (!(server->nfs_client->cl_exchange_flags & + (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { + printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", + __func__, id, server->nfs_client->cl_exchange_flags); + goto out_no_driver; + } + ld_type = find_pnfs_driver(id); + if (!ld_type) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + ld_type = find_pnfs_driver(id); + if (!ld_type) { + dprintk("%s: No pNFS module found for %u.\n", + __func__, id); + goto out_no_driver; + } + } + server->pnfs_curr_ld = ld_type; + if (ld_type->set_layoutdriver + && ld_type->set_layoutdriver(server, mntfh)) { + printk(KERN_ERR "NFS: %s: Error initializing pNFS layout " + "driver %u.\n", __func__, id); + module_put(ld_type->owner); + goto out_no_driver; + } + /* Bump the MDS count */ + atomic_inc(&server->nfs_client->cl_mds_count); + + dprintk("%s: pNFS module for %u set\n", __func__, id); + return; + +out_no_driver: + dprintk("%s: Using NFSv4 I/O\n", __func__); + server->pnfs_curr_ld = NULL; +} + +int +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + int status = -EINVAL; + struct pnfs_layoutdriver_type *tmp; + + if (ld_type->id == 0) { + printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__); + return status; + } + if (!ld_type->alloc_lseg || !ld_type->free_lseg) { + printk(KERN_ERR "NFS: %s Layout driver must provide " + "alloc_lseg and free_lseg.\n", __func__); + return status; + } + + spin_lock(&pnfs_spinlock); + tmp = find_pnfs_driver_locked(ld_type->id); + if (!tmp) { + list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); + status = 0; + dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, + ld_type->name); + } else { + printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n", + __func__, ld_type->id); + } + 
spin_unlock(&pnfs_spinlock); + + return status; +} +EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); + +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&ld_type->pnfs_tblid); + spin_unlock(&pnfs_spinlock); +} +EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); + +/* + * pNFS client layout cache + */ + +/* Need to hold i_lock if caller does not already hold reference */ +void +pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) +{ + atomic_inc(&lo->plh_refcount); +} + +static struct pnfs_layout_hdr * +pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + return ld->alloc_layout_hdr(ino, gfp_flags); +} + +static void +pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct nfs_server *server = NFS_SERVER(lo->plh_inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + + if (!list_empty(&lo->plh_layouts)) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } + put_rpccred(lo->plh_lc_cred); + return ld->free_layout_hdr(lo); +} + +static void +pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct nfs_inode *nfsi = NFS_I(lo->plh_inode); + dprintk("%s: freeing layout cache %p\n", __func__, lo); + nfsi->layout = NULL; + /* Reset MDS Threshold I/O counters */ + nfsi->write_io = 0; + nfsi->read_io = 0; +} + +void +pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = lo->plh_inode; + + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + pnfs_detach_layout_hdr(lo); + spin_unlock(&inode->i_lock); + pnfs_free_layout_hdr(lo); + } +} + +static int +pnfs_iomode_to_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? 
+ NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +static void +pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) +{ + lo->plh_retry_timestamp = jiffies; + if (!test_and_set_bit(fail_bit, &lo->plh_flags)) + atomic_inc(&lo->plh_refcount); +} + +static void +pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) +{ + if (test_and_clear_bit(fail_bit, &lo->plh_flags)) + atomic_dec(&lo->plh_refcount); +} + +static void +pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ + struct inode *inode = lo->plh_inode; + struct pnfs_layout_range range = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(head); + + spin_lock(&inode->i_lock); + pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + pnfs_mark_matching_lsegs_invalid(lo, &head, &range); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, + iomode == IOMODE_RW ? "RW" : "READ"); +} + +static bool +pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ + unsigned long start, end; + int fail_bit = pnfs_iomode_to_fail_bit(iomode); + + if (test_bit(fail_bit, &lo->plh_flags) == 0) + return false; + end = jiffies; + start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; + if (!time_in_range(lo->plh_retry_timestamp, start, end)) { + /* It is time to retry the failed layoutgets */ + pnfs_layout_clear_fail_bit(lo, fail_bit); + return false; + } + return true; +} + +static void +init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +{ + INIT_LIST_HEAD(&lseg->pls_list); + INIT_LIST_HEAD(&lseg->pls_lc_list); + atomic_set(&lseg->pls_refcount, 1); + smp_mb(); + set_bit(NFS_LSEG_VALID, &lseg->pls_flags); + lseg->pls_layout = lo; +} + +static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct inode *ino = lseg->pls_layout->plh_inode; + + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); +} + +static void +pnfs_layout_remove_lseg(struct 
pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = lo->plh_inode; + + WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del_init(&lseg->pls_list); + /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ + atomic_dec(&lo->plh_refcount); + if (list_empty(&lo->plh_segs)) + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +void +pnfs_put_lseg(struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + lo = lseg->pls_layout; + inode = lo->plh_inode; + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + pnfs_get_layout_hdr(lo); + pnfs_layout_remove_lseg(lo, lseg); + spin_unlock(&inode->i_lock); + pnfs_free_lseg(lseg); + pnfs_put_layout_hdr(lo); + } +} +EXPORT_SYMBOL_GPL(pnfs_put_lseg); + +static u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* + * is l2 fully contained in l1? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static bool +pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (start1 <= start2) && (end1 >= end2); +} + +/* + * is l1 and l2 intersecting? 
+ * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (end1 == NFS4_MAX_UINT64 || end1 > start2) && + (end2 == NFS4_MAX_UINT64 || end2 > start1); +} + +static bool +should_free_lseg(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) +{ + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + pnfs_lseg_range_intersecting(lseg_range, recall_range); +} + +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + if (!atomic_dec_and_test(&lseg->pls_refcount)) + return false; + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); + list_add(&lseg->pls_list, tmp_list); + return true; +} + +/* Returns 1 if lseg is removed from list, 0 otherwise */ +static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + int rv = 0; + + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + /* Remove the reference keeping the lseg in the + * list. It will now be removed when all + * outstanding io is finished. + */ + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount)); + if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) + rv = 1; + } + return rv; +} + +/* Returns count of number of matching invalid lsegs remaining in list + * after call. 
+ */ +int +pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range) +{ + struct pnfs_layout_segment *lseg, *next; + int invalid = 0, removed = 0; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + if (list_empty(&lo->plh_segs)) + return 0; + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (!recall_range || + should_free_lseg(&lseg->pls_range, recall_range)) { + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, lseg->pls_range.offset, + lseg->pls_range.length); + invalid++; + removed += mark_lseg_invalid(lseg, tmp_list); + } + dprintk("%s:Return %i\n", __func__, invalid - removed); + return invalid - removed; +} + +/* note free_me must contain lsegs from a single layout_hdr */ +void +pnfs_free_lseg_list(struct list_head *free_me) +{ + struct pnfs_layout_segment *lseg, *tmp; + + if (list_empty(free_me)) + return; + + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { + list_del(&lseg->pls_list); + pnfs_free_lseg(lseg); + } +} + +void +pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo; + LIST_HEAD(tmp_list); + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_get_layout_hdr(lo); + pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); + pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); + pnfs_put_layout_hdr(lo); + } else + spin_unlock(&nfsi->vfs_inode.i_lock); +} +EXPORT_SYMBOL_GPL(pnfs_destroy_layout); + +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo; + bool ret = false; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo != NULL && 
list_empty(&lo->plh_bulk_destroy)) { + pnfs_get_layout_hdr(lo); + list_add(&lo->plh_bulk_destroy, layout_list); + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, + struct nfs_server *server, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo, *next; + struct inode *inode; + + list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + inode = igrab(lo->plh_inode); + if (inode == NULL) + continue; + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + return -EAGAIN; + } + return 0; +} + +static int +pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, + bool is_bulk_recall) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(lseg_list); + int ret = 0; + + while (!list_empty(layout_list)) { + lo = list_entry(layout_list->next, struct pnfs_layout_hdr, + plh_bulk_destroy); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + inode = lo->plh_inode; + spin_lock(&inode->i_lock); + list_del_init(&lo->plh_bulk_destroy); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + ret = -EAGAIN; + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&lseg_list); + pnfs_put_layout_hdr(lo); + iput(inode); + } + return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); 
+restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + + pnfs_destroy_layouts_byclid(clp, false); +} + +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +static void +pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + struct list_head *free_me_list) +{ + if (nfs4_stateid_match_other(&lo->plh_stateid, new)) + return; + /* Layout is new! 
Kill existing layout segments */ + pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL); +} + +/* update lo->plh_stateid with new if is more recent */ +void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + bool update_barrier) +{ + u32 oldseq, newseq, new_barrier; + int empty = list_empty(&lo->plh_segs); + + oldseq = be32_to_cpu(lo->plh_stateid.seqid); + newseq = be32_to_cpu(new->seqid); + if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { + nfs4_stateid_copy(&lo->plh_stateid, new); + if (update_barrier) { + new_barrier = be32_to_cpu(new->seqid); + } else { + /* Because of wraparound, we want to keep the barrier + * "close" to the current seqids. + */ + new_barrier = newseq - atomic_read(&lo->plh_outstanding); + } + if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + lo->plh_barrier = new_barrier; + } +} + +static bool +pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid) +{ + u32 seqid = be32_to_cpu(stateid->seqid); + + return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); +} + +/* lget is set to 1 if called from inside send_layoutget call chain */ +static bool +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) +{ + return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || + (list_empty(&lo->plh_segs) && + (atomic_read(&lo->plh_outstanding) > lget)); +} + +int +pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state) +{ + int status = 0; + + dprintk("--> %s\n", __func__); + spin_lock(&lo->plh_inode->i_lock); + if (pnfs_layoutgets_blocked(lo, 1)) { + status = -EAGAIN; + } else if (!nfs4_valid_open_stateid(open_state)) { + status = -EBADF; + } else if (list_empty(&lo->plh_segs)) { + int seq; + + do { + seq = read_seqbegin(&open_state->seqlock); + nfs4_stateid_copy(dst, &open_state->stateid); + } while (read_seqretry(&open_state->seqlock, seq)); + } else + nfs4_stateid_copy(dst, 
&lo->plh_stateid); + spin_unlock(&lo->plh_inode->i_lock); + dprintk("<-- %s\n", __func__); + return status; +} + +/* +* Get layout from server. +* for now, assume that whole file layouts are requested. +* arg->offset: 0 +* arg->length: all ones +*/ +static struct pnfs_layout_segment * +send_layoutget(struct pnfs_layout_hdr *lo, + struct nfs_open_context *ctx, + struct pnfs_layout_range *range, + gfp_t gfp_flags) +{ + struct inode *ino = lo->plh_inode; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + struct pnfs_layout_segment *lseg; + + dprintk("--> %s\n", __func__); + + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return NULL; + + lgp->args.minlength = PAGE_CACHE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range = *range; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; + + /* Synchronously retrieve layout information from server and + * store in lseg. 
+ */ + lseg = nfs4_proc_layoutget(lgp, gfp_flags); + if (IS_ERR(lseg)) { + switch (PTR_ERR(lseg)) { + case -ENOMEM: + case -ERESTARTSYS: + break; + default: + /* remember that LAYOUTGET failed and suspend trying */ + pnfs_layout_io_set_failed(lo, range->iomode); + } + return NULL; + } + + return lseg; +} + +static void pnfs_clear_layoutcommit(struct inode *inode, + struct list_head *head) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg, *tmp; + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return; + list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { + if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + continue; + pnfs_lseg_dec_and_remove_zero(lseg, head); + } +} + +/* + * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr + * when the layout segment list is empty. + * + * Note that a pnfs_layout_hdr can exist with an empty layout segment + * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the + * deviceid is marked invalid. 
+ */ +int +_pnfs_return_layout(struct inode *ino) +{ + struct pnfs_layout_hdr *lo = NULL; + struct nfs_inode *nfsi = NFS_I(ino); + LIST_HEAD(tmp_list); + struct nfs4_layoutreturn *lrp; + nfs4_stateid stateid; + int status = 0, empty; + + dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); + + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (!lo) { + spin_unlock(&ino->i_lock); + dprintk("NFS: %s no layout to return\n", __func__); + goto out; + } + stateid = nfsi->layout->plh_stateid; + /* Reference matched in nfs4_layoutreturn_release */ + pnfs_get_layout_hdr(lo); + empty = list_empty(&lo->plh_segs); + pnfs_clear_layoutcommit(ino, &tmp_list); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + /* Don't send a LAYOUTRETURN if list was initially empty */ + if (empty) { + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); + dprintk("NFS: %s no layout segments to return\n", __func__); + goto out; + } + lo->plh_block_lgets++; + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); + if (unlikely(lrp == NULL)) { + status = -ENOMEM; + spin_lock(&ino->i_lock); + lo->plh_block_lgets--; + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); + goto out; + } + + lrp->args.stateid = stateid; + lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; + lrp->args.inode = ino; + lrp->args.layout = lo; + lrp->clp = NFS_SERVER(ino)->nfs_client; + lrp->cred = lo->plh_lc_cred; + + status = nfs4_proc_layoutreturn(lrp); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} +EXPORT_SYMBOL_GPL(_pnfs_return_layout); + +int +pnfs_commit_and_return_layout(struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + int ret; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo == NULL) { + spin_unlock(&inode->i_lock); + return 0; + } + pnfs_get_layout_hdr(lo); + /* Block new layoutgets and read/write to ds */ + lo->plh_block_lgets++; + spin_unlock(&inode->i_lock); + 
filemap_fdatawait(inode->i_mapping); + ret = pnfs_layoutcommit_inode(inode, true); + if (ret == 0) + ret = _pnfs_return_layout(inode); + spin_lock(&inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&inode->i_lock); + pnfs_put_layout_hdr(lo); + return ret; +} + +bool pnfs_roc(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg, *tmp; + LIST_HEAD(tmp_list); + bool found = false; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + goto out_nolayout; + list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + mark_lseg_invalid(lseg, &tmp_list); + found = true; + } + if (!found) + goto out_nolayout; + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&tmp_list); + return true; + +out_nolayout: + spin_unlock(&ino->i_lock); + return false; +} + +void pnfs_roc_release(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + lo->plh_block_lgets--; + if (atomic_dec_and_test(&lo->plh_refcount)) { + pnfs_detach_layout_hdr(lo); + spin_unlock(&ino->i_lock); + pnfs_free_layout_hdr(lo); + } else + spin_unlock(&ino->i_lock); +} + +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) + lo->plh_barrier = barrier; + spin_unlock(&ino->i_lock); +} + +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg; + u32 current_seqid; + bool found = false; + + spin_lock(&ino->i_lock); + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) + if 
(test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + found = true; + goto out; + } + lo = nfsi->layout; + current_seqid = be32_to_cpu(lo->plh_stateid.seqid); + + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); +out: + spin_unlock(&ino->i_lock); + return found; +} + +/* + * Compare two layout segments for sorting into layout cache. + * We want to preferentially return RW over RO layouts, so ensure those + * are seen first. + */ +static s64 +pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + s64 d; + + /* high offset > low offset */ + d = l1->offset - l2->offset; + if (d) + return d; + + /* short length > long length */ + d = l2->length - l1->length; + if (d) + return d; + + /* read > read/write */ + return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); +} + +static void +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *lp; + + dprintk("%s:Begin\n", __func__); + + list_for_each_entry(lp, &lo->plh_segs, pls_list) { + if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) + continue; + list_add_tail(&lseg->pls_list, &lp->pls_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length, + lp, lp->pls_range.iomode, lp->pls_range.offset, + lp->pls_range.length); + goto out; + } + list_add_tail(&lseg->pls_list, &lo->plh_segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, lseg->pls_range.length); +out: + pnfs_get_layout_hdr(lo); + + dprintk("%s:Return\n", __func__); +} + +static 
struct pnfs_layout_hdr * +alloc_init_layout_hdr(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) +{ + struct pnfs_layout_hdr *lo; + + lo = pnfs_alloc_layout_hdr(ino, gfp_flags); + if (!lo) + return NULL; + atomic_set(&lo->plh_refcount, 1); + INIT_LIST_HEAD(&lo->plh_layouts); + INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_bulk_destroy); + lo->plh_inode = ino; + lo->plh_lc_cred = get_rpccred(ctx->cred); + return lo; +} + +static struct pnfs_layout_hdr * +pnfs_find_alloc_layout(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *new = NULL; + + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + if (nfsi->layout != NULL) + goto out_existing; + spin_unlock(&ino->i_lock); + new = alloc_init_layout_hdr(ino, ctx, gfp_flags); + spin_lock(&ino->i_lock); + + if (likely(nfsi->layout == NULL)) { /* Won the race? */ + nfsi->layout = new; + return new; + } else if (new != NULL) + pnfs_free_layout_hdr(new); +out_existing: + pnfs_get_layout_hdr(nfsi->layout); + return nfsi->layout; +} + +/* + * iomode matching rules: + * iomode lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW true + */ +static bool +pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, + const struct pnfs_layout_range *range) +{ + struct pnfs_layout_range range1; + + if ((range->iomode == IOMODE_RW && + ls_range->iomode != IOMODE_RW) || + !pnfs_lseg_range_intersecting(ls_range, range)) + return 0; + + /* range1 covers only the first byte in the range */ + range1 = *range; + range1.length = 1; + return pnfs_lseg_range_contained(ls_range, &range1); +} + +/* + * lookup range in layout + */ +static struct pnfs_layout_segment * +pnfs_find_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", 
__func__); + + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && + pnfs_lseg_range_match(&lseg->pls_range, range)) { + ret = pnfs_get_lseg(lseg); + break; + } + if (lseg->pls_range.offset > range->offset) + break; + } + + dprintk("%s:Return lseg %p ref %d\n", + __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); + return ret; +} + +/* + * Use mdsthreshold hints set at each OPEN to determine if I/O should go + * to the MDS or over pNFS + * + * The nfs_inode read_io and write_io fields are cumulative counters reset + * when there are no layout segments. Note that in pnfs_update_layout iomode + * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a + * WRITE request. + * + * A return of true means use MDS I/O. + * + * From rfc 5661: + * If a file's size is smaller than the file size threshold, data accesses + * SHOULD be sent to the metadata server. If an I/O request has a length that + * is below the I/O size threshold, the I/O SHOULD be sent to the metadata + * server. If both file size and I/O size are provided, the client SHOULD + * reach or exceed both thresholds before sending its read or write + * requests to the data server. 
+ */ +static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, + struct inode *ino, int iomode) +{ + struct nfs4_threshold *t = ctx->mdsthreshold; + struct nfs_inode *nfsi = NFS_I(ino); + loff_t fsize = i_size_read(ino); + bool size = false, size_set = false, io = false, io_set = false, ret = false; + + if (t == NULL) + return ret; + + dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", + __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz); + + switch (iomode) { + case IOMODE_READ: + if (t->bm & THRESHOLD_RD) { + dprintk("%s fsize %llu\n", __func__, fsize); + size_set = true; + if (fsize < t->rd_sz) + size = true; + } + if (t->bm & THRESHOLD_RD_IO) { + dprintk("%s nfsi->read_io %llu\n", __func__, + nfsi->read_io); + io_set = true; + if (nfsi->read_io < t->rd_io_sz) + io = true; + } + break; + case IOMODE_RW: + if (t->bm & THRESHOLD_WR) { + dprintk("%s fsize %llu\n", __func__, fsize); + size_set = true; + if (fsize < t->wr_sz) + size = true; + } + if (t->bm & THRESHOLD_WR_IO) { + dprintk("%s nfsi->write_io %llu\n", __func__, + nfsi->write_io); + io_set = true; + if (nfsi->write_io < t->wr_io_sz) + io = true; + } + break; + } + if (size_set && io_set) { + if (size && io) + ret = true; + } else if (size || io) + ret = true; + + dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret); + return ret; +} + +/* + * Layout segment is retreived from the server if not cached. + * The appropriate layout segment is referenced and returned to the caller. 
+ */ +struct pnfs_layout_segment * +pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags) +{ + struct pnfs_layout_range arg = { + .iomode = iomode, + .offset = pos, + .length = count, + }; + unsigned pg_offset; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs_client *clp = server->nfs_client; + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; + bool first; + + if (!pnfs_enabled_sb(NFS_SERVER(ino))) + goto out; + + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) + goto out; + + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); + if (lo == NULL) { + spin_unlock(&ino->i_lock); + goto out; + } + + /* Do we even need to bother with this? */ + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s matches recall, use MDS\n", __func__); + goto out_unlock; + } + + /* if LAYOUTGET already failed once we don't try again */ + if (pnfs_layout_io_test_failed(lo, iomode)) + goto out_unlock; + + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_find_lseg(lo, &arg); + if (lseg) + goto out_unlock; + + if (pnfs_layoutgets_blocked(lo, 0)) + goto out_unlock; + atomic_inc(&lo->plh_outstanding); + + first = list_empty(&lo->plh_layouts) ? true : false; + spin_unlock(&ino->i_lock); + + if (first) { + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. 
+ */ + spin_lock(&clp->cl_lock); + list_add_tail(&lo->plh_layouts, &server->layouts); + spin_unlock(&clp->cl_lock); + } + + pg_offset = arg.offset & ~PAGE_CACHE_MASK; + if (pg_offset) { + arg.offset -= pg_offset; + arg.length += pg_offset; + } + if (arg.length != NFS4_MAX_UINT64) + arg.length = PAGE_CACHE_ALIGN(arg.length); + + lseg = send_layoutget(lo, ctx, &arg, gfp_flags); + atomic_dec(&lo->plh_outstanding); +out_put_layout_hdr: + pnfs_put_layout_hdr(lo); +out: + dprintk("%s: inode %s/%llu pNFS layout segment %s for " + "(%s, offset: %llu, length: %llu)\n", + __func__, ino->i_sb->s_id, + (unsigned long long)NFS_FILEID(ino), + lseg == NULL ? "not found" : "found", + iomode==IOMODE_RW ? "read/write" : "read-only", + (unsigned long long)pos, + (unsigned long long)count); + return lseg; +out_unlock: + spin_unlock(&ino->i_lock); + goto out_put_layout_hdr; +} +EXPORT_SYMBOL_GPL(pnfs_update_layout); + +struct pnfs_layout_segment * +pnfs_layout_process(struct nfs4_layoutget *lgp) +{ + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; + struct inode *ino = lo->plh_inode; + LIST_HEAD(free_me); + int status = 0; + + /* Inject layout blob into I/O device driver */ + lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); + if (!lseg || IS_ERR(lseg)) { + if (!lseg) + status = -ENOMEM; + else + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); + goto out; + } + + spin_lock(&ino->i_lock); + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + dprintk("%s forget reply due to recall\n", __func__); + goto out_forget_reply; + } + + if (pnfs_layoutgets_blocked(lo, 1) || + pnfs_layout_stateid_blocked(lo, &res->stateid)) { + dprintk("%s forget reply due to state\n", __func__); + goto out_forget_reply; + } + + /* Check that the new stateid matches the old stateid */ + pnfs_verify_layout_stateid(lo, &res->stateid, 
&free_me); + /* Done processing layoutget. Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid, false); + + init_lseg(lo, lseg); + lseg->pls_range = res->range; + pnfs_get_lseg(lseg); + pnfs_layout_insert_lseg(lo, lseg); + + if (res->return_on_close) { + set_bit(NFS_LSEG_ROC, &lseg->pls_flags); + set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); + } + + spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me); + return lseg; +out: + return ERR_PTR(status); + +out_forget_reply: + spin_unlock(&ino->i_lock); + lseg->pls_layout = lo; + NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + goto out; +} + +void +pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + u64 rd_size = req->wb_bytes; + + WARN_ON_ONCE(pgio->pg_lseg != NULL); + + if (pgio->pg_dreq == NULL) + rd_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + rd_size, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_read_mds(pgio); + +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); + +void +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, u64 wb_size) +{ + WARN_ON_ONCE(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + wb_size, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_write_mds(pgio); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); + +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. 
+ */ +size_t +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 seg_end, req_start, seg_left; + + size = nfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + + /* + * 'size' contains the number of bytes left in the current page (up + * to the original size asked for in @req->wb_bytes). + * + * Calculate how many bytes are left in the layout segment + * and if there are less bytes than 'size', return that instead. + * + * Please also note that 'end_offset' is actually the offset of the + * first byte that lies outside the pnfs_layout_range. FIXME? + * + */ + if (pgio->pg_lseg) { + seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); + req_start = req_offset(req); + WARN_ON_ONCE(req_start > seg_end); + /* start of request is past the last byte of this segment */ + if (req_start >= seg_end) + return 0; + + /* adjust 'size' iff there are fewer bytes left in the + * segment than what nfs_generic_pg_test returned */ + seg_left = seg_end - req_start; + if (seg_left < size) + size = (unsigned int)seg_left; + } + + return size; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); + +int pnfs_write_done_resend_to_mds(struct inode *inode, + struct list_head *head, + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) +{ + struct nfs_pageio_descriptor pgio; + LIST_HEAD(failed); + + /* Resend all requests through the MDS */ + nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); + pgio.pg_dreq = dreq; + while (!list_empty(head)) { + struct nfs_page *req = nfs_list_entry(head->next); + + nfs_list_remove_request(req); + if (!nfs_pageio_add_request(&pgio, req)) + nfs_list_add_request(req, &failed); + } + nfs_pageio_complete(&pgio); + + if (!list_empty(&failed)) { + /* For some reason our attempt to resend pages. 
Mark the + * overall send request as having failed, and let + * nfs_writeback_release_full deal with the error. + */ + list_move(&failed, head); + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); + +static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + + dprintk("pnfs write error = %d\n", hdr->pnfs_error); + if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + pnfs_return_layout(hdr->inode); + } + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) + data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops, + hdr->dreq); +} + +/* + * Called by non rpc-based layout drivers + */ +void pnfs_ld_write_done(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + + trace_nfs4_pnfs_write(data, hdr->pnfs_error); + if (!hdr->pnfs_error) { + pnfs_set_layoutcommit(data); + hdr->mds_ops->rpc_call_done(&data->task, data); + } else + pnfs_ld_handle_write_error(data); + hdr->mds_ops->rpc_release(data); +} +EXPORT_SYMBOL_GPL(pnfs_ld_write_done); + +static void +pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + list_splice_tail_init(&hdr->pages, &desc->pg_list); + nfs_pageio_reset_write_mds(desc); + desc->pg_recoalesce = 1; + } + nfs_pgio_data_release(data); +} + +static enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_pgio_data *wdata, + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg, + int how) +{ + struct nfs_pgio_header *hdr = wdata->header; + struct inode *inode = hdr->inode; + enum pnfs_try_status trypnfs; + struct nfs_server *nfss = NFS_SERVER(inode); + + hdr->mds_ops = call_ops; + + dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + inode->i_ino, wdata->args.count, wdata->args.offset, how); + 
trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); + if (trypnfs != PNFS_NOT_ATTEMPTED) + nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +static void +pnfs_do_write(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, int how) +{ + struct nfs_pgio_data *data = hdr->data; + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; + + desc->pg_lseg = NULL; + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, data); + pnfs_put_lseg(lseg); +} + +static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) +{ + pnfs_put_lseg(hdr->lseg); + nfs_rw_header_free(hdr); +} +EXPORT_SYMBOL_GPL(pnfs_writehdr_free); + +int +pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ + struct nfs_rw_header *whdr; + struct nfs_pgio_header *hdr; + int ret; + + whdr = nfs_rw_header_alloc(desc->pg_rw_ops); + if (!whdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + pnfs_put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return -ENOMEM; + } + hdr = &whdr->header; + nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_pgio(desc, hdr); + if (ret != 0) { + pnfs_put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + } else + pnfs_do_write(desc, hdr, desc->pg_ioflags); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + return ret; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); + +int pnfs_read_done_resend_to_mds(struct inode *inode, + struct list_head *head, + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) +{ + struct nfs_pageio_descriptor pgio; + LIST_HEAD(failed); + + /* Resend all requests through the MDS */ + nfs_pageio_init_read(&pgio, inode, true, compl_ops); + 
pgio.pg_dreq = dreq; + while (!list_empty(head)) { + struct nfs_page *req = nfs_list_entry(head->next); + + nfs_list_remove_request(req); + if (!nfs_pageio_add_request(&pgio, req)) + nfs_list_add_request(req, &failed); + } + nfs_pageio_complete(&pgio); + + if (!list_empty(&failed)) { + list_move(&failed, head); + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); + +static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + + dprintk("pnfs read error = %d\n", hdr->pnfs_error); + if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + pnfs_return_layout(hdr->inode); + } + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) + data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops, + hdr->dreq); +} + +/* + * Called by non rpc-based layout drivers + */ +void pnfs_ld_read_done(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + + trace_nfs4_pnfs_read(data, hdr->pnfs_error); + if (likely(!hdr->pnfs_error)) { + __nfs4_read_done_cb(data); + hdr->mds_ops->rpc_call_done(&data->task, data); + } else + pnfs_ld_handle_read_error(data); + hdr->mds_ops->rpc_release(data); +} +EXPORT_SYMBOL_GPL(pnfs_ld_read_done); + +static void +pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + list_splice_tail_init(&hdr->pages, &desc->pg_list); + nfs_pageio_reset_read_mds(desc); + desc->pg_recoalesce = 1; + } + nfs_pgio_data_release(data); +} + +/* + * Call the appropriate parallel I/O subsystem read function. 
+ */ +static enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_pgio_data *rdata, + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg) +{ + struct nfs_pgio_header *hdr = rdata->header; + struct inode *inode = hdr->inode; + struct nfs_server *nfss = NFS_SERVER(inode); + enum pnfs_try_status trypnfs; + + hdr->mds_ops = call_ops; + + dprintk("%s: Reading ino:%lu %u@%llu\n", + __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + + trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); + if (trypnfs != PNFS_NOT_ATTEMPTED) + nfs_inc_stats(inode, NFSIOS_PNFS_READ); + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +static void +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) +{ + struct nfs_pgio_data *data = hdr->data; + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; + + desc->pg_lseg = NULL; + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_read_through_mds(desc, data); + pnfs_put_lseg(lseg); +} + +static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) +{ + pnfs_put_lseg(hdr->lseg); + nfs_rw_header_free(hdr); +} +EXPORT_SYMBOL_GPL(pnfs_readhdr_free); + +int +pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ + struct nfs_rw_header *rhdr; + struct nfs_pgio_header *hdr; + int ret; + + rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); + if (!rhdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + ret = -ENOMEM; + pnfs_put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; + } + hdr = &rhdr->header; + nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_pgio(desc, hdr); + if (ret != 0) { + pnfs_put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + } else + pnfs_do_read(desc, hdr); + if (atomic_dec_and_test(&hdr->refcnt)) 
+ hdr->completion_ops->completion(hdr); + return ret; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); + +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ + unsigned long *bitlock = &NFS_I(inode)->flags; + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + +/* + * There can be multiple RW segments. + */ +static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { + if (lseg->pls_range.iomode == IOMODE_RW && + test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + list_add(&lseg->pls_lc_list, listp); + } +} + +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg, *tmp; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + pnfs_put_lseg(lseg); + } + + pnfs_clear_layoutcommitting(inode); +} + +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); +} +EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); + +void +pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) +{ + struct nfs_pgio_header *hdr = wdata->header; + struct inode *inode = hdr->inode; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t end_pos = wdata->mds_offset + wdata->res.count; + bool mark_as_dirty = false; + + spin_lock(&inode->i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + mark_as_dirty = true; + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, inode->i_ino); + } + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + pnfs_get_lseg(hdr->lseg); + } + if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + 
spin_unlock(&inode->i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, hdr->lseg, nfsi->layout->plh_lwb); + + /* if pnfs_layoutcommit_inode() runs between inode locks, the next one + * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ + if (mark_as_dirty) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); + +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) +{ + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + if (nfss->pnfs_curr_ld->cleanup_layoutcommit) + nfss->pnfs_curr_ld->cleanup_layoutcommit(data); + pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); +} + +/* + * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and + * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough + * data to disk to allow the server to recover the data if it crashes. + * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag + * is off, and a COMMIT is sent to a data server, or + * if WRITEs to a data server return NFS_DATA_SYNC. 
+ */ +int +pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + struct nfs4_layoutcommit_data *data; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t end_pos; + int status; + + if (!pnfs_layoutcommit_outstanding(inode)) + return 0; + + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + + status = -EAGAIN; + if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { + if (!sync) + goto out; + status = wait_on_bit_lock(&nfsi->flags, + NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (status) + goto out; + } + + status = -ENOMEM; + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + goto clear_layoutcommitting; + + status = 0; + spin_lock(&inode->i_lock); + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_unlock; + + INIT_LIST_HEAD(&data->lseg_list); + pnfs_list_write_lseg(inode, &data->lseg_list); + + end_pos = nfsi->layout->plh_lwb; + nfsi->layout->plh_lwb = 0; + + nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); + spin_unlock(&inode->i_lock); + + data->args.inode = inode; + data->cred = get_rpccred(nfsi->layout->plh_lc_cred); + nfs_fattr_init(&data->fattr); + data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + data->res.fattr = &data->fattr; + data->args.lastbytewritten = end_pos - 1; + data->res.server = NFS_SERVER(inode); + + status = nfs4_proc_layoutcommit(data, sync); +out: + if (status) + mark_inode_dirty_sync(inode); + dprintk("<-- %s status %d\n", __func__, status); + return status; +out_unlock: + spin_unlock(&inode->i_lock); + kfree(data); +clear_layoutcommitting: + pnfs_clear_layoutcommitting(inode); + goto out; +} + +struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) +{ + struct nfs4_threshold *thp; + + thp = kzalloc(sizeof(*thp), GFP_NOFS); + if (!thp) { + dprintk("%s mdsthreshold allocation failed\n", __func__); + return NULL; + } + return thp; +} diff --git a/fs/nfs/pnfs.h 
b/fs/nfs/pnfs.h new file mode 100644 index 00000000000..4fb309a2b4c --- /dev/null +++ b/fs/nfs/pnfs.h @@ -0,0 +1,525 @@ +/* + * pNFS client data structures. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. 
+ */ + +#ifndef FS_NFS_PNFS_H +#define FS_NFS_PNFS_H + +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> + +enum { + NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ + NFS_LSEG_ROC, /* roc bit received from server */ + NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ +}; + +struct pnfs_layout_segment { + struct list_head pls_list; + struct list_head pls_lc_list; + struct pnfs_layout_range pls_range; + atomic_t pls_refcount; + unsigned long pls_flags; + struct pnfs_layout_hdr *pls_layout; +}; + +enum pnfs_try_status { + PNFS_ATTEMPTED = 0, + PNFS_NOT_ATTEMPTED = 1, +}; + +#ifdef CONFIG_NFS_V4_1 + +#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" + +enum { + NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ + NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ + NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ + NFS_LAYOUT_ROC, /* some lseg had roc bit set */ + NFS_LAYOUT_RETURN, /* Return this layout ASAP */ +}; + +enum layoutdriver_policy_flags { + /* Should the pNFS client commit and return the layout upon a setattr */ + PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, + PNFS_LAYOUTRET_ON_ERROR = 1 << 1, +}; + +struct nfs4_deviceid_node; + +/* Per-layout driver specific registration structure */ +struct pnfs_layoutdriver_type { + struct list_head pnfs_tblid; + const u32 id; + const char *name; + struct module *owner; + unsigned flags; + + int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); + int (*clear_layoutdriver) (struct nfs_server *); + + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); + void (*free_layout_hdr) (struct pnfs_layout_hdr *); + + struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + void (*free_lseg) (struct pnfs_layout_segment *lseg); + + /* test for nfs page cache coalescing */ + const struct nfs_pageio_ops *pg_read_ops; + const struct nfs_pageio_ops 
*pg_write_ops; + + struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); + void (*mark_request_commit) (struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); + void (*clear_request_commit) (struct nfs_page *req, + struct nfs_commit_info *cinfo); + int (*scan_commit_lists) (struct nfs_commit_info *cinfo, + int max); + void (*recover_commit_reqs) (struct list_head *list, + struct nfs_commit_info *cinfo); + int (*commit_pagelist)(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo); + + /* + * Return PNFS_ATTEMPTED to indicate the layout code has attempted + * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS + */ + enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); + + void (*free_deviceid_node) (struct nfs4_deviceid_node *); + + void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args); + + void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); + + void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args); +}; + +struct pnfs_layout_hdr { + atomic_t plh_refcount; + struct list_head plh_layouts; /* other client layouts */ + struct list_head plh_bulk_destroy; + struct list_head plh_segs; /* layout segments list */ + nfs4_stateid plh_stateid; + atomic_t plh_outstanding; /* number of RPCs out */ + unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ + u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_retry_timestamp; + unsigned long plh_flags; + loff_t plh_lwb; /* last write byte for layoutcommit */ + struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ + struct inode *plh_inode; +}; + +struct pnfs_device { + struct nfs4_deviceid dev_id; + unsigned int layout_type; + unsigned int 
mincount; + unsigned int maxcount; /* gdia_maxcount */ + struct page **pages; + unsigned int pgbase; + unsigned int pglen; /* reply buffer length */ +}; + +#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 + +struct pnfs_devicelist { + unsigned int eof; + unsigned int num_devs; + struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; +}; + +extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + +/* nfs4proc.c */ +extern int nfs4_proc_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist); +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev, + struct rpc_cred *cred); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); + +/* pnfs.c */ +void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_put_lseg(struct pnfs_layout_segment *lseg); + +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); +void unset_pnfs_layoutdriver(struct nfs_server *); +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, u64 wb_size); +int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req); +void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); +struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_free_lseg_list(struct list_head *tmp_list); +void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_all_layouts(struct nfs_client *); +int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool 
is_recall); +int pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall); +void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + bool update_barrier); +int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, + struct pnfs_layout_hdr *lo, + struct nfs4_state *open_state); +int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range); +bool pnfs_roc(struct inode *ino); +void pnfs_roc_release(struct inode *ino); +void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); +void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); +int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); +void pnfs_ld_write_done(struct nfs_pgio_data *); +void pnfs_ld_read_done(struct nfs_pgio_data *); +struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags); + +void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); +int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); +int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); +struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); + +/* nfs4_deviceid_flags */ +enum { + NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ + NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ +}; + +/* pnfs_dev.c */ +struct nfs4_deviceid_node { + struct hlist_node node; + struct hlist_node 
tmpnode; + const struct pnfs_layoutdriver_type *ld; + const struct nfs_client *nfs_client; + unsigned long flags; + unsigned long timestamp_unavailable; + struct nfs4_deviceid deviceid; + atomic_t ref; +}; + +struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, + const struct pnfs_layoutdriver_type *, + const struct nfs_client *, + const struct nfs4_deviceid *); +struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); +bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); +bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); +void nfs4_deviceid_purge_client(const struct nfs_client *); + +static inline struct pnfs_layout_segment * +pnfs_get_lseg(struct pnfs_layout_segment *lseg) +{ + if (lseg) { + atomic_inc(&lseg->pls_refcount); + smp_mb__after_atomic(); + } + return lseg; +} + +/* Return true if a layout driver is being used for this mountpoint */ +static inline int pnfs_enabled_sb(struct nfs_server *nfss) +{ + return nfss->pnfs_curr_ld != NULL; +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, + struct nfs_commit_info *cinfo) +{ + if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0) + return PNFS_NOT_ATTEMPTED; + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo); +} + +static inline struct pnfs_ds_commit_info * +pnfs_get_ds_info(struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld == NULL || ld->get_ds_info == NULL) + return NULL; + return ld->get_ds_info(inode); +} + +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, 
struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ + struct inode *inode = req->wb_context->dentry->d_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (lseg == NULL || ld->mark_request_commit == NULL) + return false; + ld->mark_request_commit(req, lseg, cinfo); + return true; +} + +static inline bool +pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) +{ + struct inode *inode = req->wb_context->dentry->d_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld == NULL || ld->clear_request_commit == NULL) + return false; + ld->clear_request_commit(req, cinfo); + return true; +} + +static inline int +pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, + int max) +{ + if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) + return 0; + else + return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); +} + +static inline void +pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, + struct nfs_commit_info *cinfo) +{ + if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) + return; + NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); +} + +/* Should the pNFS client commit and return the layout upon a setattr */ +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_SETATTR; +} + +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 || + test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0; +} + +static inline int pnfs_return_layout(struct inode *ino) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_server *nfss = NFS_SERVER(ino); + + if (pnfs_enabled_sb(nfss) && nfsi->layout) + return _pnfs_return_layout(ino); + + return 0; +} 
+ +static inline bool +pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, + struct nfs_server *nfss) +{ + return (dst && src && src->bm != 0 && nfss->pnfs_curr_ld && + nfss->pnfs_curr_ld->id == src->l_type); +} + +#ifdef NFS_DEBUG +void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); +#else +static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) +{ +} +#endif /* NFS_DEBUG */ +#else /* CONFIG_NFS_V4_1 */ + +static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +} + +static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ +} + +static inline struct pnfs_layout_segment * +pnfs_get_lseg(struct pnfs_layout_segment *lseg) +{ + return NULL; +} + +static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline int pnfs_return_layout(struct inode *ino) +{ + return 0; +} + +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ + return 0; +} + +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + return false; +} + +static inline bool +pnfs_roc(struct inode *ino) +{ + return false; +} + +static inline void +pnfs_roc_release(struct inode *ino) +{ +} + +static inline void +pnfs_roc_set_barrier(struct inode *ino, u32 barrier) +{ +} + +static inline bool +pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +{ + return false; +} + +static inline void set_pnfs_layoutdriver(struct nfs_server *s, + const struct nfs_fh *mntfh, u32 id) +{ +} + +static inline void unset_pnfs_layoutdriver(struct nfs_server *s) +{ +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, + struct nfs_commit_info *cinfo) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline struct pnfs_ds_commit_info * +pnfs_get_ds_info(struct inode *inode) +{ + return NULL; +} + +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ + 
return false; +} + +static inline bool +pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) +{ + return false; +} + +static inline int +pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, + int max) +{ + return 0; +} + +static inline void +pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, + struct nfs_commit_info *cinfo) +{ +} + +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + return 0; +} + +static inline bool +pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, + struct nfs_server *nfss) +{ + return false; +} + +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + +static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) +{ + return NULL; +} + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c new file mode 100644 index 00000000000..6da209bd940 --- /dev/null +++ b/fs/nfs/pnfs_dev.c @@ -0,0 +1,302 @@ +/* + * Device operations for the pnfs client. + * + * Copyright (c) 2002 + * The Regents of the University of Michigan + * All Rights Reserved + * + * Dean Hildebrand <dhildebz@umich.edu> + * Garth Goodson <Garth.Goodson@netapp.com> + * + * Permission is granted to use, copy, create derivative works, and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * University of Michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. 
+ * + * This software is provided as is, without representation or warranty + * of any kind either express or implied, including without limitation + * the implied warranties of merchantability, fitness for a particular + * purpose, or noninfringement. The Regents of the University of + * Michigan shall not be liable for any damages, including special, + * indirect, incidental, or consequential damages, with respect to any + * claim arising out of or in connection with the use of the software, + * even if it has been or is hereafter advised of the possibility of + * such damages. + */ + +#include <linux/export.h> +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +/* + * Device ID RCU cache. A device ID is unique per server and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) + +static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfs4_deviceid_lock); + +#ifdef NFS_DEBUG +void +nfs4_print_deviceid(const struct nfs4_deviceid *id) +{ + u32 *p = (u32 *)id; + + dprintk("%s: device id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); +} +EXPORT_SYMBOL_GPL(nfs4_print_deviceid); +#endif + +static inline u32 +nfs4_deviceid_hash(const struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +static struct nfs4_deviceid_node * +_lookup_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) + if (d->ld == ld && d->nfs_client == clp && + !memcmp(&d->deviceid, id, sizeof(*id))) { + if 
(atomic_read(&d->ref)) + return d; + else + continue; + } + return NULL; +} + +/* + * Lookup a deviceid in cache and get a reference count on it if found + * + * @clp nfs_client associated with deviceid + * @id deviceid to look up + */ +static struct nfs4_deviceid_node * +_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id, + long hash) +{ + struct nfs4_deviceid_node *d; + + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, hash); + if (d != NULL) + atomic_inc(&d->ref); + rcu_read_unlock(); + return d; +} + +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); +} +EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); + +/* + * Remove a deviceid from cache + * + * @clp nfs_client associated with deviceid + * @id the deviceid to unhash + * + * Returns nothing; if found, the node is unhashed and its initial reference is dropped. 
+ */ +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *clp, const struct nfs4_deviceid *id) +{ + struct nfs4_deviceid_node *d; + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); + rcu_read_unlock(); + if (!d) { + spin_unlock(&nfs4_deviceid_lock); + return; + } + hlist_del_init_rcu(&d->node); + spin_unlock(&nfs4_deviceid_lock); + synchronize_rcu(); + + /* balance the initial ref set in nfs4_init_deviceid_node */ + if (atomic_dec_and_test(&d->ref)) + d->ld->free_deviceid_node(d); +} +EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); + +void +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, + const struct pnfs_layoutdriver_type *ld, + const struct nfs_client *nfs_client, + const struct nfs4_deviceid *id) +{ + INIT_HLIST_NODE(&d->node); + INIT_HLIST_NODE(&d->tmpnode); + d->ld = ld; + d->nfs_client = nfs_client; + d->flags = 0; + d->deviceid = *id; + atomic_set(&d->ref, 1); /* the initial reference; nfs4_delete_deviceid and the purge path drop one reference after unhashing */ +} +EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); + +/* + * Uniquely initialize and insert a deviceid node into cache + * + * @new new deviceid node + * Note that the caller must set up the following members: + * new->ld + * new->nfs_client + * new->deviceid + * + * @ret the inserted node, if none found, otherwise, the found entry. + */ +struct nfs4_deviceid_node * +nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) +{ + struct nfs4_deviceid_node *d; + long hash; + + spin_lock(&nfs4_deviceid_lock); + hash = nfs4_deviceid_hash(&new->deviceid); + d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash); + if (d) { + spin_unlock(&nfs4_deviceid_lock); + return d; /* an equal entry is already cached; it is returned with an extra reference and new is not inserted */ + } + + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + spin_unlock(&nfs4_deviceid_lock); + atomic_inc(&new->ref); /* NOTE(review): taken after the lock is dropped; presumably this is the caller's reference - confirm the node cannot be purged in between */ + + return new; +} +EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); + +/* + * Dereference a deviceid node and delete it when its reference count drops + * to zero. 
+ * + * @d deviceid node to put + * + * return true iff the node was deleted + * Note that the test for d->ref == 0 is sufficient to establish + * that the node is no longer hashed in the global device id cache. + */ +bool +nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) +{ + if (!atomic_dec_and_test(&d->ref)) + return false; + d->ld->free_deviceid_node(d); + return true; +} +EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); + +void +nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ + node->timestamp_unavailable = jiffies; + set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); + +bool +nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + unsigned long start, end; + + end = jiffies; + start = end - PNFS_DEVICE_RETRY_TIMEOUT; + if (time_in_range(node->timestamp_unavailable, start, end)) + return true; + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + } + return false; +} +EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable); + +static void +_deviceid_purge_client(const struct nfs_client *clp, long hash) +{ + struct nfs4_deviceid_node *d; + HLIST_HEAD(tmp); + + spin_lock(&nfs4_deviceid_lock); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node) + if (d->nfs_client == clp && atomic_read(&d->ref)) { + hlist_del_init_rcu(&d->node); + hlist_add_head(&d->tmpnode, &tmp); + } + rcu_read_unlock(); + spin_unlock(&nfs4_deviceid_lock); + + if (hlist_empty(&tmp)) + return; + + synchronize_rcu(); + while (!hlist_empty(&tmp)) { + d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); + hlist_del(&d->tmpnode); + if (atomic_dec_and_test(&d->ref)) + d->ld->free_deviceid_node(d); + } +} + +void +nfs4_deviceid_purge_client(const struct nfs_client *clp) +{ + long h; + + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + return; + for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) + 
_deviceid_purge_client(clp, h); +} + +/* + * Stop use of all deviceids associated with an nfs_client + */ +void +nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) +{ + struct nfs4_deviceid_node *d; + int i; + + rcu_read_lock(); + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ + hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node) + if (d->nfs_client == clp) + set_bit(NFS_DEVICEID_INVALID, &d->flags); + } + rcu_read_unlock(); +} + diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 5ccf7faee19..c171ce1a8a3 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -29,10 +29,8 @@ #include <linux/types.h> #include <linux/param.h> -#include <linux/slab.h> #include <linux/time.h> #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/in.h> @@ -43,6 +41,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/lockd/bind.h> +#include <linux/freezer.h> #include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -63,17 +62,23 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, }; int status; - dprintk("%s: call getattr\n", __FUNCTION__); + dprintk("%s: call getattr\n", __func__); nfs_fattr_init(fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); - dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + dprintk("%s: reply getattr: %d\n", __func__, status); if (status) return status; - dprintk("%s: call statfs\n", __FUNCTION__); + dprintk("%s: call statfs\n", __func__); msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; msg.rpc_resp = &fsinfo; - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); - dprintk("%s: reply statfs: %d\n", __FUNCTION__, status); + status = 
rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + dprintk("%s: reply statfs: %d\n", __func__, status); if (status) return status; info->rtmax = NFS_MAXDATA; @@ -93,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], @@ -129,6 +134,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, sattr->ia_mode &= S_IALLUGO; dprintk("NFS call setattr\n"); + if (sattr->ia_valid & ATTR_FILE) + msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) @@ -139,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, static int nfs_proc_lookup(struct inode *dir, struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), @@ -185,35 +193,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page, return status; } +struct nfs_createdata { + struct nfs_createargs arg; + struct nfs_diropok res; + struct nfs_fh fhandle; + struct nfs_fattr fattr; +}; + +static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir, + struct dentry *dentry, struct iattr *sattr) +{ + struct nfs_createdata *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data != NULL) { + data->arg.fh = NFS_FH(dir); + data->arg.name = dentry->d_name.name; + data->arg.len = dentry->d_name.len; + data->arg.sattr = sattr; + nfs_fattr_init(&data->fattr); + data->fhandle.size = 0; + data->res.fh = 
&data->fhandle; + data->res.fattr = &data->fattr; + } + return data; +}; + +static void nfs_free_createdata(const struct nfs_createdata *data) +{ + kfree(data); +} + static int nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - int flags, struct nameidata *nd) + int flags) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs_diropok res = { - .fh = &fhandle, - .fattr = &fattr - }; + struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, }; - int status; + int status = -ENOMEM; - nfs_fattr_init(&fattr); - dprintk("NFS call create %s\n", dentry->d_name.name); + dprintk("NFS call create %pd\n", dentry); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + nfs_free_createdata(data); +out: dprintk("NFS reply create: %d\n", status); return status; } @@ -225,26 +258,14 @@ static int nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs_diropok res = { - .fh = &fhandle, - .fattr = &fattr - }; + struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, }; - int status, mode; + umode_t mode; + int status = -ENOMEM; - dprintk("NFS call mknod %s\n", dentry->d_name.name); + dprintk("NFS call mknod 
%pd\n", dentry); mode = sattr->ia_mode; if (S_ISFIFO(mode)) { @@ -255,17 +276,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ } - nfs_fattr_init(&fattr); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == -EINVAL && S_ISFIFO(mode)) { sattr->ia_mode = mode; - nfs_fattr_init(&fattr); + nfs_fattr_init(data->res.fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); } if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + nfs_free_createdata(data); +out: dprintk("NFS reply mknod: %d\n", status); return status; } @@ -275,8 +303,7 @@ nfs_proc_remove(struct inode *dir, struct qstr *name) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name.len = name->len, - .name.name = name->name, + .name = *name, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], @@ -298,36 +325,35 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; } +static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ + rpc_call_start(task); +} + static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) { nfs_mark_for_revalidate(dir); return 1; } -static int -nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_renameargs arg = { - .fromfh = NFS_FH(old_dir), - .fromname = old_name->name, - .fromlen = old_name->len, - .tofh = NFS_FH(new_dir), - .toname = new_name->name, - .tolen = new_name->len - }; - struct rpc_message msg = { - .rpc_proc = &nfs_procedures[NFSPROC_RENAME], - .rpc_argp = &arg, - }; - int status; +static 
void +nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +{ + msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; +} + +static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ + rpc_call_start(task); +} - dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); +static int +nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, + struct inode *new_dir) +{ nfs_mark_for_revalidate(old_dir); nfs_mark_for_revalidate(new_dir); - dprintk("NFS reply rename: %d\n", status); - return status; + return 1; } static int @@ -357,8 +383,8 @@ static int nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; + struct nfs_fh *fh; + struct nfs_fattr *fattr; struct nfs_symlinkargs arg = { .fromfh = NFS_FH(dir), .fromname = dentry->d_name.name, @@ -371,12 +397,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], .rpc_argp = &arg, }; - int status; + int status = -ENAMETOOLONG; + + dprintk("NFS call symlink %pd\n", dentry); if (len > NFS2_MAXPATHLEN) - return -ENAMETOOLONG; + goto out; - dprintk("NFS call symlink %s\n", dentry->d_name.name); + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) + goto out_free; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -386,12 +418,13 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, * filehandle size to zero indicates to nfs_instantiate that it * should fill in the data with a LOOKUP call on the wire. 
*/ - if (status == 0) { - nfs_fattr_init(&fattr); - fhandle.size = 0; - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr, NULL); +out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out: dprintk("NFS reply symlink: %d\n", status); return status; } @@ -399,31 +432,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, static int nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs_diropok res = { - .fh = &fhandle, - .fattr = &fattr - }; + struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], - .rpc_argp = &arg, - .rpc_resp = &res, }; - int status; + int status = -ENOMEM; + + dprintk("NFS call mkdir %pd\n", dentry); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; - dprintk("NFS call mkdir %s\n", dentry->d_name.name); - nfs_fattr_init(&fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + nfs_free_createdata(data); +out: dprintk("NFS reply mkdir: %d\n", status); return status; } @@ -458,14 +485,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name) */ static int nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page *page, unsigned int count, int plus) + u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; struct nfs_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, .count = count, - .pages = &page, + .pages = pages, }; struct 
rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], @@ -551,41 +578,56 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return 0; } -static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { - nfs_invalidate_atime(data->inode); + struct inode *inode = data->header->inode; + + nfs_invalidate_atime(inode); if (task->tk_status >= 0) { - nfs_refresh_inode(data->inode, data->res.fattr); + nfs_refresh_inode(inode, data->res.fattr); /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ - if (data->args.offset + data->args.count >= data->res.fattr->size) + if (data->args.offset + data->res.count >= data->res.fattr->size) data->res.eof = 1; } return 0; } -static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } -static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) +{ + rpc_call_start(task); + return 0; +} + +static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { + struct inode *inode = data->header->inode; + if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); + nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); return 0; } -static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ data->args.stable = NFS_FILE_SYNC; msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } +static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct 
nfs_commit_data *data) +{ + BUG(); +} + static void -nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) { BUG(); } @@ -593,18 +635,76 @@ nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) static int nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); } +/* Helper functions for NFS lock bounds checking */ +#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL) +static int nfs_lock_check_bounds(const struct file_lock *fl) +{ + __s32 start, end; + + start = (__s32)fl->fl_start; + if ((loff_t)start != fl->fl_start) + goto out_einval; + + if (fl->fl_end != OFFSET_MAX) { + end = (__s32)fl->fl_end; + if ((loff_t)end != fl->fl_end) + goto out_einval; + } else + end = NFS_LOCK32_OFFSET_MAX; + + if (start < 0 || start > end) + goto out_einval; + return 0; +out_einval: + return -EINVAL; +} + +static int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + return 0; +} + +static int nfs_return_delegation(struct inode *inode) +{ + nfs_wb_all(inode); + return 0; +} + +static const struct inode_operations nfs_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; + +static const struct inode_operations nfs_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, +}; const struct nfs_rpc_ops nfs_v2_clientops = { .version = 2, /* protocol version */ .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs_dir_inode_operations, .file_inode_ops = 
&nfs_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs_proc_get_root, + .submount = nfs_submount, + .try_mount = nfs_try_mount, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, .lookup = nfs_proc_lookup, @@ -613,8 +713,11 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .create = nfs_proc_create, .remove = nfs_proc_remove, .unlink_setup = nfs_proc_unlink_setup, + .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, .unlink_done = nfs_proc_unlink_done, - .rename = nfs_proc_rename, + .rename_setup = nfs_proc_rename_setup, + .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, + .rename_done = nfs_proc_rename_done, .link = nfs_proc_link, .symlink = nfs_proc_symlink, .mkdir = nfs_proc_mkdir, @@ -624,13 +727,22 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .statfs = nfs_proc_statfs, .fsinfo = nfs_proc_fsinfo, .pathconf = nfs_proc_pathconf, - .decode_dirent = nfs_decode_dirent, + .decode_dirent = nfs2_decode_dirent, + .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare, .read_setup = nfs_proc_read_setup, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, - .file_open = nfs_open, - .file_release = nfs_release, + .commit_rpc_prepare = nfs_proc_commit_rpc_prepare, .lock = nfs_proc_lock, + .lock_check_bounds = nfs_lock_check_bounds, + .close_context = nfs_close_context, + .have_delegation = nfs_have_delegation, + .return_delegation = nfs_return_delegation, + .alloc_client = nfs_alloc_client, + .init_client = nfs_init_client, + .free_client = nfs_free_client, + .create_server = nfs_create_server, + .clone_server = nfs_clone_server, }; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 5a70be589bb..e818a475ca6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,62 +18,29 @@ #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> -#include <linux/smp_lock.h> - -#include <asm/system.h> +#include <linux/module.h> +#include "nfs4_fs.h" #include 
"internal.h" #include "iostat.h" +#include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); -static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); -static const struct rpc_call_ops nfs_read_partial_ops; -static const struct rpc_call_ops nfs_read_full_ops; +static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; - -#define MIN_POOL_READ (32) - -struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) -{ - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); - - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); - p->npages = pagecount; - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); - if (!p->pagevec) { - mempool_free(p, nfs_rdata_mempool); - p = NULL; - } - } - } - return p; -} -static void nfs_readdata_rcu_free(struct rcu_head *head) +static struct nfs_rw_header *nfs_readhdr_alloc(void) { - struct nfs_read_data *p = container_of(head, struct nfs_read_data, task.u.tk_rcu); - if (p && (p->pagevec != &p->page_array[0])) - kfree(p->pagevec); - mempool_free(p, nfs_rdata_mempool); + return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); } -static void nfs_readdata_free(struct nfs_read_data *rdata) +static void nfs_readhdr_free(struct nfs_rw_header *rhdr) { - call_rcu_bh(&rdata->task.u.tk_rcu, nfs_readdata_rcu_free); -} - -void nfs_readdata_release(void *data) -{ - nfs_readdata_free(data); + kmem_cache_free(nfs_rdata_cachep, rhdr); } static @@ -85,46 +52,40 @@ int nfs_return_empty_page(struct page *page) return 0; } -static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) +void nfs_pageio_init_read(struct 
nfs_pageio_descriptor *pgio, + struct inode *inode, bool force_mds, + const struct nfs_pgio_completion_ops *compl_ops) { - unsigned int remainder = data->args.count - data->res.count; - unsigned int base = data->args.pgbase + data->res.count; - unsigned int pglen; - struct page **pages; + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_read_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, + server->rsize, 0); +} +EXPORT_SYMBOL_GPL(nfs_pageio_init_read); - if (data->res.eof == 0 || remainder == 0) - return; - /* - * Note: "remainder" can never be negative, since we check for - * this in the XDR code. - */ - pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; - base &= ~PAGE_CACHE_MASK; - pglen = PAGE_CACHE_SIZE - base; - for (;;) { - if (remainder <= pglen) { - zero_user(*pages, base, remainder); - break; - } - zero_user(*pages, base, pglen); - pages++; - remainder -= pglen; - pglen = PAGE_CACHE_SIZE; - base = 0; - } +void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_ops = &nfs_pgio_rw_ops; + pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; } +EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); -static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) +int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, + struct page *page) { - LIST_HEAD(one_request); struct nfs_page *new; unsigned int len; + struct nfs_pageio_descriptor pgio; len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, inode, page, 0, len); + new = nfs_create_request(ctx, page, NULL, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -132,326 +93,171 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < 
PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_list_add_request(new, &one_request); - if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) - nfs_pagein_multi(inode, &one_request, 1, len, 0); - else - nfs_pagein_one(inode, &one_request, 1, len, 0); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + nfs_pageio_add_request(&pgio, new); + nfs_pageio_complete(&pgio); + NFS_I(inode)->read_io += pgio.pg_bytes_written; return 0; } static void nfs_readpage_release(struct nfs_page *req) { - unlock_page(req->wb_page); + struct inode *d_inode = req->wb_context->dentry->d_inode; + + dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes, + (long long)req_offset(req)); + + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); - dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + unlock_page(req->wb_page); + } + + dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", + req->wb_context->dentry->d_inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); - nfs_clear_request(req); nfs_release_request(req); } -/* - * Set up the NFS read request struct - */ -static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset) +static void nfs_page_group_set_uptodate(struct nfs_page *req) { - struct inode *inode = req->wb_context->path.dentry->d_inode; - int swap_flags = IS_SWAPFILE(inode) ? 
NFS_RPC_SWAPFLAGS : 0; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, - .rpc_client = NFS_CLIENT(inode), - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .flags = RPC_TASK_ASYNC | swap_flags, - }; - - data->req = req; - data->inode = inode; - data->cred = msg.rpc_cred; - - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req) + offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pagevec; - data->args.count = count; - data->args.context = req->wb_context; - - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.eof = 0; - nfs_fattr_init(&data->fattr); - - /* Set up the initial task struct. */ - NFS_PROTO(inode)->read_setup(data, &msg); - - dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) + SetPageUptodate(req->wb_page); } -static void -nfs_async_read_error(struct list_head *head) +static void nfs_read_completion(struct nfs_pgio_header *hdr) { - struct nfs_page *req; + unsigned long bytes = 0; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out; + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct page *page = req->wb_page; + unsigned long start = req->wb_pgbase; + unsigned long end = req->wb_pgbase + req->wb_bytes; + + if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { + /* note: regions of the page not covered by a + * request are zeroed in nfs_readpage_async / + * readpage_async_filler */ + if (bytes > 
hdr->good_bytes) { + /* nothing in this request was good, so zero + * the full extent of the request */ + zero_user_segment(page, start, end); + + } else if (hdr->good_bytes - bytes < req->wb_bytes) { + /* part of this request has good bytes, but + * not all. zero the bad bytes */ + start += hdr->good_bytes - bytes; + WARN_ON(start < req->wb_pgbase); + zero_user_segment(page, start, end); + } + } + bytes += req->wb_bytes; + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { + if (bytes <= hdr->good_bytes) + nfs_page_group_set_uptodate(req); + } else + nfs_page_group_set_uptodate(req); nfs_list_remove_request(req); - SetPageError(req->wb_page); nfs_readpage_release(req); } +out: + hdr->release(hdr); } -/* - * Generate multiple requests to fill a single page. - * - * We optimize to reduce the number of read operations on the wire. If we - * detect that we're reading a page, or an area of a page, that is past the - * end of file, we do not generate NFS read operations but just clear the - * parts of the page that would have come back zero from the server anyway. - * - * We rely on the cached value of i_size to make this determination; another - * client can fill pages on the server past our cached end-of-file, but we - * won't see the new data until our attribute cache is updated. This is more - * or less conventional NFS client behavior. 
- */ -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { - struct nfs_page *req = nfs_list_entry(head->next); - struct page *page = req->wb_page; - struct nfs_read_data *data; - size_t rsize = NFS_SERVER(inode)->rsize, nbytes; - unsigned int offset; - int requests = 0; - LIST_HEAD(list); - - nfs_list_remove_request(req); - - nbytes = count; - do { - size_t len = min(nbytes,rsize); - - data = nfs_readdata_alloc(1); - if (!data) - goto out_bad; - INIT_LIST_HEAD(&data->pages); - list_add(&data->pages, &list); - requests++; - nbytes -= len; - } while(nbytes != 0); - atomic_set(&req->wb_complete, requests); - - ClearPageError(page); - offset = 0; - nbytes = count; - do { - data = list_entry(list.next, struct nfs_read_data, pages); - list_del_init(&data->pages); - - data->pagevec[0] = page; - - if (nbytes < rsize) - rsize = nbytes; - nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, - rsize, offset); - offset += rsize; - nbytes -= rsize; - } while (nbytes != 0); - - return 0; + struct inode *inode = data->header->inode; + int swap_flags = IS_SWAPFILE(inode) ? 
NFS_RPC_SWAPFLAGS : 0; -out_bad: - while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_read_data, pages); - list_del(&data->pages); - nfs_readdata_free(data); - } - SetPageError(page); - nfs_readpage_release(req); - return -ENOMEM; + task_setup_data->flags |= swap_flags; + NFS_PROTO(inode)->read_setup(data, msg); } -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static void +nfs_async_read_error(struct list_head *head) { - struct nfs_page *req; - struct page **pages; - struct nfs_read_data *data; - - data = nfs_readdata_alloc(npages); - if (!data) - goto out_bad; + struct nfs_page *req; - INIT_LIST_HEAD(&data->pages); - pages = data->pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); - *pages++ = req->wb_page; + nfs_readpage_release(req); } - req = nfs_list_entry(data->pages.next); - - nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); - return 0; -out_bad: - nfs_async_read_error(head); - return -ENOMEM; } +static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { + .error_cleanup = nfs_async_read_error, + .completion = nfs_read_completion, +}; + /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). 
*/ -int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - int status; - - dprintk("NFS: %s: %5u, (status %d)\n", __FUNCTION__, task->tk_pid, - task->tk_status); - - status = NFS_PROTO(data->inode)->read_done(task, data); + int status = NFS_PROTO(inode)->read_done(task, data); if (status != 0) return status; - nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count); + nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count); if (task->tk_status == -ESTALE) { - set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags); - nfs_mark_for_revalidate(data->inode); + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_mark_for_revalidate(inode); } return 0; } -static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data) { - struct nfs_readargs *argp = &data->args; - struct nfs_readres *resp = &data->res; - - if (resp->eof || resp->count == argp->count) - return 0; + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; /* This is a short read! */ - nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); + nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? */ - if (resp->count == 0) - return 0; - + if (resp->count == 0) { + nfs_set_pgio_error(data->header, -EIO, argp->offset); + return; + } /* Yes, so retry the read at the end of the data */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - rpc_restart_call(task); - return -EAGAIN; -} - -/* - * Handle a read reply that fills part of a page. 
- */ -static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) -{ - struct nfs_read_data *data = calldata; - struct nfs_page *req = data->req; - struct page *page = req->wb_page; - - if (nfs_readpage_result(task, data) != 0) - return; - - if (likely(task->tk_status >= 0)) { - nfs_readpage_truncate_uninitialised_page(data); - if (nfs_readpage_retry(task, data) != 0) - return; - } - if (unlikely(task->tk_status < 0)) - SetPageError(page); - if (atomic_dec_and_test(&req->wb_complete)) { - if (!PageError(page)) - SetPageUptodate(page); - nfs_readpage_release(req); - } + rpc_restart_call_prepare(task); } -static const struct rpc_call_ops nfs_read_partial_ops = { - .rpc_call_done = nfs_readpage_result_partial, - .rpc_release = nfs_readdata_release, -}; - -static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) +static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) { - unsigned int count = data->res.count; - unsigned int base = data->args.pgbase; - struct page **pages; + struct nfs_pgio_header *hdr = data->header; - if (data->res.eof) - count = data->args.count; - if (unlikely(count == 0)) - return; - pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; - base &= ~PAGE_CACHE_MASK; - count += base; - for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) - SetPageUptodate(*pages); - if (count == 0) - return; - /* Was this a short read? */ - if (data->res.eof || data->res.count == data->args.count) - SetPageUptodate(*pages); -} + if (data->res.eof) { + loff_t bound; -/* - * This is the callback from RPC telling us whether a reply was - * received or some error occurred (timeout or socket shutdown). - */ -static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) -{ - struct nfs_read_data *data = calldata; - - if (nfs_readpage_result(task, data) != 0) - return; - /* - * Note: nfs_readpage_retry may change the values of - * data->args. 
In the multi-page case, we therefore need - * to ensure that we call nfs_readpage_set_pages_uptodate() - * first. - */ - if (likely(task->tk_status >= 0)) { - nfs_readpage_truncate_uninitialised_page(data); - nfs_readpage_set_pages_uptodate(data); - if (nfs_readpage_retry(task, data) != 0) - return; - } - while (!list_empty(&data->pages)) { - struct nfs_page *req = nfs_list_entry(data->pages.next); - - nfs_list_remove_request(req); - nfs_readpage_release(req); - } + bound = data->args.offset + data->res.count; + spin_lock(&hdr->lock); + if (bound < hdr->io_start + hdr->good_bytes) { + set_bit(NFS_IOHDR_EOF, &hdr->flags); + clear_bit(NFS_IOHDR_ERROR, &hdr->flags); + hdr->good_bytes = bound - hdr->io_start; + } + spin_unlock(&hdr->lock); + } else if (data->res.count != data->args.count) + nfs_readpage_retry(task, data); } -static const struct rpc_call_ops nfs_read_full_ops = { - .rpc_call_done = nfs_readpage_result_full, - .rpc_release = nfs_readdata_release, -}; - /* * Read a page over NFS. 
* We read the page synchronously in the following case: @@ -461,11 +267,11 @@ static const struct rpc_call_ops nfs_read_full_ops = { int nfs_readpage(struct file *file, struct page *page) { struct nfs_open_context *ctx; - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_CACHE_SIZE, page->index); + page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); nfs_add_stats(inode, NFSIOS_READPAGES, 1); @@ -494,8 +300,15 @@ int nfs_readpage(struct file *file, struct page *page) } else ctx = get_nfs_open_context(nfs_file_open_context(file)); + if (!IS_SYNC(inode)) { + error = nfs_readpage_from_fscache(ctx, inode, page); + if (error == 0) + goto out; + } + error = nfs_readpage_async(ctx, inode, page); +out: put_nfs_open_context(ctx); return error; out_unlock: @@ -512,22 +325,15 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page->mapping->host; struct nfs_page *new; unsigned int len; int error; - error = nfs_wb_page(inode, page); - if (error) - goto out_unlock; - if (PageUptodate(page)) - goto out_unlock; - len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, inode, page, 0, len); + new = nfs_create_request(desc->ctx, page, NULL, 0, len); if (IS_ERR(new)) goto out_error; @@ -540,7 +346,6 @@ readpage_async_filler(void *data, struct page *page) return 0; out_error: error = PTR_ERR(new); - SetPageError(page); out_unlock: unlock_page(page); return error; @@ -554,14 +359,12 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, .pgio = &pgio, }; struct inode *inode = mapping->host; - struct nfs_server *server = NFS_SERVER(inode); - size_t rsize = server->rsize; unsigned long npages; int ret = -ESTALE; - dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", + 
dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); @@ -574,16 +377,25 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, return -EBADF; } else desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); - if (rsize < PAGE_CACHE_SIZE) - nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); - else - nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0); + + /* attempt to read as many of the pages as possible from the cache + * - this returns -ENOBUFS immediately if the cookie is negative + */ + ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, + pages, &nr_pages); + if (ret == 0) + goto read_complete; /* all pages were read */ + + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); nfs_pageio_complete(&pgio); + NFS_I(inode)->read_io += pgio.pg_bytes_written; npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; nfs_add_stats(inode, NFSIOS_READPAGES, npages); +read_complete: put_nfs_open_context(desc.ctx); out: return ret; @@ -592,22 +404,25 @@ out: int __init nfs_init_readpagecache(void) { nfs_rdata_cachep = kmem_cache_create("nfs_read_data", - sizeof(struct nfs_read_data), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_rdata_cachep == NULL) return -ENOMEM; - nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, - nfs_rdata_cachep); - if (nfs_rdata_mempool == NULL) - return -ENOMEM; - return 0; } void nfs_destroy_readpagecache(void) { - mempool_destroy(nfs_rdata_mempool); kmem_cache_destroy(nfs_rdata_cachep); } + +static const struct nfs_rw_ops nfs_rw_read_ops = { + .rw_mode = FMODE_READ, + .rw_alloc_header = nfs_readhdr_alloc, + .rw_free_header = nfs_readhdr_free, + .rw_done = nfs_readpage_done, + .rw_result = nfs_readpage_result, + .rw_initiate = 
nfs_initiate_read, +}; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f9219024f31..084af1060d7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -5,7 +5,7 @@ * * nfs superblock handling functions * - * Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some + * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some * experimental NFS changes. Modularisation taken straight from SYS5 fs. * * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. @@ -31,6 +31,7 @@ #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/xprtsock.h> @@ -39,19 +40,22 @@ #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> -#include <linux/smp_lock.h> #include <linux/seq_file.h> #include <linux/mount.h> +#include <linux/namei.h> #include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> +#include <linux/slab.h> #include <net/ipv6.h> +#include <linux/netdevice.h> #include <linux/nfs_xdr.h> #include <linux/magic.h> #include <linux/parser.h> +#include <linux/nsproxy.h> +#include <linux/rcupdate.h> -#include <asm/system.h> #include <asm/uaccess.h> #include "nfs4_fs.h" @@ -59,22 +63,34 @@ #include "delegation.h" #include "iostat.h" #include "internal.h" +#include "fscache.h" +#include "nfs4session.h" +#include "pnfs.h" +#include "nfs.h" #define NFSDBG_FACILITY NFSDBG_VFS +#define NFS_TEXT_DATA 1 + +#if IS_ENABLED(CONFIG_NFS_V3) +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, - Opt_intr, Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, Opt_noac, Opt_lock, Opt_nolock, - Opt_v2, Opt_v3, Opt_udp, Opt_tcp, Opt_rdma, Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, Opt_sharecache, Opt_nosharecache, + Opt_resvport, 
Opt_noresvport, + Opt_fscache, Opt_nofscache, + Opt_migration, Opt_nomigration, /* Mount options that take integer arguments */ Opt_port, @@ -86,25 +102,33 @@ enum { Opt_namelen, Opt_mountport, Opt_mountvers, - Opt_nfsvers, + Opt_minorversion, /* Mount options that take string arguments */ + Opt_nfsvers, Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, + Opt_fscache_uniq, + Opt_local_lock, - /* Mount options that are ignored */ - Opt_userspace, Opt_deprecated, + /* Special mount options */ + Opt_userspace, Opt_deprecated, Opt_sloppy, Opt_err }; -static match_table_t nfs_mount_option_tokens = { +static const match_table_t nfs_mount_option_tokens = { { Opt_userspace, "bg" }, { Opt_userspace, "fg" }, + { Opt_userspace, "retry=%s" }, + + { Opt_sloppy, "sloppy" }, + { Opt_soft, "soft" }, { Opt_hard, "hard" }, - { Opt_intr, "intr" }, - { Opt_nointr, "nointr" }, + { Opt_deprecated, "intr" }, + { Opt_deprecated, "nointr" }, { Opt_posix, "posix" }, { Opt_noposix, "noposix" }, { Opt_cto, "cto" }, @@ -113,8 +137,6 @@ static match_table_t nfs_mount_option_tokens = { { Opt_noac, "noac" }, { Opt_lock, "lock" }, { Opt_nolock, "nolock" }, - { Opt_v2, "v2" }, - { Opt_v3, "v3" }, { Opt_udp, "udp" }, { Opt_tcp, "tcp" }, { Opt_rdma, "rdma" }, @@ -124,24 +146,31 @@ static match_table_t nfs_mount_option_tokens = { { Opt_nordirplus, "nordirplus" }, { Opt_sharecache, "sharecache" }, { Opt_nosharecache, "nosharecache" }, - - { Opt_port, "port=%u" }, - { Opt_rsize, "rsize=%u" }, - { Opt_wsize, "wsize=%u" }, - { Opt_bsize, "bsize=%u" }, - { Opt_timeo, "timeo=%u" }, - { Opt_retrans, "retrans=%u" }, - { Opt_acregmin, "acregmin=%u" }, - { Opt_acregmax, "acregmax=%u" }, - { Opt_acdirmin, "acdirmin=%u" }, - { Opt_acdirmax, "acdirmax=%u" }, - { Opt_actimeo, "actimeo=%u" }, - { Opt_userspace, "retry=%u" }, - { Opt_namelen, "namlen=%u" }, - { Opt_mountport, "mountport=%u" }, - { Opt_mountvers, "mountvers=%u" }, - { Opt_nfsvers, 
"nfsvers=%u" }, - { Opt_nfsvers, "vers=%u" }, + { Opt_resvport, "resvport" }, + { Opt_noresvport, "noresvport" }, + { Opt_fscache, "fsc" }, + { Opt_nofscache, "nofsc" }, + { Opt_migration, "migration" }, + { Opt_nomigration, "nomigration" }, + + { Opt_port, "port=%s" }, + { Opt_rsize, "rsize=%s" }, + { Opt_wsize, "wsize=%s" }, + { Opt_bsize, "bsize=%s" }, + { Opt_timeo, "timeo=%s" }, + { Opt_retrans, "retrans=%s" }, + { Opt_acregmin, "acregmin=%s" }, + { Opt_acregmax, "acregmax=%s" }, + { Opt_acdirmin, "acdirmin=%s" }, + { Opt_acdirmax, "acdirmax=%s" }, + { Opt_actimeo, "actimeo=%s" }, + { Opt_namelen, "namlen=%s" }, + { Opt_mountport, "mountport=%s" }, + { Opt_mountvers, "mountvers=%s" }, + { Opt_minorversion, "minorversion=%s" }, + + { Opt_nfsvers, "nfsvers=%s" }, + { Opt_nfsvers, "vers=%s" }, { Opt_sec, "sec=%s" }, { Opt_proto, "proto=%s" }, @@ -151,18 +180,27 @@ static match_table_t nfs_mount_option_tokens = { { Opt_mounthost, "mounthost=%s" }, { Opt_mountaddr, "mountaddr=%s" }, + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_fscache_uniq, "fsc=%s" }, + { Opt_local_lock, "local_lock=%s" }, + + /* The following needs to be listed after all other options */ + { Opt_nfsvers, "v%s" }, + { Opt_err, NULL } }; enum { - Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma, + Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, Opt_xprt_err }; -static match_table_t nfs_xprt_protocol_tokens = { +static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_udp, "udp" }, + { Opt_xprt_udp6, "udp6" }, { Opt_xprt_tcp, "tcp" }, + { Opt_xprt_tcp6, "tcp6" }, { Opt_xprt_rdma, "rdma" }, { Opt_xprt_err, NULL } @@ -177,7 +215,7 @@ enum { Opt_sec_err }; -static match_table_t nfs_secflavor_tokens = { +static const match_table_t nfs_secflavor_tokens = { { Opt_sec_none, "none" }, { Opt_sec_none, "null" }, { Opt_sec_sys, "sys" }, @@ -197,92 +235,133 @@ static match_table_t nfs_secflavor_tokens = { { Opt_sec_err, NULL } }; +enum { + Opt_lookupcache_all, 
Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + +enum { + Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix, + Opt_local_lock_none, + + Opt_local_lock_err +}; + +static match_table_t nfs_local_lock_tokens = { + { Opt_local_lock_all, "all" }, + { Opt_local_lock_flock, "flock" }, + { Opt_local_lock_posix, "posix" }, + { Opt_local_lock_none, "none" }, + + { Opt_local_lock_err, NULL } +}; + +enum { + Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, + Opt_vers_4_1, Opt_vers_4_2, + + Opt_vers_err +}; + +static match_table_t nfs_vers_tokens = { + { Opt_vers_2, "2" }, + { Opt_vers_3, "3" }, + { Opt_vers_4, "4" }, + { Opt_vers_4_0, "4.0" }, + { Opt_vers_4_1, "4.1" }, + { Opt_vers_4_2, "4.2" }, + + { Opt_vers_err, NULL } +}; -static void nfs_umount_begin(struct vfsmount *, int); -static int nfs_statfs(struct dentry *, struct kstatfs *); -static int nfs_show_options(struct seq_file *, struct vfsmount *); -static int nfs_show_stats(struct seq_file *, struct vfsmount *); -static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); -static int nfs_xdev_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static void nfs_kill_super(struct super_block *); -static void nfs_put_super(struct super_block *); +static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); -static struct file_system_type nfs_fs_type = { +struct file_system_type nfs_fs_type = { .owner = THIS_MODULE, .name = "nfs", - .get_sb = nfs_get_sb, + .mount = nfs_fs_mount, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + 
.fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("nfs"); +EXPORT_SYMBOL_GPL(nfs_fs_type); struct file_system_type nfs_xdev_fs_type = { .owner = THIS_MODULE, .name = "nfs", - .get_sb = nfs_xdev_get_sb, + .mount = nfs_xdev_mount, .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; -static const struct super_operations nfs_sops = { +const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .drop_inode = nfs_drop_inode, .put_super = nfs_put_super, .statfs = nfs_statfs, - .clear_inode = nfs_clear_inode, + .evict_inode = nfs_evict_inode, .umount_begin = nfs_umount_begin, .show_options = nfs_show_options, + .show_devname = nfs_show_devname, + .show_path = nfs_show_path, .show_stats = nfs_show_stats, + .remount_fs = nfs_remount, }; +EXPORT_SYMBOL_GPL(nfs_sops); -#ifdef CONFIG_NFS_V4 -static int nfs4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs4_xdev_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs4_referral_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static void nfs4_kill_super(struct super_block *sb); +#if IS_ENABLED(CONFIG_NFS_V4) +static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *); +static int nfs4_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, const char *dev_name); -static struct file_system_type nfs4_fs_type = { +struct file_system_type nfs4_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_get_sb, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .mount = nfs_fs_mount, + .kill_sb = nfs_kill_super, + 
.fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("nfs4"); +MODULE_ALIAS("nfs4"); +EXPORT_SYMBOL_GPL(nfs4_fs_type); -struct file_system_type nfs4_xdev_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .get_sb = nfs4_xdev_get_sb, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; +static int __init register_nfs4_fs(void) +{ + return register_filesystem(&nfs4_fs_type); +} -struct file_system_type nfs4_referral_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .get_sb = nfs4_referral_get_sb, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; +static void unregister_nfs4_fs(void) +{ + unregister_filesystem(&nfs4_fs_type); +} +#else +static int __init register_nfs4_fs(void) +{ + return 0; +} -static const struct super_operations nfs4_sops = { - .alloc_inode = nfs_alloc_inode, - .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, - .statfs = nfs_statfs, - .clear_inode = nfs4_clear_inode, - .umount_begin = nfs_umount_begin, - .show_options = nfs_show_options, - .show_stats = nfs_show_stats, -}; +static void unregister_nfs4_fs(void) +{ +} #endif static struct shrinker acl_shrinker = { - .shrink = nfs_access_cache_shrinker, + .count_objects = nfs_access_cache_count, + .scan_objects = nfs_access_cache_scan, .seeks = DEFAULT_SEEKS, }; @@ -297,21 +376,18 @@ int __init register_nfs_fs(void) if (ret < 0) goto error_0; - ret = nfs_register_sysctl(); + ret = register_nfs4_fs(); if (ret < 0) goto error_1; -#ifdef CONFIG_NFS_V4 - ret = register_filesystem(&nfs4_fs_type); + + ret = nfs_register_sysctl(); if (ret < 0) goto error_2; -#endif register_shrinker(&acl_shrinker); return 0; -#ifdef CONFIG_NFS_V4 error_2: - nfs_unregister_sysctl(); -#endif + unregister_nfs4_fs(); error_1: unregister_filesystem(&nfs_fs_type); error_0: @@ -324,55 +400,59 @@ error_0: void __exit unregister_nfs_fs(void) { 
unregister_shrinker(&acl_shrinker); -#ifdef CONFIG_NFS_V4 - unregister_filesystem(&nfs4_fs_type); -#endif nfs_unregister_sysctl(); + unregister_nfs4_fs(); unregister_filesystem(&nfs_fs_type); } -void nfs_sb_active(struct nfs_server *server) +void nfs_sb_active(struct super_block *sb) { - atomic_inc(&server->active); -} + struct nfs_server *server = NFS_SB(sb); -void nfs_sb_deactive(struct nfs_server *server) -{ - if (atomic_dec_and_test(&server->active)) - wake_up(&server->active_wq); + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); } +EXPORT_SYMBOL_GPL(nfs_sb_active); -static void nfs_put_super(struct super_block *sb) +void nfs_sb_deactive(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); - /* - * Make sure there are no outstanding ops to this server. - * If so, wait for them to finish before allowing the - * unmount to continue. - */ - wait_event(server->active_wq, atomic_read(&server->active) == 0); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); } +EXPORT_SYMBOL_GPL(nfs_sb_deactive); /* * Deliver file system statistics to userspace */ -static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) +int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct nfs_server *server = NFS_SB(dentry->d_sb); unsigned char blockbits; unsigned long blockres; struct nfs_fh *fh = NFS_FH(dentry->d_inode); - struct nfs_fattr fattr; - struct nfs_fsstat res = { - .fattr = &fattr, - }; - int error; + struct nfs_fsstat res; + int error = -ENOMEM; - lock_kernel(); + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out_err; error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + if (unlikely(error == -ESTALE)) { + struct dentry *pd_dentry; + + pd_dentry = dget_parent(dentry); + if (pd_dentry != NULL) { + nfs_zap_caches(pd_dentry->d_inode); + dput(pd_dentry); + } + } + nfs_free_fattr(res.fattr); if (error < 0) goto out_err; + buf->f_type = NFS_SUPER_MAGIC; /* @@ -401,14 +481,13 @@ 
static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = server->namelen; - unlock_kernel(); return 0; out_err: - dprintk("%s: statfs error = %d\n", __FUNCTION__, -error); - unlock_kernel(); + dprintk("%s: statfs error = %d\n", __func__, -error); return error; } +EXPORT_SYMBOL_GPL(nfs_statfs); /* * Map the security flavour number to a name @@ -418,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) static const struct { rpc_authflavor_t flavour; const char *str; - } sec_flavours[] = { + } sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = { + /* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */ { RPC_AUTH_NULL, "null" }, { RPC_AUTH_UNIX, "sys" }, { RPC_AUTH_GSS_KRB5, "krb5" }, @@ -441,10 +521,108 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) return sec_flavours[i].str; } +static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + + seq_printf(m, ",mountproto="); + switch (sap->sa_family) { + case AF_INET: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + case AF_INET6: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP6); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP6); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } +} + +static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + + if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE) + return; + + switch (sap->sa_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in 
*)sap; + seq_printf(m, ",mountaddr=%pI4", &sin->sin_addr.s_addr); + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr); + break; + } + default: + if (showdefaults) + seq_printf(m, ",mountaddr=unspecified"); + } + + if (nfss->mountd_version || showdefaults) + seq_printf(m, ",mountvers=%u", nfss->mountd_version); + if ((nfss->mountd_port && + nfss->mountd_port != (unsigned short)NFS_UNSPEC_PORT) || + showdefaults) + seq_printf(m, ",mountport=%u", nfss->mountd_port); + + nfs_show_mountd_netid(m, nfss, showdefaults); +} + +#if IS_ENABLED(CONFIG_NFS_V4) +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct nfs_client *clp = nfss->nfs_client; + + seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); +} +#else +static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ +} +#endif + +static void nfs_show_nfs_version(struct seq_file *m, + unsigned int version, + unsigned int minorversion) +{ + seq_printf(m, ",vers=%u", version); + if (version == 4) + seq_printf(m, ".%u", minorversion); +} + /* * Describe the mount options in force on this server representation */ -static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) +static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) { static const struct proc_nfs_info { int flag; @@ -452,64 +630,180 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const char *nostr; } nfs_info[] = { { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_POSIX, ",posix", "" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, { NFS_MOUNT_NONLM, ",nolock", "" }, { NFS_MOUNT_NOACL, ",noacl", "" }, { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, - { NFS_MOUNT_UNSHARED, ",nosharecache", ""}, + { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, 
+ { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, { 0, NULL, NULL } }; const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss->nfs_client; - - seq_printf(m, ",vers=%d", clp->rpc_ops->version); - seq_printf(m, ",rsize=%d", nfss->rsize); - seq_printf(m, ",wsize=%d", nfss->wsize); - if (nfss->acregmin != 3*HZ || showdefaults) - seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ); - if (nfss->acregmax != 60*HZ || showdefaults) - seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ); - if (nfss->acdirmin != 30*HZ || showdefaults) - seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ); - if (nfss->acdirmax != 60*HZ || showdefaults) - seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ); + u32 version = clp->rpc_ops->version; + int local_flock, local_fcntl; + + nfs_show_nfs_version(m, version, clp->cl_minorversion); + seq_printf(m, ",rsize=%u", nfss->rsize); + seq_printf(m, ",wsize=%u", nfss->wsize); + if (nfss->bsize != 0) + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); + if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults) + seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); + if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults) + seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); + if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults) + seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); + if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) + seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) seq_puts(m, nfs_infop->str); else seq_puts(m, nfs_infop->nostr); } + rcu_read_lock(); seq_printf(m, ",proto=%s", - rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); + rcu_read_unlock(); + if (version == 4) { + if (nfss->port != NFS_PORT) + seq_printf(m, ",port=%u", nfss->port); + } else + if (nfss->port) + seq_printf(m, ",port=%u", nfss->port); + seq_printf(m, ",timeo=%lu", 10U * 
nfss->client->cl_timeout->to_initval / HZ); seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries); seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); + + if (version != 4) + nfs_show_mountd_options(m, nfss, showdefaults); + else + nfs_show_nfsv4_options(m, nfss, showdefaults); + + if (nfss->options & NFS_OPTION_FSCACHE) + seq_printf(m, ",fsc"); + + if (nfss->options & NFS_OPTION_MIGRATION) + seq_printf(m, ",migration"); + + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + seq_printf(m, ",lookupcache=none"); + else + seq_printf(m, ",lookupcache=pos"); + } + + local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK; + local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL; + + if (!local_flock && !local_fcntl) + seq_printf(m, ",local_lock=none"); + else if (local_flock && local_fcntl) + seq_printf(m, ",local_lock=all"); + else if (local_flock) + seq_printf(m, ",local_lock=flock"); + else + seq_printf(m, ",local_lock=posix"); } /* * Describe the mount options on this VFS mountpoint */ -static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +int nfs_show_options(struct seq_file *m, struct dentry *root) { - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct nfs_server *nfss = NFS_SB(root->d_sb); nfs_show_mount_options(m, nfss, 0); + rcu_read_lock(); seq_printf(m, ",addr=%s", rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + + return 0; +} +EXPORT_SYMBOL_GPL(nfs_show_options); + +#if IS_ENABLED(CONFIG_NFS_V4) +#ifdef CONFIG_NFS_V4_1 +static void show_sessions(struct seq_file *m, struct nfs_server *server) +{ + if (nfs4_has_session(server->nfs_client)) + seq_printf(m, ",sessions"); +} +#else +static void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif +#endif + +#ifdef CONFIG_NFS_V4_1 +static void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ + seq_printf(m, ",pnfs="); + if 
(server->pnfs_curr_ld) + seq_printf(m, "%s", server->pnfs_curr_ld->name); + else + seq_printf(m, "not configured"); +} + +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ + if (nfss->nfs_client && nfss->nfs_client->cl_implid) { + struct nfs41_impl_id *impl_id = nfss->nfs_client->cl_implid; + seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s'," + "date='%llu,%u'", + impl_id->name, impl_id->domain, + impl_id->date.seconds, impl_id->date.nseconds); + } +} +#else +#if IS_ENABLED(CONFIG_NFS_V4) +static void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ +} +#endif +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ +} +#endif +int nfs_show_devname(struct seq_file *m, struct dentry *root) +{ + char *page = (char *) __get_free_page(GFP_KERNEL); + char *devname, *dummy; + int err = 0; + if (!page) + return -ENOMEM; + devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0); + if (IS_ERR(devname)) + err = PTR_ERR(devname); + else + seq_escape(m, devname, " \t\n\\"); + free_page((unsigned long)page); + return err; +} +EXPORT_SYMBOL_GPL(nfs_show_devname); + +int nfs_show_path(struct seq_file *m, struct dentry *dentry) +{ + seq_puts(m, "/"); return 0; } +EXPORT_SYMBOL_GPL(nfs_show_path); /* * Present statistical information for this VFS mountpoint */ -static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +int nfs_show_stats(struct seq_file *m, struct dentry *root) { int i, cpu; - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct nfs_server *nfss = NFS_SB(root->d_sb); struct rpc_auth *auth = nfss->client->cl_auth; struct nfs_iostats totals = { }; @@ -519,36 +813,41 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) * Display all mount option settings */ seq_printf(m, "\n\topts:\t"); - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); - seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? 
",noatime" : ""); - seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); nfs_show_mount_options(m, nfss, 1); seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); + show_implementation_id(m, nfss); + seq_printf(m, "\n\tcaps:\t"); seq_printf(m, "caps=0x%x", nfss->caps); - seq_printf(m, ",wtmult=%d", nfss->wtmult); - seq_printf(m, ",dtsize=%d", nfss->dtsize); - seq_printf(m, ",bsize=%d", nfss->bsize); - seq_printf(m, ",namelen=%d", nfss->namelen); + seq_printf(m, ",wtmult=%u", nfss->wtmult); + seq_printf(m, ",dtsize=%u", nfss->dtsize); + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); -#ifdef CONFIG_NFS_V4 +#if IS_ENABLED(CONFIG_NFS_V4) if (nfss->nfs_client->rpc_ops->version == 4) { seq_printf(m, "\n\tnfsv4:\t"); seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]); seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + show_sessions(m, nfss); + show_pnfs(m, nfss); } #endif /* * Display security flavor in effect for this mount */ - seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor); + seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor); if (auth->au_flavor) - seq_printf(m, ",pseudoflavor=%d", auth->au_flavor); + seq_printf(m, ",pseudoflavor=%u", auth->au_flavor); /* * Display superblock I/O counters @@ -563,6 +862,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) totals.events[i] += stats->events[i]; for (i = 0; i < __NFSIOS_BYTESMAX; i++) totals.bytes[i] += stats->bytes[i]; +#ifdef CONFIG_NFS_FSCACHE + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + totals.fscache[i] += stats->fscache[i]; +#endif 
preempt_enable(); } @@ -573,24 +876,31 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); +#ifdef CONFIG_NFS_FSCACHE + if (nfss->options & NFS_OPTION_FSCACHE) { + seq_printf(m, "\n\tfsc:\t"); + for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) + seq_printf(m, "%Lu ", totals.fscache[i]); + } +#endif seq_printf(m, "\n"); rpc_print_iostats(m, nfss->client); return 0; } +EXPORT_SYMBOL_GPL(nfs_show_stats); /* * Begin unmount by attempting to remove all automounted mountpoints we added * in response to xdev traversals and referrals */ -static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) +void nfs_umount_begin(struct super_block *sb) { - struct nfs_server *server = NFS_SB(vfsmnt->mnt_sb); + struct nfs_server *server; struct rpc_clnt *rpc; - if (!(flags & MNT_FORCE)) - return; + server = NFS_SB(sb); /* -EIO all pending I/O */ rpc = server->client_acl; if (!IS_ERR(rpc)) @@ -599,23 +909,40 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) if (!IS_ERR(rpc)) rpc_killall_tasks(rpc); } +EXPORT_SYMBOL_GPL(nfs_umount_begin); -/* - * Set the port number in an address. Be agnostic about the address family. 
- */ -static void nfs_set_port(struct sockaddr *sap, unsigned short port) +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) { - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - ap->sin_port = htons(port); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - ap->sin6_port = htons(port); - break; + struct nfs_parsed_mount_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data) { + data->acregmin = NFS_DEF_ACREGMIN; + data->acregmax = NFS_DEF_ACREGMAX; + data->acdirmin = NFS_DEF_ACDIRMIN; + data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; + data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; + data->selected_flavor = RPC_AUTH_MAXFLAVOR; + data->minorversion = 0; + data->need_mount = true; + data->net = current->nsproxy->net_ns; + security_init_mnt_opts(&data->lsm_opts); } + return data; +} + +static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) +{ + if (data) { + kfree(data->client_address); + kfree(data->mount_server.hostname); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + kfree(data->fscache_uniq); + security_free_mnt_opts(&data->lsm_opts); + kfree(data); } } @@ -638,53 +965,238 @@ static int nfs_verify_server_address(struct sockaddr *addr) } } + dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); return 0; } /* - * Parse string addresses passed in via a mount option, - * and construct a sockaddr based on the result. + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. + */ +static void nfs_set_port(struct sockaddr *sap, int *port, + const unsigned short default_port) +{ + if (*port == NFS_UNSPEC_PORT) + *port = default_port; + + rpc_set_port(sap, *port); +} + +/* + * Sanity check the NFS transport protocol. 
* - * If address parsing fails, set the sockaddr's address - * family to AF_UNSPEC to force nfs_verify_server_address() - * to punt the mount. */ -static void nfs_parse_server_address(char *value, - struct sockaddr *sap, - size_t *len) +static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) { - if (strchr(value, ':')) { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - u8 *addr = (u8 *)&ap->sin6_addr.in6_u; + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + break; + default: + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; + } +} - ap->sin6_family = AF_INET6; - *len = sizeof(*ap); - if (in6_pton(value, -1, addr, '\0', NULL)) - return; - } else { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - u8 *addr = (u8 *)&ap->sin_addr.s_addr; +/* + * For text based NFSv2/v3 mounts, the mount protocol transport default + * settings should depend upon the specified NFS transport. + */ +static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) +{ + nfs_validate_transport_protocol(mnt); - ap->sin_family = AF_INET; - *len = sizeof(*ap); - if (in4_pton(value, -1, addr, '\0', NULL)) + if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || + mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) return; + switch (mnt->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; } +} - sap->sa_family = AF_UNSPEC; - *len = 0; +/* + * Add 'flavor' to 'auth_info' if not already present. 
+ * Returns true if 'flavor' ends up in the list, false otherwise + */ +static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, + rpc_authflavor_t flavor) +{ + unsigned int i; + unsigned int max_flavor_len = (sizeof(auth_info->flavors) / + sizeof(auth_info->flavors[0])); + + /* make sure this flavor isn't already in the list */ + for (i = 0; i < auth_info->flavor_len; i++) { + if (flavor == auth_info->flavors[i]) + return true; + } + + if (auth_info->flavor_len + 1 >= max_flavor_len) { + dfprintk(MOUNT, "NFS: too many sec= flavors\n"); + return false; + } + + auth_info->flavors[auth_info->flavor_len++] = flavor; + return true; +} + +/* + * Return true if 'match' is in auth_info or auth_info is empty. + * Return false otherwise. + */ +bool nfs_auth_info_match(const struct nfs_auth_info *auth_info, + rpc_authflavor_t match) +{ + int i; + + if (!auth_info->flavor_len) + return true; + + for (i = 0; i < auth_info->flavor_len; i++) { + if (auth_info->flavors[i] == match) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(nfs_auth_info_match); + +/* + * Parse the value of the 'sec=' option. 
+ */ +static int nfs_parse_security_flavors(char *value, + struct nfs_parsed_mount_data *mnt) +{ + substring_t args[MAX_OPT_ARGS]; + rpc_authflavor_t pseudoflavor; + char *p; + + dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); + + while ((p = strsep(&value, ":")) != NULL) { + switch (match_token(p, nfs_secflavor_tokens, args)) { + case Opt_sec_none: + pseudoflavor = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + pseudoflavor = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + pseudoflavor = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + pseudoflavor = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + pseudoflavor = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + pseudoflavor = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + pseudoflavor = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + pseudoflavor = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + pseudoflavor = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + pseudoflavor = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + pseudoflavor = RPC_AUTH_GSS_SPKMP; + break; + default: + dfprintk(MOUNT, + "NFS: sec= option '%s' not recognized\n", p); + return 0; + } + + if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor)) + return 0; + } + + return 1; +} + +static int nfs_parse_version_string(char *string, + struct nfs_parsed_mount_data *mnt, + substring_t *args) +{ + mnt->flags &= ~NFS_MOUNT_VER3; + switch (match_token(string, nfs_vers_tokens, args)) { + case Opt_vers_2: + mnt->version = 2; + break; + case Opt_vers_3: + mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; + break; + case Opt_vers_4: + /* Backward compatibility option. In future, + * the mount program should always supply + * a NFSv4 minor version number. 
+ */ + mnt->version = 4; + break; + case Opt_vers_4_0: + mnt->version = 4; + mnt->minorversion = 0; + break; + case Opt_vers_4_1: + mnt->version = 4; + mnt->minorversion = 1; + break; + case Opt_vers_4_2: + mnt->version = 4; + mnt->minorversion = 2; + break; + default: + return 0; + } + return 1; +} + +static int nfs_get_option_str(substring_t args[], char **option) +{ + kfree(*option); + *option = match_strdup(args); + return !*option; +} + +static int nfs_get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = kstrtoul(string, 10, option); + kfree(string); + + return rc; } /* * Error-check and convert a string of mount options from user space into - * a data structure + * a data structure. The whole mount string is processed; bad options are + * skipped as they are encountered. If there were no errors, return 1; + * otherwise return 0 (zero). */ static int nfs_parse_mount_options(char *raw, struct nfs_parsed_mount_data *mnt) { char *p, *string, *secdata; - unsigned short port = 0; - int rc; + int rc, sloppy = 0, invalid_option = 0; + unsigned short protofamily = AF_UNSPEC; + unsigned short mountfamily = AF_UNSPEC; if (!raw) { dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); @@ -708,7 +1220,8 @@ static int nfs_parse_mount_options(char *raw, while ((p = strsep(&raw, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; - int option, token; + unsigned long option; + int token; if (!*p) continue; @@ -717,15 +1230,16 @@ static int nfs_parse_mount_options(char *raw, token = match_token(p, nfs_mount_option_tokens, args); switch (token) { + + /* + * boolean options: foo/nofoo + */ case Opt_soft: mnt->flags |= NFS_MOUNT_SOFT; break; case Opt_hard: mnt->flags &= ~NFS_MOUNT_SOFT; break; - case Opt_intr: - case Opt_nointr: - break; case Opt_posix: mnt->flags |= NFS_MOUNT_POSIX; break; @@ -746,33 +1260,26 @@ static int nfs_parse_mount_options(char *raw, break; 
case Opt_lock: mnt->flags &= ~NFS_MOUNT_NONLM; + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); break; case Opt_nolock: mnt->flags |= NFS_MOUNT_NONLM; - break; - case Opt_v2: - mnt->flags &= ~NFS_MOUNT_VER3; - break; - case Opt_v3: - mnt->flags |= NFS_MOUNT_VER3; + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); break; case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - mnt->timeo = 7; - mnt->retrans = 5; break; case Opt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_rdma: mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - mnt->timeo = 600; - mnt->retrans = 2; + xprt_load_transport(p); break; case Opt_acl: mnt->flags &= ~NFS_MOUNT_NOACL; @@ -792,166 +1299,137 @@ static int nfs_parse_mount_options(char *raw, case Opt_nosharecache: mnt->flags |= NFS_MOUNT_UNSHARED; break; + case Opt_resvport: + mnt->flags &= ~NFS_MOUNT_NORESVPORT; + break; + case Opt_noresvport: + mnt->flags |= NFS_MOUNT_NORESVPORT; + break; + case Opt_fscache: + mnt->options |= NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_nofscache: + mnt->options &= ~NFS_OPTION_FSCACHE; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = NULL; + break; + case Opt_migration: + mnt->options |= NFS_OPTION_MIGRATION; + break; + case Opt_nomigration: + mnt->options &= ~NFS_OPTION_MIGRATION; /* clear only the migration bit; masking without ~ would drop every other option bit instead */ + break; + /* + * options that take numeric values + */ case Opt_port: - if (match_int(args, &option)) - return 0; - if (option < 0 || option > 65535) - return 0; - port = option; + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) + goto out_invalid_value; + mnt->nfs_server.port = option; break; case Opt_rsize: - if (match_int(args, &mnt->rsize)) - return 0; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->rsize = option; break; case
Opt_wsize: - if (match_int(args, &mnt->wsize)) - return 0; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->wsize = option; break; case Opt_bsize: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; mnt->bsize = option; break; case Opt_timeo: - if (match_int(args, &mnt->timeo)) - return 0; + if (nfs_get_option_ul(args, &option) || option == 0) + goto out_invalid_value; + mnt->timeo = option; break; case Opt_retrans: - if (match_int(args, &mnt->retrans)) - return 0; + if (nfs_get_option_ul(args, &option) || option == 0) + goto out_invalid_value; + mnt->retrans = option; break; case Opt_acregmin: - if (match_int(args, &mnt->acregmin)) - return 0; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmin = option; break; case Opt_acregmax: - if (match_int(args, &mnt->acregmax)) - return 0; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmax = option; break; case Opt_acdirmin: - if (match_int(args, &mnt->acdirmin)) - return 0; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acdirmin = option; break; case Opt_acdirmax: - if (match_int(args, &mnt->acdirmax)) - return 0; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acdirmax = option; break; case Opt_actimeo: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; - mnt->acregmin = - mnt->acregmax = - mnt->acdirmin = - mnt->acdirmax = option; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->acregmin = mnt->acregmax = + mnt->acdirmin = mnt->acdirmax = option; break; case Opt_namelen: - if (match_int(args, &mnt->namlen)) - return 0; + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + mnt->namlen = option; break; case Opt_mountport: - if (match_int(args, &option)) - return 0; - if (option < 0 || option > 65535) - return 0; + if (nfs_get_option_ul(args, 
&option) || + option > USHRT_MAX) + goto out_invalid_value; mnt->mount_server.port = option; break; case Opt_mountvers: - if (match_int(args, &option)) - return 0; - if (option < 0) - return 0; + if (nfs_get_option_ul(args, &option) || + option < NFS_MNT_VERSION || + option > NFS_MNT3_VERSION) + goto out_invalid_value; mnt->mount_server.version = option; break; - case Opt_nfsvers: - if (match_int(args, &option)) - return 0; - switch (option) { - case 2: - mnt->flags &= ~NFS_MOUNT_VER3; - break; - case 3: - mnt->flags |= NFS_MOUNT_VER3; - break; - default: - goto out_unrec_vers; - } + case Opt_minorversion: + if (nfs_get_option_ul(args, &option)) + goto out_invalid_value; + if (option > NFS4_MAX_MINOR_VERSION) + goto out_invalid_value; + mnt->minorversion = option; break; + /* + * options that take text values + */ + case Opt_nfsvers: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = nfs_parse_version_string(string, mnt, args); + kfree(string); + if (!rc) + goto out_invalid_value; + break; case Opt_sec: string = match_strdup(args); if (string == NULL) goto out_nomem; - token = match_token(string, nfs_secflavor_tokens, args); + rc = nfs_parse_security_flavors(string, mnt); kfree(string); - - /* - * The flags setting is for v2/v3. The flavor_len - * setting is for v4. v2/v3 also need to know the - * difference between NULL and UNIX. 
- */ - switch (token) { - case Opt_sec_none: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 0; - mnt->auth_flavors[0] = RPC_AUTH_NULL; - break; - case Opt_sec_sys: - mnt->flags &= ~NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 0; - mnt->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case Opt_sec_krb5: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; - break; - case Opt_sec_krb5i: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; - break; - case Opt_sec_krb5p: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; - break; - case Opt_sec_lkey: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; - break; - case Opt_sec_lkeyi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; - break; - case Opt_sec_lkeyp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; - break; - case Opt_sec_spkm: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; - break; - case Opt_sec_spkmi: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; - break; - case Opt_sec_spkmp: - mnt->flags |= NFS_MOUNT_SECFLAVOUR; - mnt->auth_flavor_len = 1; - mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; - break; - default: - goto out_unrec_sec; + if (!rc) { + dfprintk(MOUNT, "NFS: unrecognized " + "security flavor\n"); + return 0; } break; case Opt_proto: @@ -960,31 +1438,34 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); - kfree(string); + protofamily = AF_INET; switch (token) { + case Opt_xprt_udp6: + protofamily = AF_INET6; case Opt_xprt_udp: mnt->flags 
&= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - mnt->timeo = 7; - mnt->retrans = 5; break; + case Opt_xprt_tcp6: + protofamily = AF_INET6; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - mnt->timeo = 600; - mnt->retrans = 2; break; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - mnt->timeo = 600; - mnt->retrans = 2; + xprt_load_transport(string); break; default: - goto out_unrec_xprt; + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); + kfree(string); + return 0; } + kfree(string); break; case Opt_mountproto: string = match_strdup(args); @@ -994,64 +1475,189 @@ static int nfs_parse_mount_options(char *raw, nfs_xprt_protocol_tokens, args); kfree(string); + mountfamily = AF_INET; switch (token) { + case Opt_xprt_udp6: + mountfamily = AF_INET6; case Opt_xprt_udp: mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; break; + case Opt_xprt_tcp6: + mountfamily = AF_INET6; case Opt_xprt_tcp: mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma: /* not used for side protocols */ default: - goto out_unrec_xprt; + dfprintk(MOUNT, "NFS: unrecognized " + "transport protocol\n"); + return 0; } break; case Opt_addr: string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_server_address(string, (struct sockaddr *) - &mnt->nfs_server.address, - &mnt->nfs_server.addrlen); + mnt->nfs_server.addrlen = + rpc_pton(mnt->net, string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + sizeof(mnt->nfs_server.address)); kfree(string); + if (mnt->nfs_server.addrlen == 0) + goto out_invalid_address; break; case Opt_clientaddr: + if (nfs_get_option_str(args, &mnt->client_address)) + goto out_nomem; + break; + case Opt_mounthost: + if (nfs_get_option_str(args, + &mnt->mount_server.hostname)) + goto out_nomem; + break; + case Opt_mountaddr: string = match_strdup(args); if 
(string == NULL) goto out_nomem; - kfree(mnt->client_address); - mnt->client_address = string; + mnt->mount_server.addrlen = + rpc_pton(mnt->net, string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + sizeof(mnt->mount_server.address)); + kfree(string); + if (mnt->mount_server.addrlen == 0) + goto out_invalid_address; break; - case Opt_mounthost: + case Opt_lookupcache: string = match_strdup(args); if (string == NULL) goto out_nomem; - kfree(mnt->mount_server.hostname); - mnt->mount_server.hostname = string; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + return 0; + }; break; - case Opt_mountaddr: + case Opt_fscache_uniq: + if (nfs_get_option_str(args, &mnt->fscache_uniq)) + goto out_nomem; + mnt->options |= NFS_OPTION_FSCACHE; + break; + case Opt_local_lock: string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_server_address(string, (struct sockaddr *) - &mnt->mount_server.address, - &mnt->mount_server.addrlen); + token = match_token(string, nfs_local_lock_tokens, + args); kfree(string); + switch (token) { + case Opt_local_lock_all: + mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_local_lock_flock: + mnt->flags |= NFS_MOUNT_LOCAL_FLOCK; + break; + case Opt_local_lock_posix: + mnt->flags |= NFS_MOUNT_LOCAL_FCNTL; + break; + case Opt_local_lock_none: + mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + default: + dfprintk(MOUNT, "NFS: invalid " + "local_lock argument\n"); + 
return 0; + }; break; + /* + * Special options + */ + case Opt_sloppy: + sloppy = 1; + dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); + break; case Opt_userspace: case Opt_deprecated: + dfprintk(MOUNT, "NFS: ignoring mount option " + "'%s'\n", p); break; default: - goto out_unknown; + invalid_option = 1; + dfprintk(MOUNT, "NFS: unrecognized mount option " + "'%s'\n", p); } } - nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, port); + if (!sloppy && invalid_option) + return 0; + + if (mnt->minorversion && mnt->version != 4) + goto out_minorversion_mismatch; + + if (mnt->options & NFS_OPTION_MIGRATION && + (mnt->version != 4 || mnt->minorversion != 0)) + goto out_migration_misuse; + + /* + * verify that any proto=/mountproto= options match the address + * families in the addr=/mountaddr= options. + */ + if (protofamily != AF_UNSPEC && + protofamily != mnt->nfs_server.address.ss_family) + goto out_proto_mismatch; + + if (mountfamily != AF_UNSPEC) { + if (mnt->mount_server.addrlen) { + if (mountfamily != mnt->mount_server.address.ss_family) + goto out_mountproto_mismatch; + } else { + if (mountfamily != mnt->nfs_server.address.ss_family) + goto out_mountproto_mismatch; + } + } return 1; +out_mountproto_mismatch: + printk(KERN_INFO "NFS: mount server address does not match mountproto= " + "option\n"); + return 0; +out_proto_mismatch: + printk(KERN_INFO "NFS: server address does not match proto= option\n"); + return 0; +out_invalid_address: + printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); + return 0; +out_invalid_value: + printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); + return 0; +out_minorversion_mismatch: + printk(KERN_INFO "NFS: mount option vers=%u does not support " + "minorversion=%u\n", mnt->version, mnt->minorversion); + return 0; +out_migration_misuse: + printk(KERN_INFO + "NFS: 'migration' not supported for this NFS version\n"); + return 0; out_nomem: printk(KERN_INFO "NFS: not enough memory to parse option\n"); 
return 0; @@ -1059,20 +1665,42 @@ out_security_failure: free_secdata(secdata); printk(KERN_INFO "NFS: security options invalid: %d\n", rc); return 0; -out_unrec_vers: - printk(KERN_INFO "NFS: unrecognized NFS version number\n"); - return 0; +} -out_unrec_xprt: - printk(KERN_INFO "NFS: unrecognized transport protocol\n"); - return 0; +/* + * Ensure that a specified authtype in args->auth_info is supported by + * the server. Returns 0 and sets args->selected_flavor if it's ok, and + * -EACCES if not. + */ +static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, + rpc_authflavor_t *server_authlist, unsigned int count) +{ + rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; + unsigned int i; -out_unrec_sec: - printk(KERN_INFO "NFS: unrecognized security flavor\n"); - return 0; + /* + * If the sec= mount option is used, the specified flavor or AUTH_NULL + * must be in the list returned by the server. + * + * AUTH_NULL has a special meaning when it's in the server list - it + * means that the server will ignore the rpc creds, so any flavor + * can be used. + */ + for (i = 0; i < count; i++) { + flavor = server_authlist[i]; + + if (nfs_auth_info_match(&args->auth_info, flavor) || + flavor == RPC_AUTH_NULL) + goto out; + } + + dfprintk(MOUNT, + "NFS: specified auth flavors not supported by server\n"); + return -EACCES; -out_unknown: - printk(KERN_INFO "NFS: unknown mount option: %s\n", p); +out: + args->selected_flavor = flavor; + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); return 0; } @@ -1080,56 +1708,226 @@ out_unknown: * Use the remote server's MOUNT service to request the NFS file handle * corresponding to the provided path. 
*/ -static int nfs_try_mount(struct nfs_parsed_mount_data *args, - struct nfs_fh *root_fh) +static int nfs_request_mount(struct nfs_parsed_mount_data *args, + struct nfs_fh *root_fh, + rpc_authflavor_t *server_authlist, + unsigned int *server_authlist_len) { - struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address; - char *hostname; + struct nfs_mount_request request = { + .sap = (struct sockaddr *) + &args->mount_server.address, + .dirpath = args->nfs_server.export_path, + .protocol = args->mount_server.protocol, + .fh = root_fh, + .noresvport = args->flags & NFS_MOUNT_NORESVPORT, + .auth_flav_len = server_authlist_len, + .auth_flavs = server_authlist, + .net = args->net, + }; int status; if (args->mount_server.version == 0) { - if (args->flags & NFS_MOUNT_VER3) - args->mount_server.version = NFS_MNT3_VERSION; - else - args->mount_server.version = NFS_MNT_VERSION; + switch (args->version) { + default: + args->mount_server.version = NFS_MNT3_VERSION; + break; + case 2: + args->mount_server.version = NFS_MNT_VERSION; + } } + request.version = args->mount_server.version; if (args->mount_server.hostname) - hostname = args->mount_server.hostname; + request.hostname = args->mount_server.hostname; else - hostname = args->nfs_server.hostname; + request.hostname = args->nfs_server.hostname; /* * Construct the mount server's address. */ if (args->mount_server.address.ss_family == AF_UNSPEC) { - memcpy(sap, &args->nfs_server.address, + memcpy(request.sap, &args->nfs_server.address, args->nfs_server.addrlen); args->mount_server.addrlen = args->nfs_server.addrlen; } + request.salen = args->mount_server.addrlen; + nfs_set_port(request.sap, &args->mount_server.port, 0); /* - * autobind will be used if mount_server.port == 0 + * Now ask the mount server to map our export path + * to a file handle. 
*/ - nfs_set_port(sap, args->mount_server.port); + status = nfs_mount(&request); + if (status != 0) { + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", + request.hostname, status); + return status; + } + + return 0; +} + +static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + int status; + unsigned int i; + bool tried_auth_unix = false; + bool auth_null_in_list = false; + struct nfs_server *server = ERR_PTR(-EACCES); + struct nfs_parsed_mount_data *args = mount_info->parsed; + rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS]; + unsigned int authlist_len = ARRAY_SIZE(authlist); + + status = nfs_request_mount(args, mount_info->mntfh, authlist, + &authlist_len); + if (status) + return ERR_PTR(status); /* - * Now ask the mount server to map our export path - * to a file handle. + * Was a sec= authflavor specified in the options? First, verify + * whether the server supports it, and then just try to use it if so. */ - status = nfs_mount(sap, - args->mount_server.addrlen, - hostname, - args->nfs_server.export_path, - args->mount_server.version, - args->mount_server.protocol, - root_fh); - if (status == 0) - return 0; + if (args->auth_info.flavor_len > 0) { + status = nfs_verify_authflavors(args, authlist, authlist_len); + dfprintk(MOUNT, "NFS: using auth flavor %u\n", + args->selected_flavor); + if (status) + return ERR_PTR(status); + return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + } + + /* + * No sec= option was provided. RFC 2623, section 2.7 suggests we + * SHOULD prefer the flavor listed first. However, some servers list + * AUTH_NULL first. Avoid ever choosing AUTH_NULL. 
+ */ + for (i = 0; i < authlist_len; ++i) { + rpc_authflavor_t flavor; + struct rpcsec_gss_info info; + + flavor = authlist[i]; + switch (flavor) { + case RPC_AUTH_UNIX: + tried_auth_unix = true; + break; + case RPC_AUTH_NULL: + auth_null_in_list = true; + continue; + default: + if (rpcauth_get_gssinfo(flavor, &info) != 0) + continue; + /* Fallthrough */ + } + dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); + args->selected_flavor = flavor; + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + if (!IS_ERR(server)) + return server; + } + + /* + * Nothing we tried so far worked. At this point, give up if we've + * already tried AUTH_UNIX or if the server's list doesn't contain + * AUTH_NULL + */ + if (tried_auth_unix || !auth_null_in_list) + return server; + + /* Last chance! Try AUTH_UNIX */ + dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); + args->selected_flavor = RPC_AUTH_UNIX; + return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); +} + +struct dentry *nfs_try_mount(int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server; + + if (mount_info->parsed->need_mount) + server = nfs_try_mount_request(mount_info, nfs_mod); + else + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); - dfprintk(MOUNT, "NFS: unable to mount server %s, error %d", - hostname, status); - return status; + if (IS_ERR(server)) + return ERR_CAST(server); + + return nfs_fs_mount_common(server, flags, dev_name, mount_info, nfs_mod); +} +EXPORT_SYMBOL_GPL(nfs_try_mount); + +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. 
+ */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) +{ + size_t len; + char *end; + + /* Is the host name protected with square brackets? */ + if (*dev_name == '[') { + end = strchr(++dev_name, ']'); + if (end == NULL || end[1] != ':') + goto out_bad_devname; + + len = end - dev_name; + end++; + } else { + char *comma; + + end = strchr(dev_name, ':'); + if (end == NULL) + goto out_bad_devname; + len = end - dev_name; + + /* kill possible hostname list: not supported */ + comma = strchr(dev_name, ','); + if (comma != NULL && comma < end) + *comma = 0; + } + + if (len > maxnamlen) + goto out_hostname; + + /* N.B. caller will free nfs_server.hostname in all cases */ + *hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (*hostname == NULL) + goto out_nomem; + len = strlen(++end); /* step past the colon separator */ + if (len > maxpathlen) + goto out_path; + *export_path = kstrndup(end, len, GFP_KERNEL); + if (!*export_path) + goto out_nomem; + + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); + return 0; + +out_bad_devname: + dfprintk(MOUNT, "NFS: device name not in host:path format\n"); + return -EINVAL; + +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); + return -ENOMEM; + +out_hostname: + dfprintk(MOUNT, "NFS: server hostname too long\n"); + return -ENAMETOOLONG; + +out_path: + dfprintk(MOUNT, "NFS: export pathname too long\n"); + return -ENAMETOOLONG; } /* @@ -1148,30 +1946,19 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, * + breaking back: trying proto=udp after proto=tcp, v2 after v3, * mountproto=tcp after mountproto=udp, and so on */ -static int nfs_validate_mount_data(void *options, - struct nfs_parsed_mount_data *args, - struct nfs_fh *mntfh, - const char *dev_name) +static int nfs23_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) { struct nfs_mount_data *data = (struct nfs_mount_data
*)options; - - memset(args, 0, sizeof(*args)); + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; if (data == NULL) goto out_no_data; - args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); - args->rsize = NFS_MAX_FILE_IO_SIZE; - args->wsize = NFS_MAX_FILE_IO_SIZE; - args->timeo = 600; - args->retrans = 2; - args->acregmin = 3; - args->acregmax = 60; - args->acdirmin = 30; - args->acdirmax = 60; - args->mount_server.protocol = XPRT_TRANSPORT_UDP; - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; - + args->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: data->namlen = 0; @@ -1182,19 +1969,24 @@ static int nfs_validate_mount_data(void *options, goto out_no_v3; data->root.size = NFS2_FHSIZE; memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + /* Turn off security negotiation */ + extra_flags |= NFS_MOUNT_SECFLAVOUR; case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; case 5: memset(data->context, 0, sizeof(data->context)); case 6: - if (data->flags & NFS_MOUNT_VER3) + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; mntfh->size = data->root.size; - else + args->version = 3; + } else { mntfh->size = NFS2_FHSIZE; + args->version = 2; + } - if (mntfh->size > sizeof(mntfh->data)) - goto out_invalid_fh; memcpy(mntfh->data, data->root.data, mntfh->size); if (mntfh->size < sizeof(mntfh->data)) @@ -1205,22 +1997,22 @@ static int nfs_validate_mount_data(void *options, * Translate to nfs_parsed_mount_data, which nfs_fill_super * can deal with. 
*/ - args->flags = data->flags; + args->flags = data->flags & NFS_MOUNT_FLAGMASK; + args->flags |= extra_flags; args->rsize = data->rsize; args->wsize = data->wsize; - args->flags = data->flags; args->timeo = data->timeo; args->retrans = data->retrans; args->acregmin = data->acregmin; args->acregmax = data->acregmax; args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; + args->need_mount = false; - memcpy(&args->nfs_server.address, &data->addr, - sizeof(data->addr)); + memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + args->nfs_server.port = ntohs(data->addr.sin_port); + if (!nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) @@ -1229,8 +2021,20 @@ static int nfs_validate_mount_data(void *options, args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; args->bsize = data->bsize; - args->auth_flavors[0] = data->pseudoflavor; + if (data->flags & NFS_MOUNT_SECFLAVOUR) + args->selected_flavor = data->pseudoflavor; + else + args->selected_flavor = RPC_AUTH_UNIX; + if (!args->nfs_server.hostname) + goto out_nomem; + + if (!(data->flags & NFS_MOUNT_NONLM)) + args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + else + args->flags |= (NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); /* * The legacy version 6 binary mount data from userspace has a * field used only to transport selinux information into the @@ -1258,43 +2062,12 @@ static int nfs_validate_mount_data(void *options, } break; - default: { - unsigned int len; - char *c; - int status; - - if (nfs_parse_mount_options((char *)options, args) == 0) - return -EINVAL; - - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) - goto out_no_address; - - c = strchr(dev_name, ':'); - if (c == NULL) - return -EINVAL; - len = c - dev_name; - /* N.B. 
caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); - - c++; - if (strlen(c) > NFS_MAXPATHLEN) - return -ENAMETOOLONG; - args->nfs_server.export_path = c; - - status = nfs_try_mount(args, mntfh); - if (status) - return status; - - break; - } + default: + return NFS_TEXT_DATA; } - if (!(args->flags & NFS_MOUNT_SECFLAVOUR)) - args->auth_flavors[0] = RPC_AUTH_UNIX; - -#ifndef CONFIG_NFS_V3 - if (args->flags & NFS_MOUNT_VER3) +#if !IS_ENABLED(CONFIG_NFS_V3) + if (args->version == 3) goto out_v3_not_compiled; #endif /* !CONFIG_NFS_V3 */ @@ -1313,12 +2086,16 @@ out_no_sec: dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); return -EINVAL; -#ifndef CONFIG_NFS_V3 +#if !IS_ENABLED(CONFIG_NFS_V3) out_v3_not_compiled: dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); return -EPROTONOSUPPORT; #endif /* !CONFIG_NFS_V3 */ +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); + return -ENOMEM; + out_no_address: dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); return -EINVAL; @@ -1328,10 +2105,191 @@ out_invalid_fh: return -EINVAL; } +#if IS_ENABLED(CONFIG_NFS_V4) +static int nfs_validate_mount_data(struct file_system_type *fs_type, + void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + if (fs_type == &nfs_fs_type) + return nfs23_validate_mount_data(options, args, mntfh, dev_name); + return nfs4_validate_mount_data(options, args, dev_name); +} +#else +static int nfs_validate_mount_data(struct file_system_type *fs_type, + void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + return nfs23_validate_mount_data(options, args, mntfh, dev_name); +} +#endif + +static int nfs_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + int port = 0; + int max_namelen = PAGE_SIZE; + int 
max_pathlen = NFS_MAXPATHLEN; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + if (nfs_parse_mount_options((char *)options, args) == 0) + return -EINVAL; + + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if (args->version == 4) { +#if IS_ENABLED(CONFIG_NFS_V4) + port = NFS_PORT; + max_namelen = NFS4_MAXNAMLEN; + max_pathlen = NFS4_MAXPATHLEN; + nfs_validate_transport_protocol(args); + if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; + nfs4_validate_mount_flags(args); +#else + goto out_v4_not_compiled; +#endif /* CONFIG_NFS_V4 */ + } else + nfs_set_mount_transport_protocol(args); + + nfs_set_port(sap, &args->nfs_server.port, port); + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + max_namelen, + &args->nfs_server.export_path, + max_pathlen); + +#if !IS_ENABLED(CONFIG_NFS_V4) +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#else +out_invalid_transport_udp: + dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); + return -EINVAL; +#endif /* !CONFIG_NFS_V4 */ + +out_no_address: + dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); + return -EINVAL; +} + +#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ + | NFS_MOUNT_SECURE \ + | NFS_MOUNT_TCP \ + | NFS_MOUNT_VER3 \ + | NFS_MOUNT_KERBEROS \ + | NFS_MOUNT_NONLM \ + | NFS_MOUNT_BROKEN_SUID \ + | NFS_MOUNT_STRICTLOCK \ + | NFS_MOUNT_UNSHARED \ + | NFS_MOUNT_NORESVPORT \ + | NFS_MOUNT_LEGACY_INTERFACE) + +static int +nfs_compare_remount_data(struct nfs_server *nfss, + struct nfs_parsed_mount_data *data) +{ + if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK || + data->rsize != nfss->rsize || + data->wsize != nfss->wsize || + data->version != nfss->nfs_client->rpc_ops->version || + data->minorversion != nfss->nfs_client->cl_minorversion || + data->retrans != nfss->client->cl_timeout->to_retries || + data->selected_flavor != 
nfss->client->cl_auth->au_flavor || + data->acregmin != nfss->acregmin / HZ || + data->acregmax != nfss->acregmax / HZ || + data->acdirmin != nfss->acdirmin / HZ || + data->acdirmax != nfss->acdirmax / HZ || + data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || + data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || + !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + (struct sockaddr *)&nfss->nfs_client->cl_addr)) + return -EINVAL; + + return 0; +} + +int +nfs_remount(struct super_block *sb, int *flags, char *raw_data) +{ + int error; + struct nfs_server *nfss = sb->s_fs_info; + struct nfs_parsed_mount_data *data; + struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; + struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; + u32 nfsvers = nfss->nfs_client->rpc_ops->version; + + sync_filesystem(sb); + + /* + * Userspace mount programs that send binary options generally send + * them populated with default values. We have no way to know which + * ones were explicitly specified. Fall back to legacy behavior and + * just return success. 
+ */ + if ((nfsvers == 4 && (!options4 || options4->version == 1)) || + (nfsvers <= 3 && (!options || (options->version >= 1 && + options->version <= 6)))) + return 0; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + /* fill out struct with values from existing mount */ + data->flags = nfss->flags; + data->rsize = nfss->rsize; + data->wsize = nfss->wsize; + data->retrans = nfss->client->cl_timeout->to_retries; + data->selected_flavor = nfss->client->cl_auth->au_flavor; + data->auth_info = nfss->auth_info; + data->acregmin = nfss->acregmin / HZ; + data->acregmax = nfss->acregmax / HZ; + data->acdirmin = nfss->acdirmin / HZ; + data->acdirmax = nfss->acdirmax / HZ; + data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; + data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + data->version = nfsvers; + data->minorversion = nfss->nfs_client->cl_minorversion; + data->net = current->nsproxy->net_ns; + memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, + data->nfs_server.addrlen); + + /* overwrite those values with any that were specified */ + error = -EINVAL; + if (!nfs_parse_mount_options((char *)options, data)) + goto out; + + /* + * noac is a special case. It implies -o sync, but that's not + * necessarily reflected in the mtab options. do_remount_sb + * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * remount options, so we have to explicitly reset it. 
+ */ + if (data->flags & NFS_MOUNT_NOAC) + *flags |= MS_SYNCHRONOUS; + + /* compare new mount options with old ones */ + error = nfs_compare_remount_data(nfss, data); +out: + kfree(data); + return error; +} +EXPORT_SYMBOL_GPL(nfs_remount); + /* * Initialise the common bits of the superblock */ -static inline void nfs_initialise_sb(struct super_block *sb) +inline void nfs_initialise_sb(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); @@ -1339,14 +2297,13 @@ static inline void nfs_initialise_sb(struct super_block *sb) /* We probably want something more informative here */ snprintf(sb->s_id, sizeof(sb->s_id), - "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); + "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev)); if (sb->s_blocksize == 0) sb->s_blocksize = nfs_block_bits(server->wsize, &sb->s_blocksize_bits); - if (server->flags & NFS_MOUNT_NOAC) - sb->s_flags |= MS_SYNCHRONOUS; + sb->s_bdi = &server->backing_dev_info; nfs_super_set_maxbytes(sb, server->maxfilesize); } @@ -1354,17 +2311,19 @@ static inline void nfs_initialise_sb(struct super_block *sb) /* * Finish setting up an NFS2/3 superblock */ -static void nfs_fill_super(struct super_block *sb, - struct nfs_parsed_mount_data *data) +void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) { + struct nfs_parsed_mount_data *data = mount_info->parsed; struct nfs_server *server = NFS_SB(sb); sb->s_blocksize_bits = 0; sb->s_blocksize = 0; - if (data->bsize) + sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr; + sb->s_op = server->nfs_client->cl_nfs_mod->sops; + if (data && data->bsize) sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); - if (server->flags & NFS_MOUNT_VER3) { + if (server->nfs_client->rpc_ops->version != 2) { /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. 
*/ @@ -1372,36 +2331,35 @@ static void nfs_fill_super(struct super_block *sb, sb->s_time_gran = 1; } - sb->s_op = &nfs_sops; nfs_initialise_sb(sb); } +EXPORT_SYMBOL_GPL(nfs_fill_super); /* - * Finish setting up a cloned NFS2/3 superblock + * Finish setting up a cloned NFS2/3/4 superblock */ -static void nfs_clone_super(struct super_block *sb, - const struct super_block *old_sb) +void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) { + const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); sb->s_blocksize_bits = old_sb->s_blocksize_bits; sb->s_blocksize = old_sb->s_blocksize; sb->s_maxbytes = old_sb->s_maxbytes; + sb->s_xattr = old_sb->s_xattr; + sb->s_op = old_sb->s_op; + sb->s_time_gran = 1; - if (server->flags & NFS_MOUNT_VER3) { + if (server->nfs_client->rpc_ops->version != 2) { /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ sb->s_flags |= MS_POSIXACL; - sb->s_time_gran = 1; } - sb->s_op = old_sb->s_op; nfs_initialise_sb(sb); } -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) - static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; @@ -1412,7 +2370,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->nfs_client != b->nfs_client) goto Ebusy; - if (a->flags != b->flags) + if ((a->flags ^ b->flags) & NFS_MOUNT_CMP_FLAGMASK) goto Ebusy; if (a->wsize != b->wsize) goto Ebusy; @@ -1426,7 +2384,8 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + if (b->auth_info.flavor_len > 0 && + clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; Ebusy: @@ -1446,6 +2405,7 @@ static int nfs_set_super(struct 
super_block *s, void *data) s->s_flags = sb_mntdata->mntflags; s->s_fs_info = server; + s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; ret = set_anon_super(s, server); if (ret == 0) server->s_dev = s->s_dev; @@ -1505,75 +2465,144 @@ static int nfs_compare_super(struct super_block *sb, void *data) return nfs_compare_mount_options(sb, server, mntflags); } -static int nfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +#ifdef CONFIG_NFS_FSCACHE +static void nfs_get_cache_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *parsed, + struct nfs_clone_mount *cloned) +{ + struct nfs_server *nfss = NFS_SB(sb); + char *uniq = NULL; + int ulen = 0; + + nfss->fscache_key = NULL; + nfss->fscache = NULL; + + if (parsed) { + if (!(parsed->options & NFS_OPTION_FSCACHE)) + return; + if (parsed->fscache_uniq) { + uniq = parsed->fscache_uniq; + ulen = strlen(parsed->fscache_uniq); + } + } else if (cloned) { + struct nfs_server *mnt_s = NFS_SB(cloned->sb); + if (!(mnt_s->options & NFS_OPTION_FSCACHE)) + return; + if (mnt_s->fscache_key) { + uniq = mnt_s->fscache_key->key.uniquifier; + ulen = mnt_s->fscache_key->key.uniq_len; + }; + } else + return; + + nfs_fscache_get_super_cookie(sb, uniq, ulen); +} +#else +static void nfs_get_cache_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *parsed, + struct nfs_clone_mount *cloned) +{ +} +#endif + +static int nfs_bdi_register(struct nfs_server *server) +{ + return bdi_register_dev(&server->backing_dev_info, server->s_dev); +} + +int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, + struct nfs_mount_info *mount_info) +{ + int error; + unsigned long kflags = 0, kflags_out = 0; + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + + error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts, + kflags, &kflags_out); + if (error) + goto err; + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL 
&& + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; +err: + return error; +} +EXPORT_SYMBOL_GPL(nfs_set_sb_security); + +int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, + struct nfs_mount_info *mount_info) +{ + /* clone any lsm security options from the parent to the new sb */ + if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) + return -ESTALE; + return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); +} +EXPORT_SYMBOL_GPL(nfs_clone_sb_security); + +struct dentry *nfs_fs_mount_common(struct nfs_server *server, + int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) { - struct nfs_server *server = NULL; struct super_block *s; - struct nfs_fh mntfh; - struct nfs_parsed_mount_data data; - struct dentry *mntroot; + struct dentry *mntroot = ERR_PTR(-ENOMEM); int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, + .server = server, }; int error; - security_init_mnt_opts(&data.lsm_opts); - - /* Validate the mount data */ - error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); - if (error < 0) - goto out; - - /* Get a volume representation */ - server = nfs_create_server(&data, &mntfh); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out; - } - sb_mntdata.server = server; - if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + /* -o noac implies -o sync */ + if (server->flags & NFS_MOUNT_NOAC) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + + if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) + if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) + sb_mntdata.mntflags |= MS_SYNCHRONOUS; + /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); + s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, 
&sb_mntdata); if (IS_ERR(s)) { - error = PTR_ERR(s); + mntroot = ERR_CAST(s); goto out_err_nosb; } if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) { + mntroot = ERR_PTR(error); + goto error_splat_bdi; + } + server->super = s; } if (!s->s_root) { /* initial superblock/root creation */ - nfs_fill_super(s, &data); + mount_info->fill_super(s, mount_info); + nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); } - mntroot = nfs_get_root(s, &mntfh); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); + mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); + if (IS_ERR(mntroot)) goto error_splat_super; - } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = mount_info->set_security(s, mntroot, mount_info); if (error) goto error_splat_root; s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; - error = 0; out: - kfree(data.nfs_server.hostname); - kfree(data.mount_server.hostname); - security_free_mnt_opts(&data.lsm_opts); - return error; + return mntroot; out_err_nosb: nfs_free_server(server); @@ -1581,150 +2610,123 @@ out_err_nosb: error_splat_root: dput(mntroot); + mntroot = ERR_PTR(error); error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: + deactivate_locked_super(s); goto out; } +EXPORT_SYMBOL_GPL(nfs_fs_mount_common); -/* - * Destroy an NFS2/3 superblock - */ -static void nfs_kill_super(struct super_block *s) -{ - struct nfs_server *server = NFS_SB(s); - - kill_anon_super(s); - nfs_free_server(server); -} - -/* - * Clone an NFS2/3 server record on xdev traversal (FSID-change) - */ -static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +struct dentry *nfs_fs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) { - 
struct nfs_clone_mount *data = raw_data; - struct super_block *s; - struct nfs_server *server; - struct dentry *mntroot; - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, + struct nfs_mount_info mount_info = { + .fill_super = nfs_fill_super, + .set_security = nfs_set_sb_security, }; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + struct nfs_subversion *nfs_mod; int error; - dprintk("--> nfs_xdev_get_sb()\n"); - - /* create a new volume representation */ - server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err_noserver; - } - sb_mntdata.server = server; - - if (server->flags & NFS_MOUNT_UNSHARED) - compare_super = NULL; - - /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_err_nosb; - } - - if (s->s_fs_info != server) { - nfs_free_server(server); - server = NULL; - } + mount_info.parsed = nfs_alloc_parsed_mount_data(); + mount_info.mntfh = nfs_alloc_fhandle(); + if (mount_info.parsed == NULL || mount_info.mntfh == NULL) + goto out; - if (!s->s_root) { - /* initial superblock/root creation */ - nfs_clone_super(s, data->sb); + /* Validate the mount data */ + error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name); + if (error == NFS_TEXT_DATA) + error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name); + if (error < 0) { + mntroot = ERR_PTR(error); + goto out; } - mntroot = nfs_get_root(s, data->fh); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); - goto error_splat_super; - } - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { - dput(mntroot); - error = -ESTALE; - goto error_splat_super; + nfs_mod = get_nfs_version(mount_info.parsed->version); + if 
(IS_ERR(nfs_mod)) { + mntroot = ERR_CAST(nfs_mod); + goto out; } - s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; - - /* clone any lsm security options from the parent to the new sb */ - security_sb_clone_mnt_opts(data->sb, s); + mntroot = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod); - dprintk("<-- nfs_xdev_get_sb() = 0\n"); - return 0; - -out_err_nosb: - nfs_free_server(server); -out_err_noserver: - dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error); - return error; - -error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); - dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); - return error; + put_nfs_version(nfs_mod); +out: + nfs_free_parsed_mount_data(mount_info.parsed); + nfs_free_fhandle(mount_info.mntfh); + return mntroot; } - -#ifdef CONFIG_NFS_V4 +EXPORT_SYMBOL_GPL(nfs_fs_mount); /* - * Finish setting up a cloned NFS4 superblock + * Ensure that we unregister the bdi before kill_anon_super + * releases the device name */ -static void nfs4_clone_super(struct super_block *sb, - const struct super_block *old_sb) +void nfs_put_super(struct super_block *s) { - sb->s_blocksize_bits = old_sb->s_blocksize_bits; - sb->s_blocksize = old_sb->s_blocksize; - sb->s_maxbytes = old_sb->s_maxbytes; - sb->s_time_gran = 1; - sb->s_op = old_sb->s_op; - nfs_initialise_sb(sb); + struct nfs_server *server = NFS_SB(s); + + bdi_unregister(&server->backing_dev_info); } +EXPORT_SYMBOL_GPL(nfs_put_super); /* - * Set up an NFS4 superblock + * Destroy an NFS2/3 superblock */ -static void nfs4_fill_super(struct super_block *sb) +void nfs_kill_super(struct super_block *s) { - sb->s_time_gran = 1; - sb->s_op = &nfs4_sops; - nfs_initialise_sb(sb); + struct nfs_server *server = NFS_SB(s); + + kill_anon_super(s); + nfs_fscache_release_super_cookie(s); + nfs_free_server(server); } +EXPORT_SYMBOL_GPL(nfs_kill_super); /* - * If the user didn't specify a port, set the port number to - * the NFS version 4 default port. 
+ * Clone an NFS2/3/4 server record on xdev traversal (FSID-change) */ -static void nfs4_default_port(struct sockaddr *sap) +static struct dentry * +nfs_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - if (ap->sin_port == 0) - ap->sin_port = htons(NFS_PORT); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - if (ap->sin6_port == 0) - ap->sin6_port = htons(NFS_PORT); - break; - } - } + struct nfs_clone_mount *data = raw_data; + struct nfs_mount_info mount_info = { + .fill_super = nfs_clone_super, + .set_security = nfs_clone_sb_security, + .cloned = data, + }; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; + + dprintk("--> nfs_xdev_mount()\n"); + + mount_info.mntfh = mount_info.cloned->fh; + + /* create a new volume representation */ + server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); + + if (IS_ERR(server)) + mntroot = ERR_CAST(server); + else + mntroot = nfs_fs_mount_common(server, flags, + dev_name, &mount_info, nfs_mod); + + dprintk("<-- nfs_xdev_mount() = %ld\n", + IS_ERR(mntroot) ? 
PTR_ERR(mntroot) : 0L); + return mntroot; +} + +#if IS_ENABLED(CONFIG_NFS_V4) + +static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) +{ + args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3| + NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL); } /* @@ -1734,55 +2736,39 @@ static int nfs4_validate_mount_data(void *options, struct nfs_parsed_mount_data *args, const char *dev_name) { - struct sockaddr_in *ap; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; - args->rsize = NFS_MAX_FILE_IO_SIZE; - args->wsize = NFS_MAX_FILE_IO_SIZE; - args->timeo = 600; - args->retrans = 2; - args->acregmin = 3; - args->acregmax = 60; - args->acdirmin = 30; - args->acdirmax = 60; - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; + args->version = 4; switch (data->version) { case 1: - ap = (struct sockaddr_in *)&args->nfs_server.address; if (data->host_addrlen > sizeof(args->nfs_server.address)) goto out_no_address; if (data->host_addrlen == 0) goto out_no_address; args->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(ap, data->host_addr, data->host_addrlen)) + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) return -EFAULT; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; + args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - nfs4_default_port((struct sockaddr *) - &args->nfs_server.address); - - switch (data->auth_flavourlen) { - case 0: - args->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case 1: - if (copy_from_user(&args->auth_flavors[0], + if (data->auth_flavourlen) { + rpc_authflavor_t pseudoflavor; + if (data->auth_flavourlen > 1) + goto out_inval_auth; + if (copy_from_user(&pseudoflavor, data->auth_flavours, - sizeof(args->auth_flavors[0]))) + 
sizeof(pseudoflavor))) return -EFAULT; - break; - default: - goto out_inval_auth; - } + args->selected_flavor = pseudoflavor; + } else + args->selected_flavor = RPC_AUTH_UNIX; c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(c)) @@ -1815,57 +2801,13 @@ static int nfs4_validate_mount_data(void *options, args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; args->nfs_server.protocol = data->proto; + nfs_validate_transport_protocol(args); + if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) + goto out_invalid_transport_udp; break; - default: { - unsigned int len; - - if (nfs_parse_mount_options((char *)options, args) == 0) - return -EINVAL; - - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) - return -EINVAL; - - nfs4_default_port((struct sockaddr *) - &args->nfs_server.address); - - switch (args->auth_flavor_len) { - case 0: - args->auth_flavors[0] = RPC_AUTH_UNIX; - break; - case 1: - break; - default: - goto out_inval_auth; - } - - /* - * Split "dev_name" into "hostname:mntpath". - */ - c = strchr(dev_name, ':'); - if (c == NULL) - return -EINVAL; - /* while calculating len, pretend ':' is '\0' */ - len = c - dev_name; - if (len > NFS4_MAXNAMLEN) - return -ENAMETOOLONG; - /* N.B. 
caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); - - c++; /* step over the ':' */ - len = strlen(c); - if (len > NFS4_MAXPATHLEN) - return -ENAMETOOLONG; - args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL); - - dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); - - if (args->client_address == NULL) - goto out_no_client_address; - - break; - } + default: + return NFS_TEXT_DATA; } return 0; @@ -1883,257 +2825,75 @@ out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; -out_no_client_address: - dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n"); +out_invalid_transport_udp: + dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); return -EINVAL; } /* - * Get the superblock for an NFS4 mountpoint - */ -static int nfs4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) -{ - struct nfs_parsed_mount_data data; - struct super_block *s; - struct nfs_server *server; - struct nfs_fh mntfh; - struct dentry *mntroot; - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, - }; - int error; - - security_init_mnt_opts(&data.lsm_opts); - - /* Validate the mount data */ - error = nfs4_validate_mount_data(raw_data, &data, dev_name); - if (error < 0) - goto out; - - /* Get a volume representation */ - server = nfs4_create_server(&data, &mntfh); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out; - } - sb_mntdata.server = server; - - if (server->flags & NFS4_MOUNT_UNSHARED) - compare_super = NULL; - - /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_free; - } - - if (s->s_fs_info != server) { - nfs_free_server(server); - 
server = NULL; - } - - if (!s->s_root) { - /* initial superblock/root creation */ - nfs4_fill_super(s); - } - - mntroot = nfs4_get_root(s, &mntfh); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); - goto error_splat_super; - } - - s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; - error = 0; - -out: - kfree(data.client_address); - kfree(data.nfs_server.export_path); - kfree(data.nfs_server.hostname); - security_free_mnt_opts(&data.lsm_opts); - return error; - -out_free: - nfs_free_server(server); - goto out; - -error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); - goto out; -} - -static void nfs4_kill_super(struct super_block *sb) -{ - struct nfs_server *server = NFS_SB(sb); - - nfs_return_all_delegations(sb); - kill_anon_super(sb); - - nfs4_renewd_prepare_shutdown(server); - nfs_free_server(server); -} - -/* - * Clone an NFS4 server record on xdev traversal (FSID-change) + * NFS v4 module parameters need to stay in the + * NFS client for backwards compatibility */ -static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_tcpport; +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600; +/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ +bool nfs4_disable_idmapping = true; +unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +unsigned short send_implementation_id = 1; +char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; +bool recover_lost_locks = false; + +EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); +EXPORT_SYMBOL_GPL(nfs_callback_tcpport); +EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); +EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); +EXPORT_SYMBOL_GPL(max_session_slots); +EXPORT_SYMBOL_GPL(send_implementation_id); +EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); +EXPORT_SYMBOL_GPL(recover_lost_locks); + +#define 
NFS_CALLBACK_MAXPORTNR (65535U) + +static int param_set_portnr(const char *val, const struct kernel_param *kp) { - struct nfs_clone_mount *data = raw_data; - struct super_block *s; - struct nfs_server *server; - struct dentry *mntroot; - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, - }; - int error; - - dprintk("--> nfs4_xdev_get_sb()\n"); - - /* create a new volume representation */ - server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err_noserver; - } - sb_mntdata.server = server; - - if (server->flags & NFS4_MOUNT_UNSHARED) - compare_super = NULL; - - /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_err_nosb; - } - - if (s->s_fs_info != server) { - nfs_free_server(server); - server = NULL; - } - - if (!s->s_root) { - /* initial superblock/root creation */ - nfs4_clone_super(s, data->sb); - } - - mntroot = nfs4_get_root(s, data->fh); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); - goto error_splat_super; - } - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { - dput(mntroot); - error = -ESTALE; - goto error_splat_super; - } - - s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; + unsigned long num; + int ret; - dprintk("<-- nfs4_xdev_get_sb() = 0\n"); + if (!val) + return -EINVAL; + ret = kstrtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) + return -EINVAL; + *((unsigned int *)kp->arg) = num; return 0; - -out_err_nosb: - nfs_free_server(server); -out_err_noserver: - dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error); - return error; - -error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); - dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); - 
return error; } +static struct kernel_param_ops param_ops_portnr = { + .set = param_set_portnr, + .get = param_get_uint, +}; +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +module_param(nfs_idmap_cache_timeout, int, 0644); +module_param(nfs4_disable_idmapping, bool, 0644); +module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, + NFS4_CLIENT_ID_UNIQ_LEN, 0600); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); +module_param(max_session_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " + "requests the client will negotiate"); +module_param(send_implementation_id, ushort, 0644); +MODULE_PARM_DESC(send_implementation_id, + "Send implementation ID with NFSv4.1 exchange_id"); +MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); + +module_param(recover_lost_locks, bool, 0644); +MODULE_PARM_DESC(recover_lost_locks, + "If the server reports that a lock might be lost, " + "try to recover it risking data corruption."); -/* - * Create an NFS4 server record on referral traversal - */ -static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) -{ - struct nfs_clone_mount *data = raw_data; - struct super_block *s; - struct nfs_server *server; - struct dentry *mntroot; - struct nfs_fh mntfh; - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, - }; - int error; - - dprintk("--> nfs4_referral_get_sb()\n"); - - /* create a new volume representation */ - server = nfs4_create_referral_server(data, &mntfh); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err_noserver; - } - sb_mntdata.server = server; - - if (server->flags & NFS4_MOUNT_UNSHARED) - compare_super = NULL; - - /* Get a 
superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_err_nosb; - } - - if (s->s_fs_info != server) { - nfs_free_server(server); - server = NULL; - } - - if (!s->s_root) { - /* initial superblock/root creation */ - nfs4_fill_super(s); - } - - mntroot = nfs4_get_root(s, &mntfh); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); - goto error_splat_super; - } - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { - dput(mntroot); - error = -ESTALE; - goto error_splat_super; - } - - s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; - - dprintk("<-- nfs4_referral_get_sb() = 0\n"); - return 0; - -out_err_nosb: - nfs_free_server(server); -out_err_noserver: - dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); - return error; - -error_splat_super: - up_write(&s->s_umount); - deactivate_super(s); - dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); - return error; -} #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 83e865a16ad..05c9e02f415 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -10,7 +10,6 @@ * nfs symlink handling code */ -#define NFS_NEED_XDR_TYPES #include <linux/time.h> #include <linux/errno.h> #include <linux/sunrpc/clnt.h> @@ -20,7 +19,6 @@ #include <linux/pagemap.h> #include <linux/stat.h> #include <linux/mm.h> -#include <linux/slab.h> #include <linux/string.h> #include <linux/namei.h> @@ -51,7 +49,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) struct page *page; void *err; - err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) goto read_failed; page = read_cache_page(&inode->i_data, 0, diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index b62481dabae..bb6ed810fa6 100644 --- a/fs/nfs/sysctl.c 
+++ b/fs/nfs/sysctl.c @@ -9,76 +9,44 @@ #include <linux/fs.h> #include <linux/sysctl.h> #include <linux/module.h> -#include <linux/nfs4.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_fs.h> -#include "callback.h" - -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs_callback_sysctl_table; -static ctl_table nfs_cb_sysctls[] = { -#ifdef CONFIG_NFS_V4 - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nfs_callback_tcpport", - .data = &nfs_callback_set_tcpport, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = (int *)&nfs_set_port_min, - .extra2 = (int *)&nfs_set_port_max, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "idmap_cache_timeout", - .data = &nfs_idmap_cache_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, -#endif +static struct ctl_table nfs_cb_sysctls[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, .maxlen = sizeof(nfs_mountpoint_expiry_timeout), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "nfs_congestion_kb", .data = &nfs_congestion_kb, .maxlen = sizeof(nfs_congestion_kb), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; -static ctl_table nfs_cb_sysctl_dir[] = { +static struct ctl_table nfs_cb_sysctl_dir[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "nfs", .mode = 0555, .child = nfs_cb_sysctls, }, - { .ctl_name = 0 } + { } }; -static ctl_table nfs_cb_sysctl_root[] = { +static struct ctl_table nfs_cb_sysctl_root[] = { { - .ctl_name = CTL_FS, .procname = "fs", .mode = 0555, .child = nfs_cb_sysctl_dir, }, - { .ctl_name = 0 } + { } }; int nfs_register_sysctl(void) diff --git a/fs/nfs/unlink.c 
b/fs/nfs/unlink.c index 75741536342..de54129336c 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -13,16 +13,15 @@ #include <linux/nfs_fs.h> #include <linux/sched.h> #include <linux/wait.h> +#include <linux/namei.h> +#include <linux/fsnotify.h> #include "internal.h" +#include "nfs4_fs.h" +#include "iostat.h" +#include "delegation.h" -struct nfs_unlinkdata { - struct hlist_node list; - struct nfs_removeargs args; - struct nfs_removeres res; - struct inode *dir; - struct rpc_cred *cred; -}; +#include "nfstrace.h" /** * nfs_free_unlinkdata - release data from a sillydelete operation. @@ -81,8 +80,9 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) struct nfs_unlinkdata *data = calldata; struct inode *dir = data->dir; + trace_nfs_sillyrename_unlink(data, task->tk_status); if (!NFS_PROTO(dir)->unlink_done(task, dir)) - rpc_restart_call(task); + rpc_restart_call_prepare(task); } /** @@ -95,15 +95,23 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) static void nfs_async_unlink_release(void *calldata) { struct nfs_unlinkdata *data = calldata; + struct super_block *sb = data->dir->i_sb; nfs_dec_sillycount(data->dir); - nfs_sb_deactive(NFS_SERVER(data->dir)); nfs_free_unlinkdata(data); + nfs_sb_deactive(sb); +} + +static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_unlinkdata *data = calldata; + NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data); } static const struct rpc_call_ops nfs_unlink_ops = { .rpc_call_done = nfs_async_unlink_done, .rpc_release = nfs_async_unlink_release, + .rpc_call_prepare = nfs_unlink_prepare, }; static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) @@ -117,6 +125,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n .rpc_message = &msg, .callback_ops = &nfs_unlink_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; @@ 
-124,23 +133,33 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n alias = d_lookup(parent, &data->args.name); if (alias != NULL) { - int ret = 0; + int ret; + void *devname_garbage = NULL; /* * Hey, we raced with lookup... See if we need to transfer * the sillyrename information to the aliased dentry. */ nfs_free_dname(data); + ret = nfs_copy_dname(alias, data); spin_lock(&alias->d_lock); - if (alias->d_inode != NULL && + if (ret == 0 && alias->d_inode != NULL && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { + devname_garbage = alias->d_fsdata; alias->d_fsdata = data; alias->d_flags |= DCACHE_NFSFS_RENAMED; ret = 1; - } + } else + ret = 0; spin_unlock(&alias->d_lock); nfs_dec_sillycount(dir); dput(alias); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. + */ + kfree(devname_garbage); return ret; } data->dir = igrab(dir); @@ -148,16 +167,16 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_dec_sillycount(dir); return 0; } - nfs_sb_active(NFS_SERVER(dir)); + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); - nfs_fattr_init(&data->res.dir_attr); + nfs_fattr_init(data->res.dir_attr); NFS_PROTO(dir)->unlink_setup(&msg, dir); task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); return 1; } @@ -172,8 +191,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) if (parent == NULL) goto out_free; dir = parent->d_inode; - if (nfs_copy_dname(dentry, data) != 0) - goto out_dput; /* Non-exclusive lock protects against concurrent lookup() calls */ spin_lock(&dir->i_lock); if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -191,6 +208,13 @@ out_free: return ret; } +void nfs_wait_on_sillyrename(struct dentry *dentry) +{ + struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + + 
wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1); +} + void nfs_block_sillyrename(struct dentry *dentry) { struct nfs_inode *nfsi = NFS_I(dentry->d_inode); @@ -224,29 +248,38 @@ void nfs_unblock_sillyrename(struct dentry *dentry) * @dir: parent directory of dentry * @dentry: dentry to unlink */ -int +static int nfs_async_unlink(struct inode *dir, struct dentry *dentry) { struct nfs_unlinkdata *data; int status = -ENOMEM; + void *devname_garbage = NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) goto out; - data->cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + data->cred = rpc_lookup_cred(); if (IS_ERR(data->cred)) { status = PTR_ERR(data->cred); goto out_free; } + data->res.dir_attr = &data->dir_attr; status = -EBUSY; spin_lock(&dentry->d_lock); if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out_unlock; dentry->d_flags |= DCACHE_NFSFS_RENAMED; + devname_garbage = dentry->d_fsdata; dentry->d_fsdata = data; spin_unlock(&dentry->d_lock); + /* + * If we'd displaced old cached devname, free it. At that + * point dentry is definitely not a root, so we won't need + * that anymore. + */ + kfree(devname_garbage); return 0; out_unlock: spin_unlock(&dentry->d_lock); @@ -275,9 +308,297 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; data = dentry->d_fsdata; + dentry->d_fsdata = NULL; } spin_unlock(&dentry->d_lock); if (data != NULL && (NFS_STALE(inode) || !nfs_call_unlink(dentry, data))) nfs_free_unlinkdata(data); } + +/* Cancel a queued async unlink. Called when a sillyrename run fails. 
*/ +static void +nfs_cancel_async_unlink(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + struct nfs_unlinkdata *data = dentry->d_fsdata; + + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + dentry->d_fsdata = NULL; + spin_unlock(&dentry->d_lock); + nfs_free_unlinkdata(data); + return; + } + spin_unlock(&dentry->d_lock); +} + +/** + * nfs_async_rename_done - Sillyrename post-processing + * @task: rpc_task of the sillyrename + * @calldata: nfs_renamedata for the sillyrename + * + * Do the directory attribute updates and the d_move + */ +static void nfs_async_rename_done(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct inode *old_dir = data->old_dir; + struct inode *new_dir = data->new_dir; + struct dentry *old_dentry = data->old_dentry; + + trace_nfs_sillyrename_rename(old_dir, old_dentry, + new_dir, data->new_dentry, task->tk_status); + if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { + rpc_restart_call_prepare(task); + return; + } + + if (data->complete) + data->complete(task, data); +} + +/** + * nfs_async_rename_release - Release the sillyrename data. 
+ * @calldata: the struct nfs_renamedata to be released + */ +static void nfs_async_rename_release(void *calldata) +{ + struct nfs_renamedata *data = calldata; + struct super_block *sb = data->old_dir->i_sb; + + if (data->old_dentry->d_inode) + nfs_mark_for_revalidate(data->old_dentry->d_inode); + + dput(data->old_dentry); + dput(data->new_dentry); + iput(data->old_dir); + iput(data->new_dir); + nfs_sb_deactive(sb); + put_rpccred(data->cred); + kfree(data); +} + +static void nfs_rename_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_renamedata *data = calldata; + NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data); +} + +static const struct rpc_call_ops nfs_rename_ops = { + .rpc_call_done = nfs_async_rename_done, + .rpc_release = nfs_async_rename_release, + .rpc_call_prepare = nfs_rename_prepare, +}; + +/** + * nfs_async_rename - perform an asynchronous rename operation + * @old_dir: directory that currently holds the dentry to be renamed + * @new_dir: target directory for the rename + * @old_dentry: original dentry to be renamed + * @new_dentry: dentry to which the old_dentry should be renamed + * + * It's expected that valid references to the dentries and inodes are held + */ +struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)) +{ + struct nfs_renamedata *data; + struct rpc_message msg = { }; + struct rpc_task_setup task_setup_data = { + .rpc_message = &msg, + .callback_ops = &nfs_rename_ops, + .workqueue = nfsiod_workqueue, + .rpc_client = NFS_CLIENT(old_dir), + .flags = RPC_TASK_ASYNC, + }; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return ERR_PTR(-ENOMEM); + task_setup_data.callback_data = data; + + data->cred = rpc_lookup_cred(); + if (IS_ERR(data->cred)) { + struct rpc_task *task = ERR_CAST(data->cred); + kfree(data); + return task; + } + + msg.rpc_argp = 
&data->args; + msg.rpc_resp = &data->res; + msg.rpc_cred = data->cred; + + /* set up nfs_renamedata */ + data->old_dir = old_dir; + ihold(old_dir); + data->new_dir = new_dir; + ihold(new_dir); + data->old_dentry = dget(old_dentry); + data->new_dentry = dget(new_dentry); + nfs_fattr_init(&data->old_fattr); + nfs_fattr_init(&data->new_fattr); + data->complete = complete; + + /* set up nfs_renameargs */ + data->args.old_dir = NFS_FH(old_dir); + data->args.old_name = &old_dentry->d_name; + data->args.new_dir = NFS_FH(new_dir); + data->args.new_name = &new_dentry->d_name; + + /* set up nfs_renameres */ + data->res.old_fattr = &data->old_fattr; + data->res.new_fattr = &data->new_fattr; + + nfs_sb_active(old_dir->i_sb); + + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + + return rpc_run_task(&task_setup_data); +} + +/* + * Perform tasks needed when a sillyrename is done such as cancelling the + * queued async unlink if it failed. + */ +static void +nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *dentry = data->old_dentry; + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(dentry); + return; + } + + /* + * vfs_unlink and the like do not issue this when a file is + * sillyrenamed, so do it here. + */ + fsnotify_nameremove(dentry, 0); +} + +#define SILLYNAME_PREFIX ".nfs" +#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) +#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) +#define SILLYNAME_COUNTER_LEN ((unsigned)sizeof(unsigned int) << 1) +#define SILLYNAME_LEN (SILLYNAME_PREFIX_LEN + \ + SILLYNAME_FILEID_LEN + \ + SILLYNAME_COUNTER_LEN) + +/** + * nfs_sillyrename - Perform a silly-rename of a dentry + * @dir: inode of directory that contains dentry + * @dentry: dentry to be sillyrenamed + * + * NFSv2/3 is stateless and the server doesn't know when the client is + * holding a file open. 
To prevent application problems when a file is + * unlinked while it's still open, the client performs a "silly-rename". + * That is, it renames the file to a hidden file in the same directory, + * and only performs the unlink once the last reference to it is put. + * + * The final cleanup is done during dentry_iput. + * + * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server + * could take responsibility for keeping open files referenced. The server + * would also need to ensure that opened-but-deleted files were kept over + * reboots. However, we may not assume a server does so. (RFC 5661 + * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can + * use to advertise that it does this; some day we may take advantage of + * it.)) + */ +int +nfs_sillyrename(struct inode *dir, struct dentry *dentry) +{ + static unsigned int sillycounter; + unsigned char silly[SILLYNAME_LEN + 1]; + unsigned long long fileid; + struct dentry *sdentry; + struct rpc_task *task; + int error = -EBUSY; + + dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n", + dentry, d_count(dentry)); + nfs_inc_stats(dir, NFSIOS_SILLYRENAME); + + /* + * We don't allow a dentry to be silly-renamed twice. + */ + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + goto out; + + fileid = NFS_FILEID(dentry->d_inode); + + /* Return delegation in anticipation of the rename */ + NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode); + + sdentry = NULL; + do { + int slen; + dput(sdentry); + sillycounter++; + slen = scnprintf(silly, sizeof(silly), + SILLYNAME_PREFIX "%0*llx%0*x", + SILLYNAME_FILEID_LEN, fileid, + SILLYNAME_COUNTER_LEN, sillycounter); + + dfprintk(VFS, "NFS: trying to rename %pd to %s\n", + dentry, silly); + + sdentry = lookup_one_len(silly, dentry->d_parent, slen); + /* + * N.B. Better to return EBUSY here ... it could be + * dangerous to delete the file while it's in use. 
+ */ + if (IS_ERR(sdentry)) + goto out; + } while (sdentry->d_inode != NULL); /* need negative lookup */ + + /* queue unlink first. Can't do this from rpc_release as it + * has to allocate memory + */ + error = nfs_async_unlink(dir, dentry); + if (error) + goto out_dput; + + /* populate unlinkdata with the right dname */ + error = nfs_copy_dname(sdentry, + (struct nfs_unlinkdata *)dentry->d_fsdata); + if (error) { + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + + /* run the rename task, undo unlink if it fails */ + task = nfs_async_rename(dir, dir, dentry, sdentry, + nfs_complete_sillyrename); + if (IS_ERR(task)) { + error = -EBUSY; + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + + /* wait for the RPC task to complete, unless a SIGKILL intervenes */ + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + switch (error) { + case 0: + /* The rename succeeded */ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + d_move(dentry, sdentry); + break; + case -ERESTARTSYS: + /* The result of the rename is unknown. 
Play it safe by + * forcing a new lookup */ + d_drop(dentry); + d_drop(sdentry); + } + rpc_put_task(task); +out_dput: + dput(sdentry); +out: + return error; +} diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bed63416a55..5e2f1030454 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -13,18 +13,25 @@ #include <linux/file.h> #include <linux/writeback.h> #include <linux/swap.h> +#include <linux/migrate.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs_page.h> #include <linux/backing-dev.h> +#include <linux/export.h> #include <asm/uaccess.h> #include "delegation.h" #include "internal.h" #include "iostat.h" +#include "nfs4_fs.h" +#include "fscache.h" +#include "pnfs.h" + +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -34,23 +41,21 @@ /* * Local function declarations */ -static struct nfs_page * nfs_update_request(struct nfs_open_context*, - struct page *, - unsigned int, unsigned int); -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, - struct inode *inode, int ioflags); static void nfs_redirty_request(struct nfs_page *req); -static const struct rpc_call_ops nfs_write_partial_ops; -static const struct rpc_call_ops nfs_write_full_ops; static const struct rpc_call_ops nfs_commit_ops; +static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; +static const struct nfs_commit_completion_ops nfs_commit_completion_ops; +static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; +static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_write_data *nfs_commit_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(void) { - struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); + struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); if (p) { memset(p, 0, 
sizeof(*p)); @@ -58,57 +63,26 @@ struct nfs_write_data *nfs_commit_alloc(void) } return p; } +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); -static void nfs_commit_rcu_free(struct rcu_head *head) +void nfs_commit_free(struct nfs_commit_data *p) { - struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); - if (p && (p->pagevec != &p->page_array[0])) - kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } +EXPORT_SYMBOL_GPL(nfs_commit_free); -void nfs_commit_free(struct nfs_write_data *wdata) -{ - call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free); -} - -struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) +static struct nfs_rw_header *nfs_writehdr_alloc(void) { - struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); + struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) { + if (p) memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); - p->npages = pagecount; - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); - if (!p->pagevec) { - mempool_free(p, nfs_wdata_mempool); - p = NULL; - } - } - } return p; } -static void nfs_writedata_rcu_free(struct rcu_head *head) -{ - struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); - if (p && (p->pagevec != &p->page_array[0])) - kfree(p->pagevec); - mempool_free(p, nfs_wdata_mempool); -} - -static void nfs_writedata_free(struct nfs_write_data *wdata) +static void nfs_writehdr_free(struct nfs_rw_header *whdr) { - call_rcu_bh(&wdata->task.u.tk_rcu, nfs_writedata_rcu_free); -} - -void nfs_writedata_release(void *wdata) -{ - nfs_writedata_free(wdata); + mempool_free(whdr, nfs_wdata_mempool); } static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) @@ -118,25 +92,53 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, 
&ctx->flags); } -static struct nfs_page *nfs_page_find_request_locked(struct page *page) +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found. + */ +static struct nfs_page * +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) { struct nfs_page *req = NULL; - if (PagePrivate(page)) { + if (PagePrivate(page)) req = (struct nfs_page *)page_private(page); - if (req != NULL) - kref_get(&req->wb_kref); + else if (unlikely(PageSwapCache(page))) { + struct nfs_page *freq, *t; + + /* Linearly search the commit list for the correct req */ + list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { + if (freq->wb_page == page) { + req = freq->wb_head; + break; + } + } } + + if (req) { + WARN_ON_ONCE(req->wb_head != req); + + kref_get(&req->wb_kref); + } + return req; } -static struct nfs_page *nfs_page_find_request(struct page *page) +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. 
+ */ +static struct nfs_page *nfs_page_find_head_request(struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); spin_unlock(&inode->i_lock); return req; } @@ -144,70 +146,111 @@ static struct nfs_page *nfs_page_find_request(struct page *page) /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { - struct inode *inode = page->mapping->host; - loff_t end, i_size = i_size_read(inode); - pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + struct inode *inode = page_file_mapping(page)->host; + loff_t end, i_size; + pgoff_t end_index; - if (i_size > 0 && page->index < end_index) - return; - end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); + spin_lock(&inode->i_lock); + i_size = i_size_read(inode); + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (i_size > 0 && page_file_index(page) < end_index) + goto out; + end = page_file_offset(page) + ((loff_t)offset+count); if (i_size >= end) - return; - nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); + goto out; i_size_write(inode, end); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); +out: + spin_unlock(&inode->i_lock); } /* A writeback failed: mark the page as bad, and invalidate the page cache */ static void nfs_set_pageerror(struct page *page) { - SetPageError(page); - nfs_zap_mapping(page->mapping->host, page->mapping); + nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } -/* We can set the PG_uptodate flag if we see that a write request - * covers the full page. 
+/* + * nfs_page_group_search_locked + * @head - head request of page group + * @page_offset - offset into page + * + * Search page group with head @head to find a request that contains the + * page offset @page_offset. + * + * Returns a pointer to the first matching nfs request, or NULL if no + * match is found. + * + * Must be called with the page group lock held */ -static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +static struct nfs_page * +nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) { - if (PageUptodate(page)) - return; - if (base != 0) - return; - if (count != nfs_page_length(page)) - return; - SetPageUptodate(page); + struct nfs_page *req; + + WARN_ON_ONCE(head != head->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); + + req = head; + do { + if (page_offset >= req->wb_pgbase && + page_offset < (req->wb_pgbase + req->wb_bytes)) + return req; + + req = req->wb_this_page; + } while (req != head); + + return NULL; } -static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) +/* + * nfs_page_group_covers_page + * @head - head request of page group + * + * Return true if the page group with head @head covers the whole page, + * returns false otherwise + */ +static bool nfs_page_group_covers_page(struct nfs_page *req) { - struct nfs_page *req; - int ret; + struct nfs_page *tmp; + unsigned int pos = 0; + unsigned int len = nfs_page_length(req->wb_page); - for (;;) { - req = nfs_update_request(ctx, page, offset, count); - if (!IS_ERR(req)) - break; - ret = PTR_ERR(req); - if (ret != -EBUSY) - return ret; - ret = nfs_wb_page(page->mapping->host, page); - if (ret != 0) - return ret; - } - /* Update file length */ - nfs_grow_file(page, offset, count); - nfs_clear_page_tag_locked(req); - return 0; + nfs_page_group_lock(req); + + do { + tmp = nfs_page_group_search_locked(req->wb_head, pos); + if (tmp) { + /* 
no way this should happen */ + WARN_ON_ONCE(tmp->wb_pgbase != pos); + pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); + } + } while (tmp && pos < len); + + nfs_page_group_unlock(req); + WARN_ON_ONCE(pos > len); + return pos == len; +} + +/* We can set the PG_uptodate flag if we see that a write request + * covers the full page. + */ +static void nfs_mark_uptodate(struct nfs_page *req) +{ + if (PageUptodate(req->wb_page)) + return; + if (!nfs_page_group_covers_page(req)) + return; + SetPageUptodate(req->wb_page); } static int wb_priority(struct writeback_control *wbc) { if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; - if (wbc->for_kupdate) - return FLUSH_LOWPRI; - return 0; + if (wbc->for_kupdate || wbc->for_background) + return FLUSH_LOWPRI | FLUSH_COND_STABLE; + return FLUSH_COND_STABLE; } /* @@ -220,93 +263,319 @@ int nfs_congestion_kb; #define NFS_CONGESTION_OFF_THRESH \ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) -static int nfs_set_page_writeback(struct page *page) +static void nfs_set_page_writeback(struct page *page) { + struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); int ret = test_set_page_writeback(page); - if (!ret) { - struct inode *inode = page->mapping->host; - struct nfs_server *nfss = NFS_SERVER(inode); + WARN_ON_ONCE(ret != 0); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) - set_bdi_congested(&nfss->backing_dev_info, WRITE); + if (atomic_long_inc_return(&nfss->writeback) > + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); } - return ret; } -static void nfs_end_page_writeback(struct page *page) +static void nfs_end_page_writeback(struct nfs_page *req) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(req->wb_page)->host; struct nfs_server *nfss = NFS_SERVER(inode); - end_page_writeback(page); + if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + return; + + end_page_writeback(req->wb_page); 
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, WRITE); + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); +} + + +/* nfs_page_group_clear_bits + * @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req) +{ + clear_bit(PG_TEARDOWN, &req->wb_flags); + clear_bit(PG_UNLOCKPAGE, &req->wb_flags); + clear_bit(PG_UPTODATE, &req->wb_flags); + clear_bit(PG_WB_END, &req->wb_flags); + clear_bit(PG_REMOVE, &req->wb_flags); } + /* - * Find an associated nfs write request, and prepare to flush it out - * May return an error if the user signalled nfs_wait_on_request(). + * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head - head request of page group, must be holding head lock + * @req - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + * and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. 
*/ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, + struct nfs_page *req, bool nonblock) + __releases(&inode->i_lock) { - struct inode *inode = page->mapping->host; - struct nfs_page *req; + struct nfs_page *tmp; int ret; - spin_lock(&inode->i_lock); - for(;;) { - req = nfs_page_find_request_locked(page); - if (req == NULL) { - spin_unlock(&inode->i_lock); - return 0; - } - if (nfs_set_page_tag_locked(req)) - break; - /* Note: If we hold the page lock, as is the case in nfs_writepage, - * then the call to nfs_set_page_tag_locked() will always - * succeed provided that someone hasn't already marked the - * request as dirty (in which case we don't care). - */ - spin_unlock(&inode->i_lock); + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) + nfs_unlock_request(tmp); + + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + + /* grab a ref on the request that will be waited on */ + kref_get(&req->wb_kref); + + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + + /* release ref from nfs_page_find_head_request_locked */ + nfs_release_request(head); + + if (!nonblock) ret = nfs_wait_on_request(req); - nfs_release_request(req); - if (ret != 0) - return ret; - spin_lock(&inode->i_lock); + else + ret = -EAGAIN; + nfs_release_request(req); + + return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. 
+ */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, + struct nfs_page *old_head) +{ + while (destroy_list) { + struct nfs_page *subreq = destroy_list; + + destroy_list = (subreq->wb_this_page == old_head) ? + NULL : subreq->wb_this_page; + + WARN_ON_ONCE(old_head != subreq->wb_head); + + /* make sure old group is not used */ + subreq->wb_head = subreq; + subreq->wb_this_page = subreq; + + nfs_clear_request_commit(subreq); + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_request(subreq); + + if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { + /* release ref on old head request */ + nfs_release_request(old_head); + + nfs_page_group_clear_bits(subreq); + + /* release the PG_INODE_REF reference */ + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) + nfs_release_request(subreq); + else + WARN_ON_ONCE(1); + } else { + WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); + /* zombie requests have already released the last + * reference and were waiting on the rest of the + * group to complete. Since it's no longer part of a + * group, simply free the request */ + nfs_page_group_clear_bits(subreq); + nfs_free_request(subreq); + } } - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { - /* This request is marked for commit */ +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + * a locked reference, cancelling any pending + * operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. 
All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *head, *subreq; + struct nfs_page *destroy_list = NULL; + unsigned int total_bytes; + int ret; + +try_again: + total_bytes = 0; + + WARN_ON_ONCE(destroy_list); + + spin_lock(&inode->i_lock); + + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_page_find_head_request_locked(NFS_I(inode), page); + + if (!head) { spin_unlock(&inode->i_lock); - nfs_clear_page_tag_locked(req); - nfs_pageio_complete(pgio); - return 0; + return NULL; } - if (nfs_set_page_writeback(page) != 0) { - spin_unlock(&inode->i_lock); - BUG(); + + /* lock each request in the page group */ + nfs_page_group_lock(head); + subreq = head; + do { + /* + * Subrequests are always contiguous, non overlapping + * and in order. If not, it's a programming error. + */ + WARN_ON_ONCE(subreq->wb_offset != + (head->wb_offset + total_bytes)); + + /* keep track of how many bytes this group covers */ + total_bytes += subreq->wb_bytes; + + if (!nfs_lock_request(subreq)) { + /* releases page group bit lock and + * inode spin lock and all references */ + ret = nfs_unroll_locks_and_wait(inode, head, + subreq, nonblock); + + if (ret == 0) + goto try_again; + + return ERR_PTR(ret); + } + + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* Now that all requests are locked, make sure they aren't on any list. 
+ * Commit list removal accounting is done after locks are dropped */ + subreq = head; + do { + nfs_list_remove_request(subreq); + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* unlink subrequests from head, destroy them later */ + if (head->wb_this_page != head) { + /* destroy list will be terminated by head */ + destroy_list = head->wb_this_page; + head->wb_this_page = head; + + /* change head request to cover whole range that + * the former page group covered */ + head->wb_bytes = total_bytes; } + + /* + * prepare head request to be added to new pgio descriptor + */ + nfs_page_group_clear_bits(head); + + /* + * some part of the group was still on the inode list - otherwise + * the group wouldn't be involved in async write. + * grab a reference for the head request, iff it needs one. + */ + if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + kref_get(&head->wb_kref); + + nfs_page_group_unlock(head); + + /* drop lock to clear_request_commit the head req and clean up + * requests on destroy list */ spin_unlock(&inode->i_lock); + + nfs_destroy_unlinked_subrequests(destroy_list, head); + + /* clean up commit list state */ + nfs_clear_request_commit(head); + + /* still holds ref on head from nfs_page_find_head_request_locked + * and still has lock on head from lock loop */ + return head; +} + +/* + * Find an associated nfs write request, and prepare to flush it out + * May return an error if the user signalled nfs_wait_on_request(). 
+ */ +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page, bool nonblock) +{ + struct nfs_page *req; + int ret = 0; + + req = nfs_lock_and_join_requests(page, nonblock); + if (!req) + goto out; + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + nfs_set_page_writeback(page); + WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); + + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); - nfs_end_page_writeback(page); - nfs_clear_page_tag_locked(req); - return pgio->pg_error; + ret = pgio->pg_error; } - return 0; +out: + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; + int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); - nfs_pageio_cond_complete(pgio, page->index); - return nfs_page_async_flush(pgio, page); + nfs_pageio_cond_complete(pgio, page_file_index(page)); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + ret = 0; + } + return ret; } /* @@ -317,7 +586,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc struct nfs_pageio_descriptor pgio; int err; - nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), + false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) @@ -348,44 +618,67 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; + /* Stop dirtying of new pages 
while we sync */ + err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (err) + goto out_err; + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + &nfs_async_write_completion_ops); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); + + clear_bit_unlock(NFS_INO_FLUSHING, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_INO_FLUSHING); + if (err < 0) - return err; - if (pgio.pg_error < 0) - return pgio.pg_error; + goto out_err; + err = pgio.pg_error; + if (err < 0) + goto out_err; return 0; +out_err: + return err; } /* * Insert a write request into an inode */ -static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); - int error; - error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); - BUG_ON(error == -EEXIST); - if (error) - return error; - if (!nfsi->npages) { - igrab(inode); - if (nfs_have_delegation(inode, FMODE_WRITE)) - nfsi->change_attr++; + WARN_ON_ONCE(req->wb_this_page != req); + + /* Lock the request! */ + nfs_lock_request(req); + + spin_lock(&inode->i_lock); + if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + inode->i_version++; + /* + * Swap-space should not get truncated. Hence no need to plug the race + * with invalidate/truncate. 
+ */ + if (likely(!PageSwapCache(req->wb_page))) { + set_bit(PG_MAPPED, &req->wb_flags); + SetPagePrivate(req->wb_page); + set_page_private(req->wb_page, (unsigned long)req); } - SetPagePrivate(req->wb_page); - set_page_private(req->wb_page, (unsigned long)req); nfsi->npages++; + /* this a head request for a page group - mark it as having an + * extra reference so sub groups can follow suit */ + WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); - radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - return 0; + spin_unlock(&inode->i_lock); } /* @@ -393,291 +686,414 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *head; - BUG_ON (!NFS_WBACK_BUSY(req)); + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + head = req->wb_head; - spin_lock(&inode->i_lock); - set_page_private(req->wb_page, 0); - ClearPagePrivate(req->wb_page); - radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); - nfsi->npages--; - if (!nfsi->npages) { - spin_unlock(&inode->i_lock); - iput(inode); - } else + spin_lock(&inode->i_lock); + if (likely(!PageSwapCache(head->wb_page))) { + set_page_private(head->wb_page, 0); + ClearPagePrivate(head->wb_page); + clear_bit(PG_MAPPED, &head->wb_flags); + } + nfsi->npages--; spin_unlock(&inode->i_lock); - nfs_clear_request(req); - nfs_release_request(req); + } + + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + nfs_release_request(req); } static void -nfs_redirty_request(struct nfs_page *req) +nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); } -/* - * Check if a request is dirty +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/** + * nfs_request_add_commit_list - add 
request to a commit list + * @req: pointer to a struct nfs_page + * @dst: commit list head + * @cinfo: holds list lock and accounting info + * + * This sets the PG_CLEAN bit, updates the cinfo count of + * number of outstanding requests requiring a commit as well as + * the MM page stats. + * + * The caller must _not_ hold the cinfo->lock, but must be + * holding the nfs_page lock. + */ +void +nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + set_bit(PG_CLEAN, &(req)->wb_flags); + spin_lock(cinfo->lock); + nfs_list_add_request(req, dst); + cinfo->mds->ncommit++; + spin_unlock(cinfo->lock); + if (!cinfo->dreq) { + inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + BDI_RECLAIMABLE); + __mark_inode_dirty(req->wb_context->dentry->d_inode, + I_DIRTY_DATASYNC); + } +} +EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); + +/** + * nfs_request_remove_commit_list - Remove request from a commit list + * @req: pointer to a nfs_page + * @cinfo: holds list lock and accounting info + * + * This clears the PG_CLEAN bit, and updates the cinfo's count of + * number of outstanding requests requiring a commit + * It does not update the MM page stats. + * + * The caller _must_ hold the cinfo->lock and the nfs_page lock. 
*/ -static inline int -nfs_dirty_request(struct nfs_page *req) +void +nfs_request_remove_commit_list(struct nfs_page *req, + struct nfs_commit_info *cinfo) { - struct page *page = req->wb_page; + if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) + return; + nfs_list_remove_request(req); + cinfo->mds->ncommit--; +} +EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); - if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags)) - return 0; - return !PageWriteback(req->wb_page); +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, + struct inode *inode) +{ + cinfo->lock = &inode->i_lock; + cinfo->mds = &NFS_I(inode)->commit_info; + cinfo->ds = pnfs_get_ds_info(inode); + cinfo->dreq = NULL; + cinfo->completion_ops = &nfs_commit_completion_ops; } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +void nfs_init_cinfo(struct nfs_commit_info *cinfo, + struct inode *inode, + struct nfs_direct_req *dreq) +{ + if (dreq) + nfs_init_cinfo_from_dreq(cinfo, dreq); + else + nfs_init_cinfo_from_inode(cinfo, inode); +} +EXPORT_SYMBOL_GPL(nfs_init_cinfo); + /* * Add a request to the inode's commit list. 
*/ +void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ + if (pnfs_mark_request_commit(req, lseg, cinfo)) + return; + nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); +} + static void -nfs_mark_request_commit(struct nfs_page *req) +nfs_clear_page_commit(struct page *page) { - struct inode *inode = req->wb_context->path.dentry->d_inode; - struct nfs_inode *nfsi = NFS_I(inode); + dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); +} - spin_lock(&inode->i_lock); - nfsi->ncommit++; - set_bit(PG_NEED_COMMIT, &(req)->wb_flags); - radix_tree_tag_set(&nfsi->nfs_page_tree, - req->wb_index, - NFS_PAGE_TAG_COMMIT); - spin_unlock(&inode->i_lock); - inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); +static void +nfs_clear_request_commit(struct nfs_page *req) +{ + if (test_bit(PG_CLEAN, &req->wb_flags)) { + struct inode *inode = req->wb_context->dentry->d_inode; + struct nfs_commit_info cinfo; + + nfs_init_cinfo_from_inode(&cinfo, inode); + if (!pnfs_clear_request_commit(req, &cinfo)) { + spin_lock(cinfo.lock); + nfs_request_remove_commit_list(req, &cinfo); + spin_unlock(cinfo.lock); + } + nfs_clear_page_commit(req->wb_page); + } } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { + if (data->verf.committed == NFS_DATA_SYNC) + return data->header->lseg == NULL; return data->verf.committed != NFS_FILE_SYNC; } -static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +#else +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, + struct inode *inode) { - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { - nfs_mark_request_commit(req); - return 1; - } - if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { 
- nfs_redirty_request(req); - return 1; - } - return 0; } -#else -static inline void -nfs_mark_request_commit(struct nfs_page *req) + +void nfs_init_cinfo(struct nfs_commit_info *cinfo, + struct inode *inode, + struct nfs_direct_req *dreq) { } -static inline -int nfs_write_need_commit(struct nfs_write_data *data) +void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ +} + +static void +nfs_clear_request_commit(struct nfs_page *req) { - return 0; } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_write_need_commit(struct nfs_pgio_data *data) { return 0; } + #endif -/* - * Wait for a request to complete. - * - * Interruptible by fatal signals only. - */ -static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages) +static void nfs_write_completion(struct nfs_pgio_header *hdr) { - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_page *req; - pgoff_t idx_end, next; - unsigned int res = 0; - int error; - - if (npages == 0) - idx_end = ~0; - else - idx_end = idx_start + npages - 1; + struct nfs_commit_info cinfo; + unsigned long bytes = 0; - next = idx_start; - while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) { - if (req->wb_index > idx_end) - break; - - next = req->wb_index + 1; - BUG_ON(!NFS_WBACK_BUSY(req)); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out; + nfs_init_cinfo_from_inode(&cinfo, hdr->inode); + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); - kref_get(&req->wb_kref); - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); + bytes += req->wb_bytes; + nfs_list_remove_request(req); + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && + (hdr->good_bytes < bytes)) { + nfs_set_pageerror(req->wb_page); + nfs_context_set_write_error(req->wb_context, hdr->error); + goto remove_req; + } + if 
(test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { + nfs_mark_request_dirty(req); + goto next; + } + if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); + nfs_mark_request_commit(req, hdr->lseg, &cinfo); + goto next; + } +remove_req: + nfs_inode_remove_request(req); +next: + nfs_unlock_request(req); + nfs_end_page_writeback(req); nfs_release_request(req); - spin_lock(&inode->i_lock); - if (error < 0) - return error; - res++; } - return res; +out: + hdr->release(hdr); } -static void nfs_cancel_commit_list(struct list_head *head) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +unsigned long +nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { - struct nfs_page *req; + return cinfo->mds->ncommit; +} - while(!list_empty(head)) { - req = nfs_list_entry(head->next); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); - nfs_list_remove_request(req); - clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); - nfs_inode_remove_request(req); - nfs_unlock_request(req); +/* cinfo->lock held by caller */ +int +nfs_scan_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max) +{ + struct nfs_page *req, *tmp; + int ret = 0; + + list_for_each_entry_safe(req, tmp, src, wb_list) { + if (!nfs_lock_request(req)) + continue; + kref_get(&req->wb_kref); + if (cond_resched_lock(cinfo->lock)) + list_safe_reset_next(req, tmp, wb_list); + nfs_request_remove_commit_list(req, cinfo); + nfs_list_add_request(req, dst); + ret++; + if ((ret == max) && !cinfo->dreq) + break; } + return ret; } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) /* * nfs_scan_commit - Scan an inode for commit requests * @inode: NFS inode to scan - * @dst: destination list - * @idx_start: lower bound of page->index to scan. - * @npages: idx_start + npages sets the upper bound to scan. 
+ * @dst: mds destination list + * @cinfo: mds and ds lists of reqs ready to commit * * Moves requests from the inode's 'commit' request list. * The requests are *not* checked to ensure that they form a contiguous set. */ -static int -nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +int +nfs_scan_commit(struct inode *inode, struct list_head *dst, + struct nfs_commit_info *cinfo) { - struct nfs_inode *nfsi = NFS_I(inode); - int res = 0; + int ret = 0; + + spin_lock(cinfo->lock); + if (cinfo->mds->ncommit > 0) { + const int max = INT_MAX; - if (nfsi->ncommit != 0) { - res = nfs_scan_list(nfsi, dst, idx_start, npages, - NFS_PAGE_TAG_COMMIT); - nfsi->ncommit -= res; + ret = nfs_scan_commit_list(&cinfo->mds->list, dst, + cinfo, max); + ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - return res; + spin_unlock(cinfo->lock); + return ret; } + #else -static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) +{ + return 0; +} + +int nfs_scan_commit(struct inode *inode, struct list_head *dst, + struct nfs_commit_info *cinfo) { return 0; } #endif /* - * Try to update any existing write request, or create one if there is none. - * In order to match, the request's credentials must match those of - * the calling process. + * Search for an existing write request, and attempt to update + * it to reflect a new dirty region on a given page. * - * Note: Should always be called with the Page Lock held! + * If the attempt fails, then the existing request is flushed out + * to disk. 
*/ -static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, - struct page *page, unsigned int offset, unsigned int bytes) +static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, + unsigned int bytes) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct nfs_page *req, *new = NULL; - pgoff_t rqend, end; + struct nfs_page *req; + unsigned int rqend; + unsigned int end; + int error; + + if (!PagePrivate(page)) + return NULL; end = offset + bytes; + spin_lock(&inode->i_lock); for (;;) { - /* Loop over all inode entries and see if we find - * A request for the page we wish to update + req = nfs_page_find_head_request_locked(NFS_I(inode), page); + if (req == NULL) + goto out_unlock; + + /* should be handled by nfs_flush_incompatible */ + WARN_ON_ONCE(req->wb_head != req); + WARN_ON_ONCE(req->wb_this_page != req); + + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. 
*/ - spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(page); - if (req) { - if (!nfs_set_page_tag_locked(req)) { - int error; - - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - if (error < 0) { - if (new) - nfs_release_request(new); - return ERR_PTR(error); - } - continue; - } - spin_unlock(&inode->i_lock); - if (new) - nfs_release_request(new); + if (offset > rqend + || end < req->wb_offset) + goto out_flushme; + + if (nfs_lock_request(req)) break; - } - if (new) { - int error; - nfs_lock_request_dontget(new); - error = nfs_inode_add_request(inode, new); - if (error) { - spin_unlock(&inode->i_lock); - nfs_unlock_request(new); - return ERR_PTR(error); - } - spin_unlock(&inode->i_lock); - req = new; - goto zero_page; - } + /* The request is locked, so wait and then retry */ spin_unlock(&inode->i_lock); - - new = nfs_create_request(ctx, inode, page, offset, bytes); - if (IS_ERR(new)) - return new; - } - - /* We have a request for our page. - * If the creds don't match, or the - * page addresses don't match, - * tell the caller to wait on the conflicting - * request. - */ - rqend = req->wb_offset + req->wb_bytes; - if (req->wb_context != ctx - || req->wb_page != page - || !nfs_dirty_request(req) - || offset > rqend || end < req->wb_offset) { - nfs_clear_page_tag_locked(req); - return ERR_PTR(-EBUSY); + error = nfs_wait_on_request(req); + nfs_release_request(req); + if (error != 0) + goto out_err; + spin_lock(&inode->i_lock); } /* Okay, the request matches. 
Update the region */ if (offset < req->wb_offset) { req->wb_offset = offset; req->wb_pgbase = offset; - req->wb_bytes = max(end, rqend) - req->wb_offset; - goto zero_page; } - if (end > rqend) req->wb_bytes = end - req->wb_offset; - + else + req->wb_bytes = rqend - req->wb_offset; +out_unlock: + spin_unlock(&inode->i_lock); + if (req) + nfs_clear_request_commit(req); return req; -zero_page: - /* If this page might potentially be marked as up to date, - * then we need to zero any uninitalised data. */ - if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE - && !PageUptodate(req->wb_page)) - zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); +out_flushme: + spin_unlock(&inode->i_lock); + nfs_release_request(req); + error = nfs_wb_page(inode, page); +out_err: + return ERR_PTR(error); +} + +/* + * Try to update an existing write request, or create one if there is none. + * + * Note: Should always be called with the Page Lock held to prevent races + * if we have to add a new request. Also assumes that the caller has + * already called nfs_flush_incompatible() if necessary. 
+ */ +static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, + struct page *page, unsigned int offset, unsigned int bytes) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *req; + + req = nfs_try_to_update_request(inode, page, offset, bytes); + if (req != NULL) + goto out; + req = nfs_create_request(ctx, page, NULL, offset, bytes); + if (IS_ERR(req)) + goto out; + nfs_inode_add_request(inode, req); +out: return req; } +static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_page *req; + + req = nfs_setup_write_request(ctx, page, offset, count); + if (IS_ERR(req)) + return PTR_ERR(req); + /* Update file length */ + nfs_grow_file(page, offset, count); + nfs_mark_uptodate(req); + nfs_mark_request_dirty(req); + nfs_unlock_and_release_request(req); + return 0; +} + int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_lock_context *l_ctx; struct nfs_page *req; int do_flush, status; /* @@ -689,28 +1105,95 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page. 
*/ do { - req = nfs_page_find_request(page); + req = nfs_page_find_head_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx - || !nfs_dirty_request(req); + l_ctx = req->wb_lock_context; + do_flush = req->wb_page != page || req->wb_context != ctx; + /* for now, flush if more than 1 request in page_group */ + do_flush |= req->wb_this_page != req; + if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { + do_flush |= l_ctx->lockowner.l_owner != current->files + || l_ctx->lockowner.l_pid != current->tgid; + } nfs_release_request(req); if (!do_flush) return 0; - status = nfs_wb_page(page->mapping->host, page); + status = nfs_wb_page(page_file_mapping(page)->host, page); } while (status == 0); return status; } /* + * Avoid buffered writes when a open context credential's key would + * expire soon. + * + * Returns -EACCES if the key will expire within RPC_KEY_EXPIRE_FAIL. + * + * Return 0 and set a credential flag which triggers the inode to flush + * and performs NFS_FILE_SYNC writes if the key will expired within + * RPC_KEY_EXPIRE_TIMEO. + */ +int +nfs_key_timeout_notify(struct file *filp, struct inode *inode) +{ + struct nfs_open_context *ctx = nfs_file_open_context(filp); + struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + + return rpcauth_key_timeout_notify(auth, ctx->cred); +} + +/* + * Test if the open context credential key is marked to expire soon. + */ +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) +{ + return rpcauth_cred_key_to_expire(ctx->cred); +} + +/* * If the page cache is marked as unsafe or invalid, then we can't rely on * the PageUptodate() flag. In this case, we will need to turn off * write optimisations that depend on the page contents being correct. 
*/ -static int nfs_write_pageuptodate(struct page *page, struct inode *inode) +static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfs_have_delegated_attributes(inode)) + goto out; + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + return false; + smp_rmb(); + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) + return false; +out: + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return false; + return PageUptodate(page) != 0; +} + +/* If we know the page is up to date, and we're not using byte range locks (or + * if we have the whole file locked for writing), it may be more efficient to + * extend the write to cover the entire page in order to avoid fragmentation + * inefficiencies. + * + * If the file is opened for synchronous writes then we can just skip the rest + * of the checks. + */ +static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) { - return PageUptodate(page) && - !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); + if (file->f_flags & O_DSYNC) + return 0; + if (!nfs_write_pageuptodate(page, inode)) + return 0; + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + return 1; + if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && + inode->i_flock->fl_end == OFFSET_MAX && + inode->i_flock->fl_type != F_RDLCK)) + return 1; + return 0; } /* @@ -723,52 +1206,28 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); - dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", - file->f_path.dentry->d_parent->d_name.name, - file->f_path.dentry->d_name.name, count, - (long long)(page_offset(page) +offset)); + dprintk("NFS: 
nfs_updatepage(%pD2 %d@%lld)\n", + file, count, (long long)(page_file_offset(page) + offset)); - /* If we're not using byte range locks, and we know the page - * is up to date, it may be more efficient to extend the write - * to cover the entire page in order to avoid fragmentation - * inefficiencies. - */ - if (nfs_write_pageuptodate(page, inode) && - inode->i_flock == NULL && - !(file->f_flags & O_SYNC)) { + if (nfs_can_extend_write(file, page, inode)) { count = max(count + offset, nfs_page_length(page)); offset = 0; } status = nfs_writepage_setup(ctx, page, offset, count); - __set_page_dirty_nobuffers(page); - - dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", - status, (long long)i_size_read(inode)); if (status < 0) nfs_set_pageerror(page); - return status; -} - -static void nfs_writepage_release(struct nfs_page *req) -{ + else + __set_page_dirty_nobuffers(page); - if (PageError(req->wb_page)) { - nfs_end_page_writeback(req->wb_page); - nfs_inode_remove_request(req); - } else if (!nfs_reschedule_unstable_write(req)) { - /* Set the PG_uptodate flag */ - nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes); - nfs_end_page_writeback(req->wb_page); - nfs_inode_remove_request(req); - } else - nfs_end_page_writeback(req->wb_page); - nfs_clear_page_tag_locked(req); + dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", + status, (long long)i_size_read(inode)); + return status; } static int flush_task_priority(int how) @@ -782,313 +1241,128 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -/* - * Set up the argument/result storage required for the RPC call. - */ -static void nfs_write_rpcsetup(struct nfs_page *req, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - int how) -{ - struct inode *inode = req->wb_context->path.dentry->d_inode; - int flags = (how & FLUSH_SYNC) ? 
0 : RPC_TASK_ASYNC; +static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) +{ + struct inode *inode = data->header->inode; int priority = flush_task_priority(how); - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), - .task = &data->task, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .flags = flags, - .priority = priority, - }; - - /* Set up the RPC argument and reply structs - * NB: take care not to mess about with data->commit et al. */ - - data->req = req; - data->inode = inode = req->wb_context->path.dentry->d_inode; - data->cred = msg.rpc_cred; - - data->args.fh = NFS_FH(inode); - data->args.offset = req_offset(req) + offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pagevec; - data->args.count = count; - data->args.context = req->wb_context; - data->args.stable = NFS_UNSTABLE; - if (how & FLUSH_STABLE) { - data->args.stable = NFS_DATA_SYNC; - if (!NFS_I(inode)->ncommit) - data->args.stable = NFS_FILE_SYNC; - } - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); + task_setup_data->priority = priority; + NFS_PROTO(inode)->write_setup(data, msg); - /* Set up the initial task struct. 
*/ - NFS_PROTO(inode)->write_setup(data, &msg); - - dprintk("NFS: %5u initiated write call " - "(req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, + &task_setup_data->rpc_client, msg, data); } -/* - * Generate multiple small requests to write out a single - * contiguous dirty area on one page. +/* If a nfs_flush_* function fails, it should remove reqs from @head and + * call this on each, which will prepare them to be retried on next + * writeback using standard nfs. */ -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static void nfs_redirty_request(struct nfs_page *req) { - struct nfs_page *req = nfs_list_entry(head->next); - struct page *page = req->wb_page; - struct nfs_write_data *data; - size_t wsize = NFS_SERVER(inode)->wsize, nbytes; - unsigned int offset; - int requests = 0; - LIST_HEAD(list); - - nfs_list_remove_request(req); - - nbytes = count; - do { - size_t len = min(nbytes, wsize); - - data = nfs_writedata_alloc(1); - if (!data) - goto out_bad; - list_add(&data->pages, &list); - requests++; - nbytes -= len; - } while (nbytes != 0); - atomic_set(&req->wb_complete, requests); - - ClearPageError(page); - offset = 0; - nbytes = count; - do { - data = list_entry(list.next, struct nfs_write_data, pages); - list_del_init(&data->pages); - - data->pagevec[0] = page; - - if (nbytes < wsize) - wsize = nbytes; - nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, how); - offset += wsize; - nbytes -= wsize; - } while (nbytes != 0); - - return 0; - -out_bad: - while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_write_data, pages); - list_del(&data->pages); - nfs_writedata_release(data); - } - 
nfs_redirty_request(req); - nfs_end_page_writeback(req->wb_page); - nfs_clear_page_tag_locked(req); - return -ENOMEM; + nfs_mark_request_dirty(req); + nfs_unlock_request(req); + nfs_end_page_writeback(req); + nfs_release_request(req); } -/* - * Create an RPC task for the given write request and kick it. - * The page must have been locked by the caller. - * - * It may happen that the page we're passed is not marked dirty. - * This is the case if nfs_updatepage detects a conflicting request - * that has been written but not committed. - */ -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static void nfs_async_write_error(struct list_head *head) { - struct nfs_page *req; - struct page **pages; - struct nfs_write_data *data; - - data = nfs_writedata_alloc(npages); - if (!data) - goto out_bad; - - pages = data->pagevec; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &data->pages); - ClearPageError(req->wb_page); - *pages++ = req->wb_page; - } - req = nfs_list_entry(data->pages.next); - - /* Set up the argument struct */ - nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); + struct nfs_page *req; - return 0; - out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_redirty_request(req); - nfs_end_page_writeback(req->wb_page); - nfs_clear_page_tag_locked(req); } - return -ENOMEM; } -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags) +static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { + .error_cleanup = nfs_async_write_error, + .completion = nfs_write_completion, +}; + +void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags, bool force_mds, + const struct nfs_pgio_completion_ops *compl_ops) { - size_t wsize = NFS_SERVER(inode)->wsize; + 
struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; - if (wsize < PAGE_CACHE_SIZE) - nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); - else - nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_write_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, + server->wsize, ioflags); } +EXPORT_SYMBOL_GPL(nfs_pageio_init_write); -/* - * Handle a write reply that flushed part of a page. - */ -static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) +void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { - struct nfs_write_data *data = calldata; - struct nfs_page *req = data->req; - struct page *page = req->wb_page; - - dprintk("NFS: write (%s/%Ld %d@%Ld)", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), - req->wb_bytes, - (long long)req_offset(req)); + pgio->pg_ops = &nfs_pgio_rw_ops; + pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); - if (nfs_writeback_done(task, data) != 0) - return; - if (task->tk_status < 0) { - nfs_set_pageerror(page); - nfs_context_set_write_error(req->wb_context, task->tk_status); - dprintk(", error = %d\n", task->tk_status); - goto out; - } +void nfs_commit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_commit_data *data = calldata; - if (nfs_write_need_commit(data)) { - struct inode *inode = page->mapping->host; + NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); +} - spin_lock(&inode->i_lock); - if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { - /* Do nothing we need to resend the writes */ - } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { - memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - dprintk(" defer commit\n"); - } else if 
(memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { - set_bit(PG_NEED_RESCHED, &req->wb_flags); - clear_bit(PG_NEED_COMMIT, &req->wb_flags); - dprintk(" server reboot detected\n"); - } - spin_unlock(&inode->i_lock); - } else - dprintk(" OK\n"); +static void nfs_writeback_release_common(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + int status = data->task.tk_status; -out: - if (atomic_dec_and_test(&req->wb_complete)) - nfs_writepage_release(req); + if ((status >= 0) && nfs_write_need_commit(data)) { + spin_lock(&hdr->lock); + if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) + ; /* Do nothing */ + else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) + memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf)); + else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf))) + set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); + spin_unlock(&hdr->lock); + } } -static const struct rpc_call_ops nfs_write_partial_ops = { - .rpc_call_done = nfs_writeback_done_partial, - .rpc_release = nfs_writedata_release, -}; - /* - * Handle a write reply that flushes a whole page. - * - * FIXME: There is an inherent race with invalidate_inode_pages and - * writebacks since the page->count is kept > 1 for as long - * as the page has a write request pending. + * Special version of should_remove_suid() that ignores capabilities. */ -static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) +static int nfs_should_remove_suid(const struct inode *inode) { - struct nfs_write_data *data = calldata; - struct nfs_page *req; - struct page *page; - - if (nfs_writeback_done(task, data) != 0) - return; + umode_t mode = inode->i_mode; + int kill = 0; - /* Update attributes as result of writeback. 
*/ - while (!list_empty(&data->pages)) { - req = nfs_list_entry(data->pages.next); - nfs_list_remove_request(req); - page = req->wb_page; + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; - dprintk("NFS: write (%s/%Ld %d@%Ld)", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), - req->wb_bytes, - (long long)req_offset(req)); + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; - if (task->tk_status < 0) { - nfs_set_pageerror(page); - nfs_context_set_write_error(req->wb_context, task->tk_status); - dprintk(", error = %d\n", task->tk_status); - goto remove_request; - } + if (unlikely(kill && S_ISREG(mode))) + return kill; - if (nfs_write_need_commit(data)) { - memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - nfs_mark_request_commit(req); - nfs_end_page_writeback(page); - dprintk(" marked for commit\n"); - goto next; - } - /* Set the PG_uptodate flag? */ - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); - dprintk(" OK\n"); -remove_request: - nfs_end_page_writeback(page); - nfs_inode_remove_request(req); - next: - nfs_clear_page_tag_locked(req); - } + return 0; } -static const struct rpc_call_ops nfs_write_full_ops = { - .rpc_call_done = nfs_writeback_done_full, - .rpc_release = nfs_writedata_release, -}; - - /* * This function is called when the WRITE call is complete. 
*/ -int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct nfs_writeargs *argp = &data->args; - struct nfs_writeres *resp = &data->res; int status; - dprintk("NFS: %5u nfs_writeback_done (status %d)\n", - task->tk_pid, task->tk_status); - /* * ->write_done will attempt to use post-op attributes to detect * conflicting writes by other clients. A strict interpretation @@ -1096,13 +1370,13 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) * another writer had changed the file, but some applications * depend on tighter cache coherency when writing. */ - status = NFS_PROTO(data->inode)->write_done(task, data); + status = NFS_PROTO(inode)->write_done(task, data); if (status != 0) return status; - nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) - if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) + if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) { /* We tried a write call, but the server did not * commit data to stable storage even though we * requested it. @@ -1113,83 +1387,146 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ static unsigned long complain; + /* Note this will print the MDS for a DS write */ if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" + dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - NFS_SERVER(data->inode)->nfs_client->cl_hostname, - resp->verf->committed, argp->stable); + NFS_SERVER(inode)->nfs_client->cl_hostname, + data->res.verf->committed, data->args.stable); complain = jiffies + 300 * HZ; } } #endif - /* Is this a short write? 
*/ - if (task->tk_status >= 0 && resp->count < argp->count) { + + /* Deal with the suid/sgid bit corner case */ + if (nfs_should_remove_suid(inode)) + nfs_mark_for_revalidate(inode); + return 0; +} + +/* + * This function is called when the WRITE call is complete. + */ +static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) +{ + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; + + if (resp->count < argp->count) { static unsigned long complain; - nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + /* This a short write! */ + nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); /* Has the server at least made some progress? */ - if (resp->count != 0) { - /* Was this an NFSv2 write or an NFSv3 stable write? */ - if (resp->verf->committed != NFS_UNSTABLE) { - /* Resend from where the server left off */ - argp->offset += resp->count; - argp->pgbase += resp->count; - argp->count -= resp->count; - } else { - /* Resend as a stable write in order to avoid - * headaches in the case of a server crash. - */ - argp->stable = NFS_FILE_SYNC; + if (resp->count == 0) { + if (time_before(complain, jiffies)) { + printk(KERN_WARNING + "NFS: Server wrote zero bytes, expected %u.\n", + argp->count); + complain = jiffies + 300 * HZ; } - rpc_restart_call(task); - return -EAGAIN; + nfs_set_pgio_error(data->header, -EIO, argp->offset); + task->tk_status = -EIO; + return; } - if (time_before(complain, jiffies)) { - printk(KERN_WARNING - "NFS: Server wrote zero bytes, expected %u.\n", - argp->count); - complain = jiffies + 300 * HZ; + /* Was this an NFSv2 write or an NFSv3 stable write? */ + if (resp->verf->committed != NFS_UNSTABLE) { + /* Resend from where the server left off */ + data->mds_offset += resp->count; + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + } else { + /* Resend as a stable write in order to avoid + * headaches in the case of a server crash. 
+ */ + argp->stable = NFS_FILE_SYNC; } - /* Can't do anything about it except throw an error. */ - task->tk_status = -EIO; + rpc_restart_call_prepare(task); } - return 0; } -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -void nfs_commit_release(void *wdata) +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +{ + int ret; + + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) + return 1; + if (!may_wait) + return 0; + ret = out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + return (ret < 0) ? ret : 1; +} + +static void nfs_commit_clear_lock(struct nfs_inode *nfsi) { - nfs_commit_free(wdata); + clear_bit(NFS_INO_COMMIT, &nfsi->flags); + smp_mb__after_atomic(); + wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } -/* - * Set up the argument/result storage required for the RPC call. - */ -static void nfs_commit_rpcsetup(struct list_head *head, - struct nfs_write_data *data, - int how) +void nfs_commitdata_release(struct nfs_commit_data *data) +{ + put_nfs_open_context(data->context); + nfs_commit_free(data); +} +EXPORT_SYMBOL_GPL(nfs_commitdata_release); + +int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops, + int how, int flags) { - struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->path.dentry->d_inode; - int flags = (how & FLUSH_SYNC) ? 
0 : RPC_TASK_ASYNC; - int priority = flush_task_priority(how); struct rpc_task *task; + int priority = flush_task_priority(how); struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = first->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .rpc_message = &msg, - .callback_ops = &nfs_commit_ops, + .callback_ops = call_ops, .callback_data = data, - .flags = flags, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; + /* Set up the initial task struct. */ + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, + NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_commit); + +/* + * Set up the argument/result storage required for the RPC call. + */ +void nfs_init_commit(struct nfs_commit_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ + struct nfs_page *first = nfs_list_entry(head->next); + struct inode *inode = first->wb_context->dentry->d_inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. 
*/ @@ -1197,55 +1534,65 @@ static void nfs_commit_rpcsetup(struct list_head *head, list_splice_init(head, &data->pages); data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = first->wb_context->cred; + data->lseg = lseg; /* reference transferred */ + data->mds_ops = &nfs_commit_ops; + data->completion_ops = cinfo->completion_ops; + data->dreq = cinfo->dreq; data->args.fh = NFS_FH(data->inode); /* Note: we always request a commit of the entire inode */ data->args.offset = 0; data->args.count = 0; - data->res.count = 0; + data->context = get_nfs_open_context(first->wb_context); data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); +} +EXPORT_SYMBOL_GPL(nfs_init_commit); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->commit_setup(data, &msg); - - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) +{ + struct nfs_page *req; - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + while (!list_empty(page_list)) { + req = nfs_list_entry(page_list->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req, lseg, cinfo); + if (!cinfo->dreq) { + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + BDI_RECLAIMABLE); + } + nfs_unlock_and_release_request(req); + } } +EXPORT_SYMBOL_GPL(nfs_retry_commit); /* * Commit dirty pages */ static int -nfs_commit_list(struct inode *inode, struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how, + struct nfs_commit_info *cinfo) { - struct nfs_write_data *data; - struct nfs_page *req; + struct nfs_commit_data *data; - data = nfs_commit_alloc(); + data = nfs_commitdata_alloc(); if (!data) goto out_bad; /* Set up the argument struct */ - nfs_commit_rpcsetup(head, data, how); - - return 0; + 
nfs_init_commit(data, head, NULL, cinfo); + atomic_inc(&cinfo->mds->rpcs_out); + return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, + how, 0); out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_mark_request_commit(req); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); - nfs_clear_page_tag_locked(req); - } + nfs_retry_commit(head, NULL, cinfo); + cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } @@ -1254,267 +1601,290 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) */ static void nfs_commit_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = calldata; - struct nfs_page *req; + struct nfs_commit_data *data = calldata; dprintk("NFS: %5u nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); /* Call the NFS version-specific code */ - if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) - return; + NFS_PROTO(data->inode)->commit_done(task, data); +} + +static void nfs_commit_release_pages(struct nfs_commit_data *data) +{ + struct nfs_page *req; + int status = data->task.tk_status; + struct nfs_commit_info cinfo; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - clear_bit(PG_NEED_COMMIT, &(req)->wb_flags); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); - - dprintk("NFS: commit (%s/%Ld %d@%Ld)", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + nfs_clear_page_commit(req->wb_page); + + dprintk("NFS: commit (%s/%llu %d@%lld)", + req->wb_context->dentry->d_sb->s_id, + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); - if (task->tk_status < 0) { - 
nfs_context_set_write_error(req->wb_context, task->tk_status); + if (status < 0) { + nfs_context_set_write_error(req->wb_context, status); nfs_inode_remove_request(req); - dprintk(", error = %d\n", task->tk_status); + dprintk(", error = %d\n", status); goto next; } /* Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. */ - if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) { + if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { /* We have a match */ - /* Set the PG_uptodate flag */ - nfs_mark_uptodate(req->wb_page, req->wb_pgbase, - req->wb_bytes); nfs_inode_remove_request(req); dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ dprintk(" mismatch\n"); - nfs_redirty_request(req); + nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: - nfs_clear_page_tag_locked(req); + nfs_unlock_and_release_request(req); } + nfs_init_cinfo(&cinfo, data->inode, data->dreq); + if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) + nfs_commit_clear_lock(NFS_I(data->inode)); +} + +static void nfs_commit_release(void *calldata) +{ + struct nfs_commit_data *data = calldata; + + data->completion_ops->completion(data); + nfs_commitdata_release(calldata); } static const struct rpc_call_ops nfs_commit_ops = { + .rpc_call_prepare = nfs_commit_prepare, .rpc_call_done = nfs_commit_done, .rpc_release = nfs_commit_release, }; +static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { + .completion = nfs_commit_release_pages, + .error_cleanup = nfs_commit_clear_lock, +}; + +int nfs_generic_commit_list(struct inode *inode, struct list_head *head, + int how, struct nfs_commit_info *cinfo) +{ + int status; + + status = pnfs_commit_list(inode, head, how, cinfo); + if (status == PNFS_NOT_ATTEMPTED) + status = nfs_commit_list(inode, head, how, cinfo); + return status; +} + int nfs_commit_inode(struct inode 
*inode, int how) { LIST_HEAD(head); + struct nfs_commit_info cinfo; + int may_wait = how & FLUSH_SYNC; int res; - spin_lock(&inode->i_lock); - res = nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&inode->i_lock); + res = nfs_commit_set_lock(NFS_I(inode), may_wait); + if (res <= 0) + goto out_mark_dirty; + nfs_init_cinfo_from_inode(&cinfo, inode); + res = nfs_scan_commit(inode, &head, &cinfo); if (res) { - int error = nfs_commit_list(inode, &head, how); + int error; + + error = nfs_generic_commit_list(inode, &head, how, &cinfo); if (error < 0) return error; - } + if (!may_wait) + goto out_mark_dirty; + error = wait_on_bit(&NFS_I(inode)->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (error < 0) + return error; + } else + nfs_commit_clear_lock(NFS_I(inode)); + return res; + /* Note: If we exit without ensuring that the commit is complete, + * we must mark the inode as dirty. Otherwise, future calls to + * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure + * that the data is on the disk. 
+ */ +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return res; } -#else -static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) -{ - return 0; -} -#endif -long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { - struct inode *inode = mapping->host; - pgoff_t idx_start, idx_end; - unsigned int npages = 0; - LIST_HEAD(head); - int nocommit = how & FLUSH_NOCOMMIT; - long pages, ret; - - /* FIXME */ - if (wbc->range_cyclic) - idx_start = 0; - else { - idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; - idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (idx_end > idx_start) { - pgoff_t l_npages = 1 + idx_end - idx_start; - npages = l_npages; - if (sizeof(npages) != sizeof(l_npages) && - (pgoff_t)npages != l_npages) - npages = 0; - } + struct nfs_inode *nfsi = NFS_I(inode); + int flags = FLUSH_SYNC; + int ret = 0; + + /* no commits means nothing needs to be done */ + if (!nfsi->commit_info.ncommit) + return ret; + + if (wbc->sync_mode == WB_SYNC_NONE) { + /* Don't commit yet if this is a non-blocking flush and there + * are a lot of outstanding writes for this mapping. 
+ */ + if (nfsi->commit_info.ncommit <= (nfsi->npages >> 1)) + goto out_mark_dirty; + + /* don't wait for the COMMIT response */ + flags = 0; } - how &= ~FLUSH_NOCOMMIT; - spin_lock(&inode->i_lock); - do { - ret = nfs_wait_on_requests_locked(inode, idx_start, npages); - if (ret != 0) - continue; - if (nocommit) - break; - pages = nfs_scan_commit(inode, &head, idx_start, npages); - if (pages == 0) - break; - if (how & FLUSH_INVALIDATE) { - spin_unlock(&inode->i_lock); - nfs_cancel_commit_list(&head); - ret = pages; - spin_lock(&inode->i_lock); - continue; - } - pages += nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&inode->i_lock); - ret = nfs_commit_list(inode, &head, how); - spin_lock(&inode->i_lock); - } while (ret >= 0); - spin_unlock(&inode->i_lock); + ret = nfs_commit_inode(inode, flags); + if (ret >= 0) { + if (wbc->sync_mode == WB_SYNC_NONE) { + if (ret < wbc->nr_to_write) + wbc->nr_to_write -= ret; + else + wbc->nr_to_write = 0; + } + return 0; + } +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } - -static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) +#else +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { - int ret; - - ret = nfs_writepages(mapping, wbc); - if (ret < 0) - goto out; - ret = nfs_sync_mapping_wait(mapping, wbc, how); - if (ret < 0) - goto out; return 0; -out: - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return ret; } +#endif -/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ -static int nfs_write_mapping(struct address_space *mapping, int how) +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct writeback_control wbc = { - .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_NONE, - .nr_to_write = LONG_MAX, - .for_writepages = 1, - .range_cyclic = 1, - }; - int ret; - - ret = __nfs_write_mapping(mapping, &wbc, how); - if (ret < 0) - return ret; - wbc.sync_mode = 
WB_SYNC_ALL; - return __nfs_write_mapping(mapping, &wbc, how); + return nfs_commit_unstable_pages(inode, wbc); } +EXPORT_SYMBOL_GPL(nfs_write_inode); /* * flush the inode to disk. */ int nfs_wb_all(struct inode *inode) { - return nfs_write_mapping(inode->i_mapping, 0); -} + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + int ret; -int nfs_wb_nocommit(struct inode *inode) -{ - return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT); + trace_nfs_writeback_inode_enter(inode); + + ret = sync_inode(inode, &wbc); + + trace_nfs_writeback_inode_exit(inode, ret); + return ret; } +EXPORT_SYMBOL_GPL(nfs_wb_all); int nfs_wb_page_cancel(struct inode *inode, struct page *page) { struct nfs_page *req; - loff_t range_start = page_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); - struct writeback_control wbc = { - .bdi = page->mapping->backing_dev_info, - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = range_start, - .range_end = range_end, - }; int ret = 0; - BUG_ON(!PageLocked(page)); - for (;;) { - req = nfs_page_find_request(page); - if (req == NULL) - goto out; - if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { - nfs_release_request(req); - break; - } - if (nfs_lock_request_dontget(req)) { - nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); - nfs_unlock_request(req); - break; - } - ret = nfs_wait_on_request(req); - if (ret < 0) - goto out; + wait_on_page_writeback(page); + + /* blocking call to cancel all requests and join to a single (head) + * request */ + req = nfs_lock_and_join_requests(page, false); + + if (IS_ERR(req)) { + ret = PTR_ERR(req); + } else if (req) { + /* all requests from this page have been cancelled by + * nfs_lock_and_join_requests, so just remove the head + * request from the inode / page_private 
pointer and + * release it */ + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_and_release_request(req); } - if (!PagePrivate(page)) - return 0; - ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE); -out: + return ret; } -static int nfs_wb_page_priority(struct inode *inode, struct page *page, - int how) +/* + * Write back all requests on one page - we do this before reading it. + */ +int nfs_wb_page(struct inode *inode, struct page *page) { - loff_t range_start = page_offset(page); + loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); struct writeback_control wbc = { - .bdi = page->mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, + .nr_to_write = 0, .range_start = range_start, .range_end = range_end, }; int ret; - BUG_ON(!PageLocked(page)); - if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + trace_nfs_writeback_page_enter(inode); + + for (;;) { + wait_on_page_writeback(page); + if (clear_page_dirty_for_io(page)) { + ret = nfs_writepage_locked(page, &wbc); + if (ret < 0) + goto out_error; + continue; + } + ret = 0; + if (!PagePrivate(page)) + break; + ret = nfs_commit_inode(inode, FLUSH_SYNC); if (ret < 0) - goto out; + goto out_error; } - if (!PagePrivate(page)) - return 0; - ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); - if (ret >= 0) - return 0; -out: - __mark_inode_dirty(inode, I_DIRTY_PAGES); +out_error: + trace_nfs_writeback_page_exit(inode, ret); return ret; } -/* - * Write back all requests on one page - we do this before reading it. 
- */ -int nfs_wb_page(struct inode *inode, struct page* page) +#ifdef CONFIG_MIGRATION +int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) { - return nfs_wb_page_priority(inode, page, FLUSH_STABLE); + /* + * If PagePrivate is set, then the page is currently associated with + * an in-progress read or write request. Don't try to migrate it. + * + * FIXME: we could do this in principle, but we'll need a way to ensure + * that we can safely release the inode reference while holding + * the page lock. + */ + if (PagePrivate(page)) + return -EBUSY; + + if (!nfs_fscache_release_page(page, GFP_KERNEL)) + return -EBUSY; + + return migrate_page(mapping, newpage, page, mode); } +#endif int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", - sizeof(struct nfs_write_data), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_wdata_cachep == NULL) @@ -1523,12 +1893,19 @@ int __init nfs_init_writepagecache(void) nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE, nfs_wdata_cachep); if (nfs_wdata_mempool == NULL) - return -ENOMEM; + goto out_destroy_write_cache; + + nfs_cdata_cachep = kmem_cache_create("nfs_commit_data", + sizeof(struct nfs_commit_data), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_cdata_cachep == NULL) + goto out_destroy_write_mempool; nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, - nfs_wdata_cachep); + nfs_cdata_cachep); if (nfs_commit_mempool == NULL) - return -ENOMEM; + goto out_destroy_commit_cache; /* * NFS congestion size, scale with available memory. 
@@ -1551,12 +1928,30 @@ int __init nfs_init_writepagecache(void) nfs_congestion_kb = 256*1024; return 0; + +out_destroy_commit_cache: + kmem_cache_destroy(nfs_cdata_cachep); +out_destroy_write_mempool: + mempool_destroy(nfs_wdata_mempool); +out_destroy_write_cache: + kmem_cache_destroy(nfs_wdata_cachep); + return -ENOMEM; } void nfs_destroy_writepagecache(void) { mempool_destroy(nfs_commit_mempool); + kmem_cache_destroy(nfs_cdata_cachep); mempool_destroy(nfs_wdata_mempool); kmem_cache_destroy(nfs_wdata_cachep); } +static const struct nfs_rw_ops nfs_rw_write_ops = { + .rw_mode = FMODE_WRITE, + .rw_alloc_header = nfs_writehdr_alloc, + .rw_free_header = nfs_writehdr_free, + .rw_release = nfs_writeback_release_common, + .rw_done = nfs_writeback_done, + .rw_result = nfs_writeback_result, + .rw_initiate = nfs_initiate_write, +}; |
