Diffstat (limited to 'net/sunrpc')
-rw-r--r--  net/sunrpc/Kconfig                          41
-rw-r--r--  net/sunrpc/Makefile                          3
-rw-r--r--  net/sunrpc/addr.c                            3
-rw-r--r--  net/sunrpc/auth.c                          157
-rw-r--r--  net/sunrpc/auth_generic.c                   98
-rw-r--r--  net/sunrpc/auth_gss/Makefile                 3
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c             636
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_keys.c         17
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_mech.c         13
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_unseal.c        8
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_wrap.c         12
-rw-r--r--  net/sunrpc/auth_gss/gss_mech_switch.c      135
-rw-r--r--  net/sunrpc/auth_gss/gss_rpc_upcall.c       382
-rw-r--r--  net/sunrpc/auth_gss/gss_rpc_upcall.h        48
-rw-r--r--  net/sunrpc/auth_gss/gss_rpc_xdr.c          839
-rw-r--r--  net/sunrpc/auth_gss/gss_rpc_xdr.h          267
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c          393
-rw-r--r--  net/sunrpc/auth_null.c                       6
-rw-r--r--  net/sunrpc/auth_unix.c                      42
-rw-r--r--  net/sunrpc/backchannel_rqst.c              103
-rw-r--r--  net/sunrpc/cache.c                         211
-rw-r--r--  net/sunrpc/clnt.c                          571
-rw-r--r--  net/sunrpc/netns.h                           9
-rw-r--r--  net/sunrpc/rpc_pipe.c                      426
-rw-r--r--  net/sunrpc/rpcb_clnt.c                      49
-rw-r--r--  net/sunrpc/sched.c                          66
-rw-r--r--  net/sunrpc/socklib.c                         3
-rw-r--r--  net/sunrpc/stats.c                           4
-rw-r--r--  net/sunrpc/sunrpc.h                         13
-rw-r--r--  net/sunrpc/sunrpc_syms.c                     8
-rw-r--r--  net/sunrpc/svc.c                            42
-rw-r--r--  net/sunrpc/svc_xprt.c                       92
-rw-r--r--  net/sunrpc/svcauth.c                         5
-rw-r--r--  net/sunrpc/svcauth_unix.c                   69
-rw-r--r--  net/sunrpc/svcsock.c                        66
-rw-r--r--  net/sunrpc/sysctl.c                         10
-rw-r--r--  net/sunrpc/xdr.c                           246
-rw-r--r--  net/sunrpc/xprt.c                          204
-rw-r--r--  net/sunrpc/xprtrdma/Makefile                 4
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c             127
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma.c               8
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_marshal.c      20
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c    653
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c      232
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c    72
-rw-r--r--  net/sunrpc/xprtrdma/transport.c            112
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c                753
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h             23
-rw-r--r--  net/sunrpc/xprtsock.c                      290
49 files changed, 5296 insertions, 2298 deletions
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 03d03e37a7d..0754d0f466d 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -3,28 +3,15 @@ config SUNRPC
config SUNRPC_GSS
tristate
+ select OID_REGISTRY
config SUNRPC_BACKCHANNEL
bool
depends on SUNRPC
-config SUNRPC_XPRT_RDMA
- tristate
- depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS && EXPERIMENTAL
- default SUNRPC && INFINIBAND
- help
- This option allows the NFS client and server to support
- an RDMA-enabled transport.
-
- To compile RPC client RDMA transport support as a module,
- choose M here: the module will be called xprtrdma.
-
- If unsure, say N.
-
config SUNRPC_SWAP
bool
depends on SUNRPC
- select NETVM
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism"
@@ -57,3 +44,29 @@ config SUNRPC_DEBUG
but makes troubleshooting NFS issues significantly harder.
If unsure, say Y.
+
+config SUNRPC_XPRT_RDMA_CLIENT
+ tristate "RPC over RDMA Client Support"
+ depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
+ default SUNRPC && INFINIBAND
+ help
+ This option allows the NFS client to support an RDMA-enabled
+ transport.
+
+ To compile RPC client RDMA transport support as a module,
+ choose M here: the module will be called xprtrdma.
+
+ If unsure, say N.
+
+config SUNRPC_XPRT_RDMA_SERVER
+ tristate "RPC over RDMA Server Support"
+ depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
+ default SUNRPC && INFINIBAND
+ help
+ This option allows the NFS server to support an RDMA-enabled
+ transport.
+
+ To compile RPC server RDMA transport support as a module,
+ choose M here: the module will be called svcrdma.
+
+ If unsure, say N.
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 8209a0411bc..e5a7a1cac8f 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -5,7 +5,8 @@
obj-$(CONFIG_SUNRPC) += sunrpc.o
obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
-obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
+
+obj-y += xprtrdma/
sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
auth.o auth_null.o auth_unix.o auth_generic.o \
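
[Editor's note: since xprtrdma/ is now entered unconditionally, the client/server selection presumably moves into that subdirectory's own Makefile, keyed off the two new Kconfig symbols introduced above. A sketch of what the diffstat's 4-line change to net/sunrpc/xprtrdma/Makefile would look like under that assumption:

    # net/sunrpc/xprtrdma/Makefile (sketch, not part of the quoted hunks)
    obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
    obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
]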
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index d11418f97f1..a622ad64acd 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -17,7 +17,8 @@
*/
#include <net/ipv6.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/msg_prot.h>
#include <linux/slab.h>
#include <linux/export.h>
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index b5c067bccc4..f7736671742 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -82,7 +82,7 @@ MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
static u32
pseudoflavor_to_flavor(u32 flavor) {
- if (flavor >= RPC_AUTH_MAXFLAVOR)
+ if (flavor > RPC_AUTH_MAXFLAVOR)
return RPC_AUTH_GSS;
return flavor;
}
@@ -124,6 +124,79 @@ rpcauth_unregister(const struct rpc_authops *ops)
EXPORT_SYMBOL_GPL(rpcauth_unregister);
/**
+ * rpcauth_get_pseudoflavor - check if security flavor is supported
+ * @flavor: a security flavor
+ * @info: a GSS mech OID, quality of protection, and service value
+ *
+ * Verifies that an appropriate kernel module is available or already loaded.
+ * Returns an equivalent pseudoflavor, or RPC_AUTH_MAXFLAVOR if "flavor" is
+ * not supported locally.
+ */
+rpc_authflavor_t
+rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info)
+{
+ const struct rpc_authops *ops;
+ rpc_authflavor_t pseudoflavor;
+
+ ops = auth_flavors[flavor];
+ if (ops == NULL)
+ request_module("rpc-auth-%u", flavor);
+ spin_lock(&rpc_authflavor_lock);
+ ops = auth_flavors[flavor];
+ if (ops == NULL || !try_module_get(ops->owner)) {
+ spin_unlock(&rpc_authflavor_lock);
+ return RPC_AUTH_MAXFLAVOR;
+ }
+ spin_unlock(&rpc_authflavor_lock);
+
+ pseudoflavor = flavor;
+ if (ops->info2flavor != NULL)
+ pseudoflavor = ops->info2flavor(info);
+
+ module_put(ops->owner);
+ return pseudoflavor;
+}
+EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor);
+
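[Editor's note: as a usage illustration (hypothetical caller, not part of this patch), a consumer holding a GSS tuple returned by a server could ask whether it is usable locally and fall back otherwise:

    /* Hypothetical sketch: pick a usable flavor for a server's GSS tuple. */
    static rpc_authflavor_t choose_flavor(struct rpcsec_gss_info *info)
    {
    	rpc_authflavor_t pf;

    	pf = rpcauth_get_pseudoflavor(RPC_AUTH_GSS, info);
    	if (pf == RPC_AUTH_MAXFLAVOR)	/* tuple not supported locally */
    		return RPC_AUTH_UNIX;	/* illustrative fallback only */
    	return pf;
    }
]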
+/**
+ * rpcauth_get_gssinfo - find GSS tuple matching a GSS pseudoflavor
+ * @pseudoflavor: GSS pseudoflavor to match
+ * @info: rpcsec_gss_info structure to fill in
+ *
+ * Returns zero and fills in "info" if pseudoflavor matches a
+ * supported mechanism.
+ */
+int
+rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info)
+{
+ rpc_authflavor_t flavor = pseudoflavor_to_flavor(pseudoflavor);
+ const struct rpc_authops *ops;
+ int result;
+
+ if (flavor >= RPC_AUTH_MAXFLAVOR)
+ return -EINVAL;
+
+ ops = auth_flavors[flavor];
+ if (ops == NULL)
+ request_module("rpc-auth-%u", flavor);
+ spin_lock(&rpc_authflavor_lock);
+ ops = auth_flavors[flavor];
+ if (ops == NULL || !try_module_get(ops->owner)) {
+ spin_unlock(&rpc_authflavor_lock);
+ return -ENOENT;
+ }
+ spin_unlock(&rpc_authflavor_lock);
+
+ result = -ENOENT;
+ if (ops->flavor2info != NULL)
+ result = ops->flavor2info(pseudoflavor, info);
+
+ module_put(ops->owner);
+ return result;
+}
+EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
+
+/**
* rpcauth_list_flavors - discover registered flavors and pseudoflavors
* @array: array to fill in
* @size: size of "array"
@@ -177,11 +250,11 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size)
EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
struct rpc_auth *
-rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
+rpcauth_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
struct rpc_auth *auth;
const struct rpc_authops *ops;
- u32 flavor = pseudoflavor_to_flavor(pseudoflavor);
+ u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor);
auth = ERR_PTR(-EINVAL);
if (flavor >= RPC_AUTH_MAXFLAVOR)
@@ -196,7 +269,7 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
goto out;
}
spin_unlock(&rpc_authflavor_lock);
- auth = ops->create(clnt, pseudoflavor);
+ auth = ops->create(args, clnt);
module_put(ops->owner);
if (IS_ERR(auth))
return auth;
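
[Editor's note: the new signature implies that struct rpc_auth_create_args carries at least the pseudoflavor plus the optional GSS target name consumed by gss_create_new() further down. A sketch of the assumed definition (the real one lives in include/linux/sunrpc/auth.h):

    struct rpc_auth_create_args {
    	rpc_authflavor_t pseudoflavor;
    	const char	*target_name;
    };
]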
@@ -223,7 +296,7 @@ static void
rpcauth_unhash_cred_locked(struct rpc_cred *cred)
{
hlist_del_rcu(&cred->cr_hash);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
}
@@ -270,6 +343,27 @@ out_nocache:
EXPORT_SYMBOL_GPL(rpcauth_init_credcache);
/*
+ * Set up a credential key lifetime timeout notification
+ */
+int
+rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
+{
+ if (!cred->cr_auth->au_ops->key_timeout)
+ return 0;
+ return cred->cr_auth->au_ops->key_timeout(auth, cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
+
+bool
+rpcauth_cred_key_to_expire(struct rpc_cred *cred)
+{
+ if (!cred->cr_ops->crkey_to_expire)
+ return false;
+ return cred->cr_ops->crkey_to_expire(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
+
+/*
* Destroy a list of credentials
*/
static inline
@@ -340,12 +434,13 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
/*
* Remove stale credentials. Avoid sleeping inside the loop.
*/
-static int
+static long
rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
{
spinlock_t *cache_lock;
struct rpc_cred *cred, *next;
unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
+ long freed = 0;
list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) {
@@ -357,10 +452,11 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
*/
if (time_in_range(cred->cr_expire, expired, jiffies) &&
test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
- return 0;
+ break;
list_del_init(&cred->cr_lru);
number_cred_unused--;
+ freed++;
if (atomic_read(&cred->cr_count) != 0)
continue;
@@ -373,29 +469,39 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
}
spin_unlock(cache_lock);
}
- return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+ return freed;
}
/*
* Run memory cache shrinker.
*/
-static int
-rpcauth_cache_shrinker(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
LIST_HEAD(free);
- int res;
- int nr_to_scan = sc->nr_to_scan;
- gfp_t gfp_mask = sc->gfp_mask;
+ unsigned long freed;
+
+ if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+ return SHRINK_STOP;
- if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
- return (nr_to_scan == 0) ? 0 : -1;
+ /* nothing left, don't come back */
if (list_empty(&cred_unused))
- return 0;
+ return SHRINK_STOP;
+
spin_lock(&rpc_credcache_lock);
- res = rpcauth_prune_expired(&free, nr_to_scan);
+ freed = rpcauth_prune_expired(&free, sc->nr_to_scan);
spin_unlock(&rpc_credcache_lock);
rpcauth_destroy_credlist(&free);
- return res;
+
+ return freed;
+}
+
+static unsigned long
+rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
}
/*
@@ -407,15 +513,14 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
{
LIST_HEAD(free);
struct rpc_cred_cache *cache = auth->au_credcache;
- struct hlist_node *pos;
struct rpc_cred *cred = NULL,
*entry, *new;
unsigned int nr;
- nr = hash_long(acred->uid, cache->hashbits);
+ nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits);
rcu_read_lock();
- hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) {
+ hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
if (!entry->cr_ops->crmatch(acred, entry, flags))
continue;
spin_lock(&cache->lock);
@@ -439,7 +544,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
}
spin_lock(&cache->lock);
- hlist_for_each_entry(entry, pos, &cache->hashtable[nr], cr_hash) {
+ hlist_for_each_entry(entry, &cache->hashtable[nr], cr_hash) {
if (!entry->cr_ops->crmatch(acred, entry, flags))
continue;
cred = get_rpccred(entry);
@@ -487,6 +592,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
put_group_info(acred.group_info);
return ret;
}
+EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
void
rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
@@ -519,8 +625,8 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
{
struct rpc_auth *auth = task->tk_client->cl_auth;
struct auth_cred acred = {
- .uid = 0,
- .gid = 0,
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
};
dprintk("RPC: %5u looking up %s cred\n",
@@ -712,7 +818,8 @@ rpcauth_uptodatecred(struct rpc_task *task)
}
static struct shrinker rpc_cred_shrinker = {
- .shrink = rpcauth_cache_shrinker,
+ .count_objects = rpcauth_cache_shrink_count,
+ .scan_objects = rpcauth_cache_shrink_scan,
.seeks = DEFAULT_SEEKS,
};
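
[Editor's note: this is the split shrinker API: ->count_objects returns only an estimate of reclaimable entries, while ->scan_objects frees up to sc->nr_to_scan of them and returns the number actually freed, or SHRINK_STOP to abort the scan. A minimal registration pattern for reference (names hypothetical):

    static struct shrinker my_shrinker = {
    	.count_objects	= my_count,	/* estimate only, never frees */
    	.scan_objects	= my_scan,	/* frees; returns count or SHRINK_STOP */
    	.seeks		= DEFAULT_SEEKS,
    };
    /* ... then register_shrinker(&my_shrinker); at init time */
]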
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 6ed6f201b02..ed04869b2d4 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -18,8 +18,8 @@
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
-#define RPC_MACHINE_CRED_USERID ((uid_t)0)
-#define RPC_MACHINE_CRED_GROUPID ((gid_t)0)
+#define RPC_MACHINE_CRED_USERID GLOBAL_ROOT_UID
+#define RPC_MACHINE_CRED_GROUPID GLOBAL_ROOT_GID
struct generic_cred {
struct rpc_cred gc_base;
@@ -89,6 +89,7 @@ generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
gcred->acred.uid = acred->uid;
gcred->acred.gid = acred->gid;
gcred->acred.group_info = acred->group_info;
+ gcred->acred.ac_flags = 0;
if (gcred->acred.group_info != NULL)
get_group_info(gcred->acred.group_info);
gcred->acred.machine_cred = acred->machine_cred;
@@ -96,7 +97,9 @@ generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
dprintk("RPC: allocated %s cred %p for uid %d gid %d\n",
gcred->acred.machine_cred ? "machine" : "generic",
- gcred, acred->uid, acred->gid);
+ gcred,
+ from_kuid(&init_user_ns, acred->uid),
+ from_kgid(&init_user_ns, acred->gid));
return &gcred->gc_base;
}
@@ -129,8 +132,8 @@ machine_cred_match(struct auth_cred *acred, struct generic_cred *gcred, int flag
{
if (!gcred->acred.machine_cred ||
gcred->acred.principal != acred->principal ||
- gcred->acred.uid != acred->uid ||
- gcred->acred.gid != acred->gid)
+ !uid_eq(gcred->acred.uid, acred->uid) ||
+ !gid_eq(gcred->acred.gid, acred->gid))
return 0;
return 1;
}
@@ -147,8 +150,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
if (acred->machine_cred)
return machine_cred_match(acred, gcred, flags);
- if (gcred->acred.uid != acred->uid ||
- gcred->acred.gid != acred->gid ||
+ if (!uid_eq(gcred->acred.uid, acred->uid) ||
+ !gid_eq(gcred->acred.gid, acred->gid) ||
gcred->acred.machine_cred != 0)
goto out_nomatch;
@@ -180,11 +183,78 @@ void rpc_destroy_generic_auth(void)
rpcauth_destroy_credcache(&generic_auth);
}
+/*
+ * Test the current time (now) against the underlying credential key expiry
+ * minus a timeout, and set up notification.
+ *
+ * The normal case:
+ * If 'now' is before the key expiry minus RPC_KEY_EXPIRE_TIMEO, set
+ * the RPC_CRED_NOTIFY_TIMEOUT flag so that the underlying credential's
+ * rpc_credops crmatch routine notifies this generic cred when its key
+ * expiration is within RPC_KEY_EXPIRE_TIMEO, and return 0.
+ *
+ * The error case:
+ * If the underlying cred lookup fails, return -EACCES.
+ *
+ * The 'almost' error case:
+ * If 'now' is within key expiry minus RPC_KEY_EXPIRE_TIMEO, but not within
+ * key expiry minus RPC_KEY_EXPIRE_FAIL, set the RPC_CRED_KEY_EXPIRE_SOON bit
+ * on the acred ac_flags and return 0.
+ */
+static int
+generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
+{
+ struct auth_cred *acred = &container_of(cred, struct generic_cred,
+ gc_base)->acred;
+ struct rpc_cred *tcred;
+ int ret = 0;
+
+ /* Fast track for non crkey_timeout (no key) underlying credentials */
+ if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags))
+ return 0;
+
+ /* Fast track for the normal case */
+ if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags))
+ return 0;
+
+ /* lookup_cred either returns a valid referenced rpc_cred, or PTR_ERR */
+ tcred = auth->au_ops->lookup_cred(auth, acred, 0);
+ if (IS_ERR(tcred))
+ return -EACCES;
+
+ if (!tcred->cr_ops->crkey_timeout) {
+ set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
+ ret = 0;
+ goto out_put;
+ }
+
+ /* Test for the almost error case */
+ ret = tcred->cr_ops->crkey_timeout(tcred);
+ if (ret != 0) {
+ set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
+ ret = 0;
+ } else {
+ /* In case underlying cred key has been reset */
+ if (test_and_clear_bit(RPC_CRED_KEY_EXPIRE_SOON,
+ &acred->ac_flags))
+ dprintk("RPC: UID %d Credential key reset\n",
+ from_kuid(&init_user_ns, tcred->cr_uid));
+ /* set up fasttrack for the normal case */
+ set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
+ }
+
+out_put:
+ put_rpccred(tcred);
+ return ret;
+}
+
static const struct rpc_authops generic_auth_ops = {
.owner = THIS_MODULE,
.au_name = "Generic",
.lookup_cred = generic_lookup_cred,
.crcreate = generic_create_cred,
+ .key_timeout = generic_key_timeout,
};
static struct rpc_auth generic_auth = {
@@ -192,9 +262,23 @@ static struct rpc_auth generic_auth = {
.au_count = ATOMIC_INIT(0),
};
+static bool generic_key_to_expire(struct rpc_cred *cred)
+{
+ struct auth_cred *acred = &container_of(cred, struct generic_cred,
+ gc_base)->acred;
+ bool ret;
+
+ get_rpccred(cred);
+ ret = test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
+ put_rpccred(cred);
+
+ return ret;
+}
+
static const struct rpc_credops generic_credops = {
.cr_name = "Generic cred",
.crdestroy = generic_destroy_cred,
.crbind = generic_bind_cred,
.crmatch = generic_match,
+ .crkey_to_expire = generic_key_to_expire,
};
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index 9e4cb59ef9f..14e9e53e63d 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -5,7 +5,8 @@
obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
auth_rpcgss-y := auth_gss.o gss_generic_token.o \
- gss_mech_switch.o svcauth_gss.o
+ gss_mech_switch.o svcauth_gss.o \
+ gss_rpc_upcall.o gss_rpc_xdr.o
obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 6e5c824b040..b6e440baccc 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -51,6 +51,9 @@
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/sunrpc/gss_api.h>
#include <asm/uaccess.h>
+#include <linux/hashtable.h>
+
+#include "../netns.h"
static const struct rpc_authops authgss_ops;
@@ -60,6 +63,9 @@ static const struct rpc_credops gss_nullops;
#define GSS_RETRY_EXPIRED 5
static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED;
+#define GSS_KEY_EXPIRE_TIMEO 240
+static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO;
+
#ifdef RPC_DEBUG
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
@@ -69,27 +75,40 @@ static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED;
* using integrity (two 4-byte integers): */
#define GSS_VERF_SLACK 100
+static DEFINE_HASHTABLE(gss_auth_hash_table, 4);
+static DEFINE_SPINLOCK(gss_auth_hash_lock);
+
+struct gss_pipe {
+ struct rpc_pipe_dir_object pdo;
+ struct rpc_pipe *pipe;
+ struct rpc_clnt *clnt;
+ const char *name;
+ struct kref kref;
+};
+
struct gss_auth {
struct kref kref;
+ struct hlist_node hash;
struct rpc_auth rpc_auth;
struct gss_api_mech *mech;
enum rpc_gss_svc service;
struct rpc_clnt *client;
+ struct net *net;
/*
* There are two upcall pipes; dentry[1], named "gssd", is used
* for the new text-based upcall; dentry[0] is named after the
* mechanism (for example, "krb5") and exists for
* backwards-compatibility with older gssd's.
*/
- struct rpc_pipe *pipe[2];
+ struct gss_pipe *gss_pipe[2];
+ const char *target_name;
};
/* pipe_version >= 0 if and only if someone has a pipe open. */
-static int pipe_version = -1;
-static atomic_t pipe_users = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(pipe_version_lock);
static struct rpc_wait_queue pipe_version_rpc_waitqueue;
static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue);
+static void gss_put_auth(struct gss_auth *gss_auth);
static void gss_free_ctx(struct gss_cl_ctx *);
static const struct rpc_pipe_ops gss_upcall_ops_v0;
@@ -124,7 +143,7 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
gss_get_ctx(ctx);
rcu_assign_pointer(gss_cred->gc_ctx, ctx);
set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
}
@@ -238,7 +257,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
p = ERR_PTR(-EFAULT);
goto err;
}
- ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, GFP_NOFS);
+ ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS);
if (ret < 0) {
p = ERR_PTR(ret);
goto err;
@@ -247,8 +266,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
__func__, ctx->gc_expiry, now, timeout);
return q;
err:
- dprintk("RPC: %s returns %ld gc_expiry %lu now %lu timeout %u\n",
- __func__, -PTR_ERR(p), ctx->gc_expiry, now, timeout);
+ dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p));
return p;
}
@@ -256,7 +274,7 @@ err:
struct gss_upcall_msg {
atomic_t count;
- uid_t uid;
+ kuid_t uid;
struct rpc_pipe_msg msg;
struct list_head list;
struct gss_auth *auth;
@@ -267,24 +285,27 @@ struct gss_upcall_msg {
char databuf[UPCALL_BUF_LEN];
};
-static int get_pipe_version(void)
+static int get_pipe_version(struct net *net)
{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int ret;
spin_lock(&pipe_version_lock);
- if (pipe_version >= 0) {
- atomic_inc(&pipe_users);
- ret = pipe_version;
+ if (sn->pipe_version >= 0) {
+ atomic_inc(&sn->pipe_users);
+ ret = sn->pipe_version;
} else
ret = -EAGAIN;
spin_unlock(&pipe_version_lock);
return ret;
}
-static void put_pipe_version(void)
+static void put_pipe_version(struct net *net)
{
- if (atomic_dec_and_lock(&pipe_users, &pipe_version_lock)) {
- pipe_version = -1;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) {
+ sn->pipe_version = -1;
spin_unlock(&pipe_version_lock);
}
}
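
[Editor's note: the version bookkeeping consulted here is assumed to live in the per-network-namespace struct sunrpc_net (consistent with the 9-line netns.h change in the diffstat). The fields this code relies on, in sketch form with other members omitted:

    struct sunrpc_net {
    	int		pipe_version;	/* -1 until some gss pipe is opened */
    	atomic_t	pipe_users;	/* openers holding the current version */
    };
]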
@@ -292,22 +313,24 @@ static void put_pipe_version(void)
static void
gss_release_msg(struct gss_upcall_msg *gss_msg)
{
+ struct net *net = gss_msg->auth->net;
if (!atomic_dec_and_test(&gss_msg->count))
return;
- put_pipe_version();
+ put_pipe_version(net);
BUG_ON(!list_empty(&gss_msg->list));
if (gss_msg->ctx != NULL)
gss_put_ctx(gss_msg->ctx);
rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue);
+ gss_put_auth(gss_msg->auth);
kfree(gss_msg);
}
static struct gss_upcall_msg *
-__gss_find_upcall(struct rpc_pipe *pipe, uid_t uid)
+__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid)
{
struct gss_upcall_msg *pos;
list_for_each_entry(pos, &pipe->in_downcall, list) {
- if (pos->uid != uid)
+ if (!uid_eq(pos->uid, uid))
continue;
atomic_inc(&pos->count);
dprintk("RPC: %s found msg %p\n", __func__, pos);
@@ -395,89 +418,109 @@ gss_upcall_callback(struct rpc_task *task)
static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg)
{
- gss_msg->msg.data = &gss_msg->uid;
- gss_msg->msg.len = sizeof(gss_msg->uid);
+ uid_t uid = from_kuid(&init_user_ns, gss_msg->uid);
+ memcpy(gss_msg->databuf, &uid, sizeof(uid));
+ gss_msg->msg.data = gss_msg->databuf;
+ gss_msg->msg.len = sizeof(uid);
+
+ BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf));
}
-static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
- struct rpc_clnt *clnt,
- const char *service_name)
+static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
+ const char *service_name,
+ const char *target_name)
{
struct gss_api_mech *mech = gss_msg->auth->mech;
char *p = gss_msg->databuf;
- int len = 0;
-
- gss_msg->msg.len = sprintf(gss_msg->databuf, "mech=%s uid=%d ",
- mech->gm_name,
- gss_msg->uid);
- p += gss_msg->msg.len;
- if (clnt->cl_principal) {
- len = sprintf(p, "target=%s ", clnt->cl_principal);
+ size_t buflen = sizeof(gss_msg->databuf);
+ int len;
+
+ len = scnprintf(p, buflen, "mech=%s uid=%d ", mech->gm_name,
+ from_kuid(&init_user_ns, gss_msg->uid));
+ buflen -= len;
+ p += len;
+ gss_msg->msg.len = len;
+ if (target_name) {
+ len = scnprintf(p, buflen, "target=%s ", target_name);
+ buflen -= len;
p += len;
gss_msg->msg.len += len;
}
if (service_name != NULL) {
- len = sprintf(p, "service=%s ", service_name);
+ len = scnprintf(p, buflen, "service=%s ", service_name);
+ buflen -= len;
p += len;
gss_msg->msg.len += len;
}
if (mech->gm_upcall_enctypes) {
- len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes);
+ len = scnprintf(p, buflen, "enctypes=%s ",
+ mech->gm_upcall_enctypes);
+ buflen -= len;
p += len;
gss_msg->msg.len += len;
}
- len = sprintf(p, "\n");
+ len = scnprintf(p, buflen, "\n");
+ if (len == 0)
+ goto out_overflow;
gss_msg->msg.len += len;
gss_msg->msg.data = gss_msg->databuf;
- BUG_ON(gss_msg->msg.len > UPCALL_BUF_LEN);
-}
-
-static void gss_encode_msg(struct gss_upcall_msg *gss_msg,
- struct rpc_clnt *clnt,
- const char *service_name)
-{
- if (pipe_version == 0)
- gss_encode_v0_msg(gss_msg);
- else /* pipe_version == 1 */
- gss_encode_v1_msg(gss_msg, clnt, service_name);
+ return 0;
+out_overflow:
+ WARN_ON_ONCE(1);
+ return -ENOMEM;
}
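
[Editor's note: for reference, a v1 upcall encoded as above is a single space-delimited text line read by gssd; with hypothetical values it would look like:

    mech=krb5 uid=1000 target=nfs@server.example.com service=nfs enctypes=18,17,16,23,3,1,2 

where each field is followed by a space and the line ends with a newline.]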
static struct gss_upcall_msg *
-gss_alloc_msg(struct gss_auth *gss_auth, struct rpc_clnt *clnt,
- uid_t uid, const char *service_name)
+gss_alloc_msg(struct gss_auth *gss_auth,
+ kuid_t uid, const char *service_name)
{
struct gss_upcall_msg *gss_msg;
int vers;
+ int err = -ENOMEM;
gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS);
if (gss_msg == NULL)
- return ERR_PTR(-ENOMEM);
- vers = get_pipe_version();
- if (vers < 0) {
- kfree(gss_msg);
- return ERR_PTR(vers);
- }
- gss_msg->pipe = gss_auth->pipe[vers];
+ goto err;
+ vers = get_pipe_version(gss_auth->net);
+ err = vers;
+ if (err < 0)
+ goto err_free_msg;
+ gss_msg->pipe = gss_auth->gss_pipe[vers]->pipe;
INIT_LIST_HEAD(&gss_msg->list);
rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq");
init_waitqueue_head(&gss_msg->waitqueue);
atomic_set(&gss_msg->count, 1);
gss_msg->uid = uid;
gss_msg->auth = gss_auth;
- gss_encode_msg(gss_msg, clnt, service_name);
+ switch (vers) {
+ case 0:
+ gss_encode_v0_msg(gss_msg);
+ break;
+ default:
+ err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
+ if (err)
+ goto err_put_pipe_version;
+ }
+ kref_get(&gss_auth->kref);
return gss_msg;
+err_put_pipe_version:
+ put_pipe_version(gss_auth->net);
+err_free_msg:
+ kfree(gss_msg);
+err:
+ return ERR_PTR(err);
}
static struct gss_upcall_msg *
-gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cred *cred)
+gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred)
{
struct gss_cred *gss_cred = container_of(cred,
struct gss_cred, gc_base);
struct gss_upcall_msg *gss_new, *gss_msg;
- uid_t uid = cred->cr_uid;
+ kuid_t uid = cred->cr_uid;
- gss_new = gss_alloc_msg(gss_auth, clnt, uid, gss_cred->gc_principal);
+ gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal);
if (IS_ERR(gss_new))
return gss_new;
gss_msg = gss_add_msg(gss_new);
@@ -494,14 +537,7 @@ gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cr
static void warn_gssd(void)
{
- static unsigned long ratelimit;
- unsigned long now = jiffies;
-
- if (time_after(now, ratelimit)) {
- printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n"
- "Please check user daemon is running.\n");
- ratelimit = now + 15*HZ;
- }
+ dprintk("AUTH_GSS upcall failed. Please check user daemon is running.\n");
}
static inline int
@@ -517,8 +553,8 @@ gss_refresh_upcall(struct rpc_task *task)
int err = 0;
dprintk("RPC: %5u %s for uid %u\n",
- task->tk_pid, __func__, cred->cr_uid);
- gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred);
+ task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_uid));
+ gss_msg = gss_setup_upcall(gss_auth, cred);
if (PTR_ERR(gss_msg) == -EAGAIN) {
/* XXX: warning on the first, under the assumption we
* shouldn't normally hit this case on a refresh. */
@@ -549,30 +585,40 @@ gss_refresh_upcall(struct rpc_task *task)
gss_release_msg(gss_msg);
out:
dprintk("RPC: %5u %s for uid %u result %d\n",
- task->tk_pid, __func__, cred->cr_uid, err);
+ task->tk_pid, __func__,
+ from_kuid(&init_user_ns, cred->cr_uid), err);
return err;
}
static inline int
gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
{
+ struct net *net = gss_auth->net;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
struct rpc_pipe *pipe;
struct rpc_cred *cred = &gss_cred->gc_base;
struct gss_upcall_msg *gss_msg;
DEFINE_WAIT(wait);
- int err = 0;
+ int err;
- dprintk("RPC: %s for uid %u\n", __func__, cred->cr_uid);
+ dprintk("RPC: %s for uid %u\n",
+ __func__, from_kuid(&init_user_ns, cred->cr_uid));
retry:
- gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred);
+ err = 0;
+ /* if gssd is down, just skip upcalling altogether */
+ if (!gssd_running(net)) {
+ warn_gssd();
+ return -EACCES;
+ }
+ gss_msg = gss_setup_upcall(gss_auth, cred);
if (PTR_ERR(gss_msg) == -EAGAIN) {
err = wait_event_interruptible_timeout(pipe_version_waitqueue,
- pipe_version >= 0, 15*HZ);
- if (pipe_version < 0) {
+ sn->pipe_version >= 0, 15 * HZ);
+ if (sn->pipe_version < 0) {
warn_gssd();
err = -EACCES;
}
- if (err)
+ if (err < 0)
goto out;
goto retry;
}
@@ -604,7 +650,7 @@ out_intr:
gss_release_msg(gss_msg);
out:
dprintk("RPC: %s for uid %u result %d\n",
- __func__, cred->cr_uid, err);
+ __func__, from_kuid(&init_user_ns, cred->cr_uid), err);
return err;
}
@@ -616,9 +662,10 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
const void *p, *end;
void *buf;
struct gss_upcall_msg *gss_msg;
- struct rpc_pipe *pipe = RPC_I(filp->f_dentry->d_inode)->pipe;
+ struct rpc_pipe *pipe = RPC_I(file_inode(filp))->pipe;
struct gss_cl_ctx *ctx;
- uid_t uid;
+ uid_t id;
+ kuid_t uid;
ssize_t err = -EFBIG;
if (mlen > MSG_BUF_MAXSIZE)
@@ -633,12 +680,18 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
goto err;
end = (const void *)((char *)buf + mlen);
- p = simple_get_bytes(buf, end, &uid, sizeof(uid));
+ p = simple_get_bytes(buf, end, &id, sizeof(id));
if (IS_ERR(p)) {
err = PTR_ERR(p);
goto err;
}
+ uid = make_kuid(&init_user_ns, id);
+ if (!uid_valid(uid)) {
+ err = -EINVAL;
+ goto err;
+ }
+
err = -ENOMEM;
ctx = gss_alloc_context();
if (ctx == NULL)
@@ -696,20 +749,22 @@ out:
static int gss_pipe_open(struct inode *inode, int new_version)
{
+ struct net *net = inode->i_sb->s_fs_info;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int ret = 0;
spin_lock(&pipe_version_lock);
- if (pipe_version < 0) {
+ if (sn->pipe_version < 0) {
/* First open of any gss pipe determines the version: */
- pipe_version = new_version;
+ sn->pipe_version = new_version;
rpc_wake_up(&pipe_version_rpc_waitqueue);
wake_up(&pipe_version_waitqueue);
- } else if (pipe_version != new_version) {
+ } else if (sn->pipe_version != new_version) {
/* Trying to open a pipe of a different version */
ret = -EBUSY;
goto out;
}
- atomic_inc(&pipe_users);
+ atomic_inc(&sn->pipe_users);
out:
spin_unlock(&pipe_version_lock);
return ret;
@@ -729,6 +784,7 @@ static int gss_pipe_open_v1(struct inode *inode)
static void
gss_pipe_release(struct inode *inode)
{
+ struct net *net = inode->i_sb->s_fs_info;
struct rpc_pipe *pipe = RPC_I(inode)->pipe;
struct gss_upcall_msg *gss_msg;
@@ -747,7 +803,7 @@ restart:
}
spin_unlock(&pipe->lock);
- put_pipe_version();
+ put_pipe_version(net);
}
static void
@@ -766,83 +822,153 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
}
}
-static void gss_pipes_dentries_destroy(struct rpc_auth *auth)
+static void gss_pipe_dentry_destroy(struct dentry *dir,
+ struct rpc_pipe_dir_object *pdo)
{
- struct gss_auth *gss_auth;
+ struct gss_pipe *gss_pipe = pdo->pdo_data;
+ struct rpc_pipe *pipe = gss_pipe->pipe;
- gss_auth = container_of(auth, struct gss_auth, rpc_auth);
- if (gss_auth->pipe[0]->dentry)
- rpc_unlink(gss_auth->pipe[0]->dentry);
- if (gss_auth->pipe[1]->dentry)
- rpc_unlink(gss_auth->pipe[1]->dentry);
+ if (pipe->dentry != NULL) {
+ rpc_unlink(pipe->dentry);
+ pipe->dentry = NULL;
+ }
}
-static int gss_pipes_dentries_create(struct rpc_auth *auth)
+static int gss_pipe_dentry_create(struct dentry *dir,
+ struct rpc_pipe_dir_object *pdo)
{
- int err;
- struct gss_auth *gss_auth;
- struct rpc_clnt *clnt;
+ struct gss_pipe *p = pdo->pdo_data;
+ struct dentry *dentry;
- gss_auth = container_of(auth, struct gss_auth, rpc_auth);
- clnt = gss_auth->client;
-
- gss_auth->pipe[1]->dentry = rpc_mkpipe_dentry(clnt->cl_dentry,
- "gssd",
- clnt, gss_auth->pipe[1]);
- if (IS_ERR(gss_auth->pipe[1]->dentry))
- return PTR_ERR(gss_auth->pipe[1]->dentry);
- gss_auth->pipe[0]->dentry = rpc_mkpipe_dentry(clnt->cl_dentry,
- gss_auth->mech->gm_name,
- clnt, gss_auth->pipe[0]);
- if (IS_ERR(gss_auth->pipe[0]->dentry)) {
- err = PTR_ERR(gss_auth->pipe[0]->dentry);
- goto err_unlink_pipe_1;
- }
+ dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ p->pipe->dentry = dentry;
return 0;
+}
-err_unlink_pipe_1:
- rpc_unlink(gss_auth->pipe[1]->dentry);
- return err;
+static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = {
+ .create = gss_pipe_dentry_create,
+ .destroy = gss_pipe_dentry_destroy,
+};
+
+static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt,
+ const char *name,
+ const struct rpc_pipe_ops *upcall_ops)
+{
+ struct gss_pipe *p;
+ int err = -ENOMEM;
+
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (p == NULL)
+ goto err;
+ p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
+ if (IS_ERR(p->pipe)) {
+ err = PTR_ERR(p->pipe);
+ goto err_free_gss_pipe;
+ }
+ p->name = name;
+ p->clnt = clnt;
+ kref_init(&p->kref);
+ rpc_init_pipe_dir_object(&p->pdo,
+ &gss_pipe_dir_object_ops,
+ p);
+ return p;
+err_free_gss_pipe:
+ kfree(p);
+err:
+ return ERR_PTR(err);
+}
+
+struct gss_alloc_pdo {
+ struct rpc_clnt *clnt;
+ const char *name;
+ const struct rpc_pipe_ops *upcall_ops;
+};
+
+static int gss_pipe_match_pdo(struct rpc_pipe_dir_object *pdo, void *data)
+{
+ struct gss_pipe *gss_pipe;
+ struct gss_alloc_pdo *args = data;
+
+ if (pdo->pdo_ops != &gss_pipe_dir_object_ops)
+ return 0;
+ gss_pipe = container_of(pdo, struct gss_pipe, pdo);
+ if (strcmp(gss_pipe->name, args->name) != 0)
+ return 0;
+ if (!kref_get_unless_zero(&gss_pipe->kref))
+ return 0;
+ return 1;
+}
+
+static struct rpc_pipe_dir_object *gss_pipe_alloc_pdo(void *data)
+{
+ struct gss_pipe *gss_pipe;
+ struct gss_alloc_pdo *args = data;
+
+ gss_pipe = gss_pipe_alloc(args->clnt, args->name, args->upcall_ops);
+ if (!IS_ERR(gss_pipe))
+ return &gss_pipe->pdo;
+ return NULL;
}
-static void gss_pipes_dentries_destroy_net(struct rpc_clnt *clnt,
- struct rpc_auth *auth)
+static struct gss_pipe *gss_pipe_get(struct rpc_clnt *clnt,
+ const char *name,
+ const struct rpc_pipe_ops *upcall_ops)
{
struct net *net = rpc_net_ns(clnt);
- struct super_block *sb;
+ struct rpc_pipe_dir_object *pdo;
+ struct gss_alloc_pdo args = {
+ .clnt = clnt,
+ .name = name,
+ .upcall_ops = upcall_ops,
+ };
- sb = rpc_get_sb_net(net);
- if (sb) {
- if (clnt->cl_dentry)
- gss_pipes_dentries_destroy(auth);
- rpc_put_sb_net(net);
- }
+ pdo = rpc_find_or_alloc_pipe_dir_object(net,
+ &clnt->cl_pipedir_objects,
+ gss_pipe_match_pdo,
+ gss_pipe_alloc_pdo,
+ &args);
+ if (pdo != NULL)
+ return container_of(pdo, struct gss_pipe, pdo);
+ return ERR_PTR(-ENOMEM);
}
-static int gss_pipes_dentries_create_net(struct rpc_clnt *clnt,
- struct rpc_auth *auth)
+static void __gss_pipe_free(struct gss_pipe *p)
{
+ struct rpc_clnt *clnt = p->clnt;
struct net *net = rpc_net_ns(clnt);
- struct super_block *sb;
- int err = 0;
- sb = rpc_get_sb_net(net);
- if (sb) {
- if (clnt->cl_dentry)
- err = gss_pipes_dentries_create(auth);
- rpc_put_sb_net(net);
- }
- return err;
+ rpc_remove_pipe_dir_object(net,
+ &clnt->cl_pipedir_objects,
+ &p->pdo);
+ rpc_destroy_pipe_data(p->pipe);
+ kfree(p);
+}
+
+static void __gss_pipe_release(struct kref *kref)
+{
+ struct gss_pipe *p = container_of(kref, struct gss_pipe, kref);
+
+ __gss_pipe_free(p);
+}
+
+static void gss_pipe_free(struct gss_pipe *p)
+{
+ if (p != NULL)
+ kref_put(&p->kref, __gss_pipe_release);
}
/*
* NOTE: we have the opportunity to use different
* parameters based on the input flavor (which must be a pseudoflavor)
*/
-static struct rpc_auth *
-gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
+static struct gss_auth *
+gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
+ rpc_authflavor_t flavor = args->pseudoflavor;
struct gss_auth *gss_auth;
+ struct gss_pipe *gss_pipe;
struct rpc_auth * auth;
int err = -ENOMEM; /* XXX? */
@@ -852,17 +978,26 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
return ERR_PTR(err);
if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
goto out_dec;
+ INIT_HLIST_NODE(&gss_auth->hash);
+ gss_auth->target_name = NULL;
+ if (args->target_name) {
+ gss_auth->target_name = kstrdup(args->target_name, GFP_KERNEL);
+ if (gss_auth->target_name == NULL)
+ goto err_free;
+ }
gss_auth->client = clnt;
+ gss_auth->net = get_net(rpc_net_ns(clnt));
err = -EINVAL;
gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
if (!gss_auth->mech) {
- printk(KERN_WARNING "%s: Pseudoflavor %d not found!\n",
- __func__, flavor);
- goto err_free;
+ dprintk("RPC: Pseudoflavor %d not found!\n", flavor);
+ goto err_put_net;
}
gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
if (gss_auth->service == 0)
goto err_put_mech;
+ if (!gssd_running(gss_auth->net))
+ goto err_put_mech;
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
auth->au_rslack = GSS_VERF_SLACK >> 2;
@@ -871,42 +1006,41 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
atomic_set(&auth->au_count, 1);
kref_init(&gss_auth->kref);
+ err = rpcauth_init_credcache(auth);
+ if (err)
+ goto err_put_mech;
/*
* Note: if we created the old pipe first, then someone who
* examined the directory at the right moment might conclude
* that we supported only the old pipe. So we instead create
* the new pipe first.
*/
- gss_auth->pipe[1] = rpc_mkpipe_data(&gss_upcall_ops_v1,
- RPC_PIPE_WAIT_FOR_OPEN);
- if (IS_ERR(gss_auth->pipe[1])) {
- err = PTR_ERR(gss_auth->pipe[1]);
- goto err_put_mech;
+ gss_pipe = gss_pipe_get(clnt, "gssd", &gss_upcall_ops_v1);
+ if (IS_ERR(gss_pipe)) {
+ err = PTR_ERR(gss_pipe);
+ goto err_destroy_credcache;
}
+ gss_auth->gss_pipe[1] = gss_pipe;
- gss_auth->pipe[0] = rpc_mkpipe_data(&gss_upcall_ops_v0,
- RPC_PIPE_WAIT_FOR_OPEN);
- if (IS_ERR(gss_auth->pipe[0])) {
- err = PTR_ERR(gss_auth->pipe[0]);
+ gss_pipe = gss_pipe_get(clnt, gss_auth->mech->gm_name,
+ &gss_upcall_ops_v0);
+ if (IS_ERR(gss_pipe)) {
+ err = PTR_ERR(gss_pipe);
goto err_destroy_pipe_1;
}
- err = gss_pipes_dentries_create_net(clnt, auth);
- if (err)
- goto err_destroy_pipe_0;
- err = rpcauth_init_credcache(auth);
- if (err)
- goto err_unlink_pipes;
+ gss_auth->gss_pipe[0] = gss_pipe;
- return auth;
-err_unlink_pipes:
- gss_pipes_dentries_destroy_net(clnt, auth);
-err_destroy_pipe_0:
- rpc_destroy_pipe_data(gss_auth->pipe[0]);
+ return gss_auth;
err_destroy_pipe_1:
- rpc_destroy_pipe_data(gss_auth->pipe[1]);
+ gss_pipe_free(gss_auth->gss_pipe[1]);
+err_destroy_credcache:
+ rpcauth_destroy_credcache(auth);
err_put_mech:
gss_mech_put(gss_auth->mech);
+err_put_net:
+ put_net(gss_auth->net);
err_free:
+ kfree(gss_auth->target_name);
kfree(gss_auth);
out_dec:
module_put(THIS_MODULE);
@@ -916,10 +1050,11 @@ out_dec:
static void
gss_free(struct gss_auth *gss_auth)
{
- gss_pipes_dentries_destroy_net(gss_auth->client, &gss_auth->rpc_auth);
- rpc_destroy_pipe_data(gss_auth->pipe[0]);
- rpc_destroy_pipe_data(gss_auth->pipe[1]);
+ gss_pipe_free(gss_auth->gss_pipe[0]);
+ gss_pipe_free(gss_auth->gss_pipe[1]);
gss_mech_put(gss_auth->mech);
+ put_net(gss_auth->net);
+ kfree(gss_auth->target_name);
kfree(gss_auth);
module_put(THIS_MODULE);
@@ -934,17 +1069,118 @@ gss_free_callback(struct kref *kref)
}
static void
+gss_put_auth(struct gss_auth *gss_auth)
+{
+ kref_put(&gss_auth->kref, gss_free_callback);
+}
+
+static void
gss_destroy(struct rpc_auth *auth)
{
- struct gss_auth *gss_auth;
+ struct gss_auth *gss_auth = container_of(auth,
+ struct gss_auth, rpc_auth);
dprintk("RPC: destroying GSS authenticator %p flavor %d\n",
auth, auth->au_flavor);
+ if (hash_hashed(&gss_auth->hash)) {
+ spin_lock(&gss_auth_hash_lock);
+ hash_del(&gss_auth->hash);
+ spin_unlock(&gss_auth_hash_lock);
+ }
+
+ gss_pipe_free(gss_auth->gss_pipe[0]);
+ gss_auth->gss_pipe[0] = NULL;
+ gss_pipe_free(gss_auth->gss_pipe[1]);
+ gss_auth->gss_pipe[1] = NULL;
rpcauth_destroy_credcache(auth);
- gss_auth = container_of(auth, struct gss_auth, rpc_auth);
- kref_put(&gss_auth->kref, gss_free_callback);
+ gss_put_auth(gss_auth);
+}
+
+/*
+ * Auths may be shared between rpc clients that were cloned from a
+ * common client with the same xprt, if they also share the flavor and
+ * target_name.
+ *
+ * The auth is looked up from the oldest parent sharing the same
+ * cl_xprt, and the auth itself references only that common parent
+ * (which is guaranteed to last as long as any of its descendants).
+ */
+static struct gss_auth *
+gss_auth_find_or_add_hashed(struct rpc_auth_create_args *args,
+ struct rpc_clnt *clnt,
+ struct gss_auth *new)
+{
+ struct gss_auth *gss_auth;
+ unsigned long hashval = (unsigned long)clnt;
+
+ spin_lock(&gss_auth_hash_lock);
+ hash_for_each_possible(gss_auth_hash_table,
+ gss_auth,
+ hash,
+ hashval) {
+ if (gss_auth->client != clnt)
+ continue;
+ if (gss_auth->rpc_auth.au_flavor != args->pseudoflavor)
+ continue;
+ if (gss_auth->target_name != args->target_name) {
+ if (gss_auth->target_name == NULL)
+ continue;
+ if (args->target_name == NULL)
+ continue;
+ if (strcmp(gss_auth->target_name, args->target_name))
+ continue;
+ }
+ if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count))
+ continue;
+ goto out;
+ }
+ if (new)
+ hash_add(gss_auth_hash_table, &new->hash, hashval);
+ gss_auth = new;
+out:
+ spin_unlock(&gss_auth_hash_lock);
+ return gss_auth;
+}
+
+static struct gss_auth *
+gss_create_hashed(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+{
+ struct gss_auth *gss_auth;
+ struct gss_auth *new;
+
+ gss_auth = gss_auth_find_or_add_hashed(args, clnt, NULL);
+ if (gss_auth != NULL)
+ goto out;
+ new = gss_create_new(args, clnt);
+ if (IS_ERR(new))
+ return new;
+ gss_auth = gss_auth_find_or_add_hashed(args, clnt, new);
+ if (gss_auth != new)
+ gss_destroy(&new->rpc_auth);
+out:
+ return gss_auth;
+}
+
+static struct rpc_auth *
+gss_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+{
+ struct gss_auth *gss_auth;
+ struct rpc_xprt *xprt = rcu_access_pointer(clnt->cl_xprt);
+
+ while (clnt != clnt->cl_parent) {
+ struct rpc_clnt *parent = clnt->cl_parent;
+ /* Find the original parent for this transport */
+ if (rcu_access_pointer(parent->cl_xprt) != xprt)
+ break;
+ clnt = parent;
+ }
+
+ gss_auth = gss_create_hashed(args, clnt);
+ if (IS_ERR(gss_auth))
+ return ERR_CAST(gss_auth);
+ return &gss_auth->rpc_auth;
}
/*
@@ -1030,7 +1266,7 @@ gss_destroy_nullcred(struct rpc_cred *cred)
call_rcu(&cred->cr_rcu, gss_free_cred_callback);
if (ctx)
gss_put_ctx(ctx);
- kref_put(&gss_auth->kref, gss_free_callback);
+ gss_put_auth(gss_auth);
}
static void
@@ -1059,7 +1295,8 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
int err = -ENOMEM;
dprintk("RPC: %s for uid %d, flavor %d\n",
- __func__, acred->uid, auth->au_flavor);
+ __func__, from_kuid(&init_user_ns, acred->uid),
+ auth->au_flavor);
if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
goto out_err;
@@ -1095,10 +1332,32 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
return err;
}
+/*
+ * Returns -EACCES if the GSS context is NULL or will expire within the
+ * timeout (in seconds)
+ */
+static int
+gss_key_timeout(struct rpc_cred *rc)
+{
+ struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+ unsigned long now = jiffies;
+ unsigned long expire;
+
+ if (gss_cred->gc_ctx == NULL)
+ return -EACCES;
+
+ expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+
+ if (time_after(now, expire))
+ return -EACCES;
+ return 0;
+}
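
[Editor's note: to make the window arithmetic concrete with illustrative numbers: using the default gss_key_expire_timeo of 240 seconds, expire = gc_expiry - 240*HZ, so once 'now' passes that point the context is inside its final 240 seconds and the function reports -EACCES; before that point the key is considered comfortably valid and 0 is returned.]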
+
static int
gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
{
struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+ int ret;
if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
goto out;
@@ -1111,11 +1370,26 @@ out:
if (acred->principal != NULL) {
if (gss_cred->gc_principal == NULL)
return 0;
- return strcmp(acred->principal, gss_cred->gc_principal) == 0;
+ ret = strcmp(acred->principal, gss_cred->gc_principal) == 0;
+ goto check_expire;
}
if (gss_cred->gc_principal != NULL)
return 0;
- return rc->cr_uid == acred->uid;
+ ret = uid_eq(rc->cr_uid, acred->uid);
+
+check_expire:
+ if (ret == 0)
+ return ret;
+
+ /* Notify acred users of GSS context expiration timeout */
+ if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags) &&
+ (gss_key_timeout(rc) != 0)) {
+ /* test will now be done from generic cred */
+ test_and_clear_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
+ /* tell NFS layer that key will expire soon */
+ set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
+ }
+ return ret;
}
/*
@@ -1154,7 +1428,7 @@ gss_marshal(struct rpc_task *task, __be32 *p)
/* We compute the checksum for the verifier over the xdr-encoded bytes
* starting with the xid and ending at the end of the credential: */
- iov.iov_base = xprt_skip_transport_header(task->tk_xprt,
+ iov.iov_base = xprt_skip_transport_header(req->rq_xprt,
req->rq_snd_buf.head[0].iov_base);
iov.iov_len = (u8 *)p - (u8 *)iov.iov_base;
xdr_buf_from_iov(&iov, &verf_buf);
@@ -1247,7 +1521,7 @@ out:
static int
gss_refresh_null(struct rpc_task *task)
{
- return -EACCES;
+ return 0;
}
static __be32 *
@@ -1261,6 +1535,7 @@ gss_validate(struct rpc_task *task, __be32 *p)
struct xdr_netobj mic;
u32 flav,len;
u32 maj_stat;
+ __be32 *ret = ERR_PTR(-EIO);
dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
@@ -1276,6 +1551,7 @@ gss_validate(struct rpc_task *task, __be32 *p)
mic.data = (u8 *)p;
mic.len = len;
+ ret = ERR_PTR(-EACCES);
maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
@@ -1293,8 +1569,9 @@ gss_validate(struct rpc_task *task, __be32 *p)
return p + XDR_QUADLEN(len);
out_bad:
gss_put_ctx(ctx);
- dprintk("RPC: %5u %s failed.\n", task->tk_pid, __func__);
- return NULL;
+ dprintk("RPC: %5u %s failed ret %ld.\n", task->tk_pid, __func__,
+ PTR_ERR(ret));
+ return ret;
}
static void gss_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
@@ -1626,9 +1903,9 @@ static const struct rpc_authops authgss_ops = {
.destroy = gss_destroy,
.lookup_cred = gss_lookup_cred,
.crcreate = gss_create_cred,
- .pipes_create = gss_pipes_dentries_create,
- .pipes_destroy = gss_pipes_dentries_destroy,
.list_pseudoflavors = gss_mech_list_pseudoflavors,
+ .info2flavor = gss_mech_info2flavor,
+ .flavor2info = gss_mech_flavor2info,
};
static const struct rpc_credops gss_credops = {
@@ -1642,6 +1919,7 @@ static const struct rpc_credops gss_credops = {
.crvalidate = gss_validate,
.crwrap_req = gss_wrap_req,
.crunwrap_resp = gss_unwrap_resp,
+ .crkey_timeout = gss_key_timeout,
};
static const struct rpc_credops gss_nullops = {
@@ -1721,6 +1999,7 @@ static void __exit exit_rpcsec_gss(void)
rcu_barrier(); /* Wait for completion of call_rcu()'s */
}
+MODULE_ALIAS("rpc-auth-6");
MODULE_LICENSE("GPL");
module_param_named(expired_cred_retry_delay,
gss_expired_cred_retry_delay,
@@ -1728,5 +2007,12 @@ module_param_named(expired_cred_retry_delay,
MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until "
"the RPC engine retries an expired credential");
+module_param_named(key_expire_timeo,
+ gss_key_expire_timeo,
+ uint, 0644);
+MODULE_PARM_DESC(key_expire_timeo, "Time (in seconds) at the end of a "
+ "credential key's lifetime during which the NFS layer cleans up "
+ "prior to key expiration");
+
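[Editor's note: both parameters use mode 0644, so, assuming the standard module-parameter sysfs layout, they are tunable at runtime via:

    /sys/module/auth_rpcgss/parameters/key_expire_timeo
    /sys/module/auth_rpcgss/parameters/expired_cred_retry_delay
]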
module_init(init_rpcsec_gss)
module_exit(exit_rpcsec_gss)
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 76e42e6be75..24589bd2a4b 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -59,6 +59,7 @@
#include <linux/crypto.h>
#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/xdr.h>
+#include <linux/lcm.h>
#ifdef RPC_DEBUG
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -72,7 +73,7 @@
static void krb5_nfold(u32 inbits, const u8 *in,
u32 outbits, u8 *out)
{
- int a, b, c, lcm;
+ unsigned long ulcm;
int byte, i, msbit;
/* the code below is more readable if I make these bytes
@@ -82,17 +83,7 @@ static void krb5_nfold(u32 inbits, const u8 *in,
outbits >>= 3;
/* first compute lcm(n,k) */
-
- a = outbits;
- b = inbits;
-
- while (b != 0) {
- c = b;
- b = a%b;
- a = c;
- }
-
- lcm = outbits*inbits/a;
+ ulcm = lcm(inbits, outbits);
/* now do the real work */
@@ -101,7 +92,7 @@ static void krb5_nfold(u32 inbits, const u8 *in,
/* this will end up cycling through k lcm(k,n)/k times, which
is correct */
- for (i = lcm-1; i >= 0; i--) {
+ for (i = ulcm-1; i >= 0; i--) {
/* compute the msbit in k which gets added into this byte */
msbit = (
/* first, start with the msbit in the first,
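[Editor's note: the replacement is behavior-preserving. The removed Euclid loop computed gcd(outbits, inbits) and then outbits*inbits/gcd, which is exactly lcm(). As a worked example, folding a 16-byte (128-bit) input to a 21-byte (168-bit) output gives gcd(16, 21) = 1, so the copy loop above cycles lcm(16, 21) = 336 times.]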
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index d3611f11a8d..0d3c158ef8f 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -679,6 +679,7 @@ out_err:
static int
gss_import_sec_context_kerberos(const void *p, size_t len,
struct gss_ctx *ctx_id,
+ time_t *endtime,
gfp_t gfp_mask)
{
const void *end = (const void *)((const char *)p + len);
@@ -694,9 +695,11 @@ gss_import_sec_context_kerberos(const void *p, size_t len,
else
ret = gss_import_v2_context(p, end, ctx, gfp_mask);
- if (ret == 0)
+ if (ret == 0) {
ctx_id->internal_ctx_id = ctx;
- else
+ if (endtime)
+ *endtime = ctx->endtime;
+ } else
kfree(ctx);
dprintk("RPC: %s: returning %d\n", __func__, ret);
@@ -729,16 +732,19 @@ static const struct gss_api_ops gss_kerberos_ops = {
static struct pf_desc gss_kerberos_pfs[] = {
[0] = {
.pseudoflavor = RPC_AUTH_GSS_KRB5,
+ .qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_NONE,
.name = "krb5",
},
[1] = {
.pseudoflavor = RPC_AUTH_GSS_KRB5I,
+ .qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_INTEGRITY,
.name = "krb5i",
},
[2] = {
.pseudoflavor = RPC_AUTH_GSS_KRB5P,
+ .qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_PRIVACY,
.name = "krb5p",
},
@@ -750,11 +756,12 @@ MODULE_ALIAS("rpc-auth-gss-krb5p");
MODULE_ALIAS("rpc-auth-gss-390003");
MODULE_ALIAS("rpc-auth-gss-390004");
MODULE_ALIAS("rpc-auth-gss-390005");
+MODULE_ALIAS("rpc-auth-gss-1.2.840.113554.1.2.2");
static struct gss_api_mech gss_kerberos_mech = {
.gm_name = "krb5",
.gm_owner = THIS_MODULE,
- .gm_oid = {9, (void *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02"},
+ .gm_oid = { 9, "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02" },
.gm_ops = &gss_kerberos_ops,
.gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs),
.gm_pfs = gss_kerberos_pfs,
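[Editor's note: the new dotted-decimal MODULE_ALIAS pairs with a request_module() call added in gss_mech_switch.c further down in this diff: sprint_oid() renders the DER-encoded mechanism OID \x2a\x86\x48\x86\xf7\x12\x01\x02\x02 as "1.2.840.113554.1.2.2", so request_module("rpc-auth-gss-1.2.840.113554.1.2.2") resolves through this alias and loads rpcsec_gss_krb5 on demand when a mechanism is looked up by OID.]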
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index 6cd930f3678..6c981ddc19f 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -150,7 +150,6 @@ gss_verify_mic_v2(struct krb5_ctx *ctx,
struct xdr_netobj cksumobj = {.len = sizeof(cksumdata),
.data = cksumdata};
s32 now;
- u64 seqnum;
u8 *ptr = read_token->data;
u8 *cksumkey;
u8 flags;
@@ -197,9 +196,10 @@ gss_verify_mic_v2(struct krb5_ctx *ctx,
if (now > ctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
- /* do sequencing checks */
-
- seqnum = be64_to_cpup((__be64 *)ptr + 8);
+ /*
+ * NOTE: the sequence number at ptr + 8 is skipped, rpcsec_gss
+ * doesn't want it checked; see page 6 of rfc 2203.
+ */
return GSS_S_COMPLETE;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 107c4528654..42560e55d97 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -130,8 +130,8 @@ gss_krb5_make_confounder(char *p, u32 conflen)
/* initialize to random value */
if (i == 0) {
- i = random32();
- i = (i << 32) | random32();
+ i = prandom_u32();
+ i = (i << 32) | prandom_u32();
}
switch (conflen) {
@@ -489,7 +489,6 @@ static u32
gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
{
s32 now;
- u64 seqnum;
u8 *ptr;
u8 flags = 0x00;
u16 ec, rrc;
@@ -525,7 +524,10 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
ec = be16_to_cpup((__be16 *)(ptr + 4));
rrc = be16_to_cpup((__be16 *)(ptr + 6));
- seqnum = be64_to_cpup((__be64 *)(ptr + 8));
+ /*
+ * NOTE: the sequence number at ptr + 8 is skipped, rpcsec_gss
+ * doesn't want it checked; see page 6 of rfc 2203.
+ */
if (rrc != 0)
rotate_left(offset + 16, buf, rrc);
@@ -574,6 +576,8 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip;
buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip;
+ /* Trim off the trailing "extra count" and checksum blob */
+ xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip);
return GSS_S_COMPLETE;
}
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index b174fcd9ff4..92d5ab99fbf 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -36,6 +36,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <linux/oid_registry.h>
#include <linux/sunrpc/msg_prot.h>
#include <linux/sunrpc/gss_asn1.h>
#include <linux/sunrpc/auth_gss.h>
@@ -102,8 +103,13 @@ out:
return status;
}
-int
-gss_mech_register(struct gss_api_mech *gm)
+/**
+ * gss_mech_register - register a GSS mechanism
+ * @gm: GSS mechanism handle
+ *
+ * Returns zero if successful, or a negative errno.
+ */
+int gss_mech_register(struct gss_api_mech *gm)
{
int status;
@@ -116,11 +122,14 @@ gss_mech_register(struct gss_api_mech *gm)
dprintk("RPC: registered gss mechanism %s\n", gm->gm_name);
return 0;
}
-
EXPORT_SYMBOL_GPL(gss_mech_register);
-void
-gss_mech_unregister(struct gss_api_mech *gm)
+/**
+ * gss_mech_unregister - release a GSS mechanism
+ * @gm: GSS mechanism handle
+ *
+ */
+void gss_mech_unregister(struct gss_api_mech *gm)
{
spin_lock(&registered_mechs_lock);
list_del(&gm->gm_list);
@@ -128,19 +137,16 @@ gss_mech_unregister(struct gss_api_mech *gm)
dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name);
gss_mech_free(gm);
}
-
EXPORT_SYMBOL_GPL(gss_mech_unregister);
-struct gss_api_mech *
-gss_mech_get(struct gss_api_mech *gm)
+struct gss_api_mech *gss_mech_get(struct gss_api_mech *gm)
{
__module_get(gm->gm_owner);
return gm;
}
+EXPORT_SYMBOL(gss_mech_get);
-EXPORT_SYMBOL_GPL(gss_mech_get);
-
-struct gss_api_mech *
+static struct gss_api_mech *
_gss_mech_get_by_name(const char *name)
{
struct gss_api_mech *pos, *gm = NULL;
@@ -169,12 +175,16 @@ struct gss_api_mech * gss_mech_get_by_name(const char *name)
}
return gm;
}
-EXPORT_SYMBOL_GPL(gss_mech_get_by_name);
-struct gss_api_mech *
-gss_mech_get_by_OID(struct xdr_netobj *obj)
+struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj)
{
struct gss_api_mech *pos, *gm = NULL;
+ char buf[32];
+
+ if (sprint_oid(obj->data, obj->len, buf, sizeof(buf)) < 0)
+ return NULL;
+ dprintk("RPC: %s(%s)\n", __func__, buf);
+ request_module("rpc-auth-gss-%s", buf);
spin_lock(&registered_mechs_lock);
list_for_each_entry(pos, &registered_mechs, gm_list) {
@@ -188,11 +198,8 @@ gss_mech_get_by_OID(struct xdr_netobj *obj)
}
spin_unlock(&registered_mechs_lock);
return gm;
-
}
-EXPORT_SYMBOL_GPL(gss_mech_get_by_OID);
-
static inline int
mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor)
{
@@ -205,16 +212,14 @@ mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor)
return 0;
}
-struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
+static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
{
struct gss_api_mech *gm = NULL, *pos;
spin_lock(&registered_mechs_lock);
list_for_each_entry(pos, &registered_mechs, gm_list) {
- if (!mech_supports_pseudoflavor(pos, pseudoflavor)) {
- module_put(pos->gm_owner);
+ if (!mech_supports_pseudoflavor(pos, pseudoflavor))
continue;
- }
if (try_module_get(pos->gm_owner))
gm = pos;
break;
@@ -237,8 +242,6 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
return gm;
}
-EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor);
-
/**
* gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors
* @array: array to fill in
@@ -268,19 +271,82 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
return i;
}
-u32
-gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service)
+/**
+ * gss_svc_to_pseudoflavor - map a GSS service number to a pseudoflavor
+ * @gm: GSS mechanism handle
+ * @qop: GSS quality-of-protection value
+ * @service: GSS service value
+ *
+ * Returns a matching security flavor, or RPC_AUTH_MAXFLAVOR if none is found.
+ */
+rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 qop,
+ u32 service)
{
int i;
for (i = 0; i < gm->gm_pf_num; i++) {
- if (gm->gm_pfs[i].service == service) {
+ if (gm->gm_pfs[i].qop == qop &&
+ gm->gm_pfs[i].service == service) {
return gm->gm_pfs[i].pseudoflavor;
}
}
- return RPC_AUTH_MAXFLAVOR; /* illegal value */
+ return RPC_AUTH_MAXFLAVOR;
+}
+
+/**
+ * gss_mech_info2flavor - look up a pseudoflavor given a GSS tuple
+ * @info: a GSS mech OID, quality of protection, and service value
+ *
+ * Returns a matching pseudoflavor, or RPC_AUTH_MAXFLAVOR if the tuple is
+ * not supported.
+ */
+rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *info)
+{
+ rpc_authflavor_t pseudoflavor;
+ struct gss_api_mech *gm;
+
+ gm = gss_mech_get_by_OID(&info->oid);
+ if (gm == NULL)
+ return RPC_AUTH_MAXFLAVOR;
+
+ pseudoflavor = gss_svc_to_pseudoflavor(gm, info->qop, info->service);
+
+ gss_mech_put(gm);
+ return pseudoflavor;
+}
+
+/**
+ * gss_mech_flavor2info - look up a GSS tuple for a given pseudoflavor
+ * @pseudoflavor: GSS pseudoflavor to match
+ * @info: rpcsec_gss_info structure to fill in
+ *
+ * Returns zero and fills in "info" if pseudoflavor matches a
+ * supported mechanism. Otherwise a negative errno is returned.
+ */
+int gss_mech_flavor2info(rpc_authflavor_t pseudoflavor,
+ struct rpcsec_gss_info *info)
+{
+ struct gss_api_mech *gm;
+ int i;
+
+ gm = gss_mech_get_by_pseudoflavor(pseudoflavor);
+ if (gm == NULL)
+ return -ENOENT;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) {
+ memcpy(info->oid.data, gm->gm_oid.data, gm->gm_oid.len);
+ info->oid.len = gm->gm_oid.len;
+ info->qop = gm->gm_pfs[i].qop;
+ info->service = gm->gm_pfs[i].service;
+ gss_mech_put(gm);
+ return 0;
+ }
+ }
+
+ gss_mech_put(gm);
+ return -ENOENT;
}
-EXPORT_SYMBOL_GPL(gss_svc_to_pseudoflavor);
u32
gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
@@ -293,8 +359,7 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
}
return 0;
}
-
-EXPORT_SYMBOL_GPL(gss_pseudoflavor_to_service);
+EXPORT_SYMBOL(gss_pseudoflavor_to_service);
char *
gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
@@ -308,16 +373,13 @@ gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
return NULL;
}
-EXPORT_SYMBOL_GPL(gss_service_to_auth_domain_name);
-
void
gss_mech_put(struct gss_api_mech * gm)
{
if (gm)
module_put(gm->gm_owner);
}
-
-EXPORT_SYMBOL_GPL(gss_mech_put);
+EXPORT_SYMBOL(gss_mech_put);
/* The mech could probably be determined from the token instead, but it's just
* as easy for now to pass it in. */
@@ -325,14 +387,15 @@ int
gss_import_sec_context(const void *input_token, size_t bufsize,
struct gss_api_mech *mech,
struct gss_ctx **ctx_id,
+ time_t *endtime,
gfp_t gfp_mask)
{
if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask)))
return -ENOMEM;
(*ctx_id)->mech_type = gss_mech_get(mech);
- return mech->gm_ops
- ->gss_import_sec_context(input_token, bufsize, *ctx_id, gfp_mask);
+ return mech->gm_ops->gss_import_sec_context(input_token, bufsize,
+ *ctx_id, endtime, gfp_mask);
}
/* gss_get_mic: compute a mic over message and return mic_token. */
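
The two lookup helpers added above are inverses of each other. A minimal sketch of the round trip, for illustration only (the pseudoflavor constant and the error handling are ours, not part of the patch):

	static rpc_authflavor_t krb5i_roundtrip_example(void)
	{
		struct rpcsec_gss_info info = { };

		/* flavor -> (OID, QOP, service) tuple */
		if (gss_mech_flavor2info(RPC_AUTH_GSS_KRB5I, &info) != 0)
			return RPC_AUTH_MAXFLAVOR;	/* mechanism not registered */

		/* tuple -> flavor; recovers RPC_AUTH_GSS_KRB5I */
		return gss_mech_info2flavor(&info);
	}
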
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
new file mode 100644
index 00000000000..abbb7dcd168
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -0,0 +1,382 @@
+/*
+ * linux/net/sunrpc/gss_rpc_upcall.c
+ *
+ * Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/un.h>
+
+#include <linux/sunrpc/svcauth.h>
+#include "gss_rpc_upcall.h"
+
+#define GSSPROXY_SOCK_PATHNAME "/var/run/gssproxy.sock"
+
+#define GSSPROXY_PROGRAM (400112u)
+#define GSSPROXY_VERS_1 (1u)
+
+/*
+ * Encoding/Decoding functions
+ */
+
+enum {
+ GSSX_NULL = 0, /* Unused */
+ GSSX_INDICATE_MECHS = 1,
+ GSSX_GET_CALL_CONTEXT = 2,
+ GSSX_IMPORT_AND_CANON_NAME = 3,
+ GSSX_EXPORT_CRED = 4,
+ GSSX_IMPORT_CRED = 5,
+ GSSX_ACQUIRE_CRED = 6,
+ GSSX_STORE_CRED = 7,
+ GSSX_INIT_SEC_CONTEXT = 8,
+ GSSX_ACCEPT_SEC_CONTEXT = 9,
+ GSSX_RELEASE_HANDLE = 10,
+ GSSX_GET_MIC = 11,
+ GSSX_VERIFY = 12,
+ GSSX_WRAP = 13,
+ GSSX_UNWRAP = 14,
+ GSSX_WRAP_SIZE_LIMIT = 15,
+};
+
+#define PROC(proc, name) \
+[GSSX_##proc] = { \
+ .p_proc = GSSX_##proc, \
+ .p_encode = (kxdreproc_t)gssx_enc_##name, \
+ .p_decode = (kxdrdproc_t)gssx_dec_##name, \
+ .p_arglen = GSSX_ARG_##name##_sz, \
+ .p_replen = GSSX_RES_##name##_sz, \
+ .p_statidx = GSSX_##proc, \
+ .p_name = #proc, \
+}
+
+static struct rpc_procinfo gssp_procedures[] = {
+ PROC(INDICATE_MECHS, indicate_mechs),
+ PROC(GET_CALL_CONTEXT, get_call_context),
+ PROC(IMPORT_AND_CANON_NAME, import_and_canon_name),
+ PROC(EXPORT_CRED, export_cred),
+ PROC(IMPORT_CRED, import_cred),
+ PROC(ACQUIRE_CRED, acquire_cred),
+ PROC(STORE_CRED, store_cred),
+ PROC(INIT_SEC_CONTEXT, init_sec_context),
+ PROC(ACCEPT_SEC_CONTEXT, accept_sec_context),
+ PROC(RELEASE_HANDLE, release_handle),
+ PROC(GET_MIC, get_mic),
+ PROC(VERIFY, verify),
+ PROC(WRAP, wrap),
+ PROC(UNWRAP, unwrap),
+ PROC(WRAP_SIZE_LIMIT, wrap_size_limit),
+};
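
For reference, one entry of this table after macro expansion (hand-expanded; ACCEPT_SEC_CONTEXT is the only procedure with real encode/decode functions, the rest resolve to NULL and zero sizes via the defines in gss_rpc_xdr.h):

	[GSSX_ACCEPT_SEC_CONTEXT] = {
		.p_proc    = GSSX_ACCEPT_SEC_CONTEXT,
		.p_encode  = (kxdreproc_t)gssx_enc_accept_sec_context,
		.p_decode  = (kxdrdproc_t)gssx_dec_accept_sec_context,
		.p_arglen  = GSSX_ARG_accept_sec_context_sz,
		.p_replen  = GSSX_RES_accept_sec_context_sz,
		.p_statidx = GSSX_ACCEPT_SEC_CONTEXT,
		.p_name    = "ACCEPT_SEC_CONTEXT",
	},
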
+
+
+
+/*
+ * Common transport functions
+ */
+
+static const struct rpc_program gssp_program;
+
+static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt)
+{
+ static const struct sockaddr_un gssp_localaddr = {
+ .sun_family = AF_LOCAL,
+ .sun_path = GSSPROXY_SOCK_PATHNAME,
+ };
+ struct rpc_create_args args = {
+ .net = net,
+ .protocol = XPRT_TRANSPORT_LOCAL,
+ .address = (struct sockaddr *)&gssp_localaddr,
+ .addrsize = sizeof(gssp_localaddr),
+ .servername = "localhost",
+ .program = &gssp_program,
+ .version = GSSPROXY_VERS_1,
+ .authflavor = RPC_AUTH_NULL,
+ /*
+ * Note we want connection to be done in the caller's
+ * filesystem namespace. We therefore turn off the idle
+ * timeout, which would result in reconnections being
+ * done without the correct namespace:
+ */
+ .flags = RPC_CLNT_CREATE_NOPING |
+ RPC_CLNT_CREATE_NO_IDLE_TIMEOUT
+ };
+ struct rpc_clnt *clnt;
+ int result = 0;
+
+ clnt = rpc_create(&args);
+ if (IS_ERR(clnt)) {
+ dprintk("RPC: failed to create AF_LOCAL gssproxy "
+ "client (errno %ld).\n", PTR_ERR(clnt));
+ result = PTR_ERR(clnt);
+ *_clnt = NULL;
+ goto out;
+ }
+
+ dprintk("RPC: created new gssp local client (gssp_local_clnt: "
+ "%p)\n", clnt);
+ *_clnt = clnt;
+
+out:
+ return result;
+}
+
+void init_gssp_clnt(struct sunrpc_net *sn)
+{
+ mutex_init(&sn->gssp_lock);
+ sn->gssp_clnt = NULL;
+}
+
+int set_gssp_clnt(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_clnt *clnt;
+ int ret;
+
+ mutex_lock(&sn->gssp_lock);
+ ret = gssp_rpc_create(net, &clnt);
+ if (!ret) {
+ if (sn->gssp_clnt)
+ rpc_shutdown_client(sn->gssp_clnt);
+ sn->gssp_clnt = clnt;
+ }
+ mutex_unlock(&sn->gssp_lock);
+ return ret;
+}
+
+void clear_gssp_clnt(struct sunrpc_net *sn)
+{
+ mutex_lock(&sn->gssp_lock);
+ if (sn->gssp_clnt) {
+ rpc_shutdown_client(sn->gssp_clnt);
+ sn->gssp_clnt = NULL;
+ }
+ mutex_unlock(&sn->gssp_lock);
+}
+
+static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn)
+{
+ struct rpc_clnt *clnt;
+
+ mutex_lock(&sn->gssp_lock);
+ clnt = sn->gssp_clnt;
+ if (clnt)
+ atomic_inc(&clnt->cl_count);
+ mutex_unlock(&sn->gssp_lock);
+ return clnt;
+}
+
+static int gssp_call(struct net *net, struct rpc_message *msg)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_clnt *clnt;
+ int status;
+
+ clnt = get_gssp_clnt(sn);
+ if (!clnt)
+ return -EIO;
+ status = rpc_call_sync(clnt, msg, 0);
+ if (status < 0) {
+ dprintk("gssp: rpc_call returned error %d\n", -status);
+ switch (status) {
+ case -EPROTONOSUPPORT:
+ status = -EINVAL;
+ break;
+ case -ECONNREFUSED:
+ case -ETIMEDOUT:
+ case -ENOTCONN:
+ status = -EAGAIN;
+ break;
+ case -ERESTARTSYS:
+			if (signalled())
+ status = -EINTR;
+ break;
+ default:
+ break;
+ }
+ }
+ rpc_release_client(clnt);
+ return status;
+}
+
+static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
+{
+ int i;
+
+ for (i = 0; i < arg->npages && arg->pages[i]; i++)
+ __free_page(arg->pages[i]);
+}
+
+static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
+{
+ arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE);
+ arg->pages = kzalloc(arg->npages * sizeof(struct page *), GFP_KERNEL);
+ /*
+ * XXX: actual pages are allocated by xdr layer in
+ * xdr_partial_copy_from_skb.
+ */
+ if (!arg->pages)
+ return -ENOMEM;
+ return 0;
+}
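
Worked numbers, assuming 4 KiB pages: NGROUPS_MAX is 65536, so this reserves DIV_ROUND_UP(65536 * 4, 4096) = 64 page pointers. Only the pointer array is allocated here; as the XXX note says, the pages themselves come from the xdr layer during receive.
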
+
+/*
+ * Public functions
+ */
+
+/* numbers somewhat arbitrary but large enough for current needs */
+#define GSSX_MAX_OUT_HANDLE 128
+#define GSSX_MAX_SRC_PRINC 256
+#define GSSX_KMEMBUF (GSSX_max_output_handle_sz + \
+ GSSX_max_oid_sz + \
+ GSSX_max_princ_sz + \
+ sizeof(struct svc_cred))
+
+int gssp_accept_sec_context_upcall(struct net *net,
+ struct gssp_upcall_data *data)
+{
+ struct gssx_ctx ctxh = {
+ .state = data->in_handle
+ };
+ struct gssx_arg_accept_sec_context arg = {
+ .input_token = data->in_token,
+ };
+ struct gssx_ctx rctxh = {
+ /*
+ * pass in the max length we expect for each of these
+ * buffers but let the xdr code kmalloc them:
+ */
+ .exported_context_token.len = GSSX_max_output_handle_sz,
+ .mech.len = GSS_OID_MAX_LEN,
+ .src_name.display_name.len = GSSX_max_princ_sz
+ };
+ struct gssx_res_accept_sec_context res = {
+ .context_handle = &rctxh,
+ .output_token = &data->out_token
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &gssp_procedures[GSSX_ACCEPT_SEC_CONTEXT],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ .rpc_cred = NULL, /* FIXME ? */
+ };
+	struct xdr_netobj client_name = { 0, NULL };
+ int ret;
+
+ if (data->in_handle.len != 0)
+ arg.context_handle = &ctxh;
+ res.output_token->len = GSSX_max_output_token_sz;
+
+ ret = gssp_alloc_receive_pages(&arg);
+ if (ret)
+ return ret;
+
+ /* use nfs/ for targ_name ? */
+
+ ret = gssp_call(net, &msg);
+
+ gssp_free_receive_pages(&arg);
+
+	/* we need to fetch all data even in case of error so
+	 * that we can free special structures if they have been allocated */
+ data->major_status = res.status.major_status;
+ data->minor_status = res.status.minor_status;
+ if (res.context_handle) {
+ data->out_handle = rctxh.exported_context_token;
+ data->mech_oid.len = rctxh.mech.len;
+ if (rctxh.mech.data)
+ memcpy(data->mech_oid.data, rctxh.mech.data,
+ data->mech_oid.len);
+ client_name = rctxh.src_name.display_name;
+ }
+
+ if (res.options.count == 1) {
+ gssx_buffer *value = &res.options.data[0].value;
+		/* Currently we only decode CREDS_VALUE; if we add
+		 * anything else we'll have to loop and match on the
+		 * option name */
+ if (value->len == 1) {
+ /* steal group info from struct svc_cred */
+ data->creds = *(struct svc_cred *)value->data;
+ data->found_creds = 1;
+ }
+ /* whether we use it or not, free data */
+ kfree(value->data);
+ }
+
+	if (res.options.count != 0)
+		kfree(res.options.data);
+
+ /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */
+ if (data->found_creds && client_name.data != NULL) {
+ char *c;
+
+ data->creds.cr_principal = kstrndup(client_name.data,
+ client_name.len, GFP_KERNEL);
+ if (data->creds.cr_principal) {
+ /* terminate and remove realm part */
+ c = strchr(data->creds.cr_principal, '@');
+ if (c) {
+ *c = '\0';
+
+ /* change service-hostname delimiter */
+ c = strchr(data->creds.cr_principal, '/');
+				if (c)
+					*c = '@';
+ }
+ if (!c) {
+ /* not a service principal */
+ kfree(data->creds.cr_principal);
+ data->creds.cr_principal = NULL;
+ }
+ }
+ }
+ kfree(client_name.data);
+
+ return ret;
+}
+
+void gssp_free_upcall_data(struct gssp_upcall_data *data)
+{
+ kfree(data->in_handle.data);
+ kfree(data->out_handle.data);
+ kfree(data->out_token.data);
+ free_svc_cred(&data->creds);
+}
+
+/*
+ * Initialization stuff
+ */
+
+static const struct rpc_version gssp_version1 = {
+ .number = GSSPROXY_VERS_1,
+ .nrprocs = ARRAY_SIZE(gssp_procedures),
+ .procs = gssp_procedures,
+};
+
+static const struct rpc_version *gssp_version[] = {
+ NULL,
+ &gssp_version1,
+};
+
+static struct rpc_stat gssp_stats;
+
+static const struct rpc_program gssp_program = {
+ .name = "gssproxy",
+ .number = GSSPROXY_PROGRAM,
+ .nrvers = ARRAY_SIZE(gssp_version),
+ .version = gssp_version,
+ .stats = &gssp_stats,
+};
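
Taken together, these functions give each network namespace the following client lifecycle (a sketch with error handling elided, not part of the patch):

	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
	int err;

	init_gssp_clnt(sn);		/* namespace init: no client yet */
	err = set_gssp_clnt(net);	/* proc-file write: connect to
					 * /var/run/gssproxy.sock */
	/* ... gssp_call() upcalls run while handling RPCSEC_GSS init ... */
	clear_gssp_clnt(sn);		/* namespace shutdown */

Since the AF_LOCAL connect happens inside set_gssp_clnt(), the socket path is resolved in the filesystem namespace of the writing process, which is why the client is created with the idle timeout disabled.
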
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.h b/net/sunrpc/auth_gss/gss_rpc_upcall.h
new file mode 100644
index 00000000000..1e542aded90
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.h
@@ -0,0 +1,48 @@
+/*
+ * linux/net/sunrpc/gss_rpc_upcall.h
+ *
+ * Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _GSS_RPC_UPCALL_H
+#define _GSS_RPC_UPCALL_H
+
+#include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/auth_gss.h>
+#include "gss_rpc_xdr.h"
+#include "../netns.h"
+
+struct gssp_upcall_data {
+ struct xdr_netobj in_handle;
+ struct gssp_in_token in_token;
+ struct xdr_netobj out_handle;
+ struct xdr_netobj out_token;
+ struct rpcsec_gss_oid mech_oid;
+ struct svc_cred creds;
+ int found_creds;
+ int major_status;
+ int minor_status;
+};
+
+int gssp_accept_sec_context_upcall(struct net *net,
+ struct gssp_upcall_data *data);
+void gssp_free_upcall_data(struct gssp_upcall_data *data);
+
+void init_gssp_clnt(struct sunrpc_net *);
+int set_gssp_clnt(struct net *);
+void clear_gssp_clnt(struct sunrpc_net *);
+#endif /* _GSS_RPC_UPCALL_H */
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
new file mode 100644
index 00000000000..1ec19f6f0c2
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -0,0 +1,839 @@
+/*
+ * GSS Proxy upcall module
+ *
+ * Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/sunrpc/svcauth.h>
+#include "gss_rpc_xdr.h"
+
+static int gssx_enc_bool(struct xdr_stream *xdr, int v)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ *p = v ? xdr_one : xdr_zero;
+ return 0;
+}
+
+static int gssx_dec_bool(struct xdr_stream *xdr, u32 *v)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ *v = be32_to_cpu(*p);
+ return 0;
+}
+
+static int gssx_enc_buffer(struct xdr_stream *xdr,
+ gssx_buffer *buf)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(u32) + buf->len);
+ if (!p)
+ return -ENOSPC;
+ xdr_encode_opaque(p, buf->data, buf->len);
+ return 0;
+}
+
+static int gssx_enc_in_token(struct xdr_stream *xdr,
+ struct gssp_in_token *in)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return -ENOSPC;
+ *p = cpu_to_be32(in->page_len);
+
+	/* all we need to do is write the pages */
+ xdr_write_pages(xdr, in->pages, in->page_base, in->page_len);
+
+ return 0;
+}
+
+
+static int gssx_dec_buffer(struct xdr_stream *xdr,
+ gssx_buffer *buf)
+{
+ u32 length;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ length = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, length);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ if (buf->len == 0) {
+ /* we intentionally are not interested in this buffer */
+ return 0;
+ }
+ if (length > buf->len)
+ return -ENOSPC;
+
+ if (!buf->data) {
+ buf->data = kmemdup(p, length, GFP_KERNEL);
+ if (!buf->data)
+ return -ENOMEM;
+ } else {
+ memcpy(buf->data, p, length);
+ }
+ buf->len = length;
+ return 0;
+}
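
Note the three modes keyed off the caller-initialized buffer: len == 0 skips the value, data == NULL makes the decoder allocate, and otherwise the value is copied in place (with -ENOSPC if it exceeds len). The accept_sec_context path relies on the allocating mode, roughly (illustrative):

	gssx_buffer mech = {
		.len  = GSS_OID_MAX_LEN,	/* upper bound only */
		.data = NULL,			/* decoder kmemdup()s the value */
	};

	err = gssx_dec_buffer(xdr, &mech);
	/* on success, mech.len is the actual length and mech.data is a
	 * GFP_KERNEL allocation the caller must kfree() */
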
+
+static int gssx_enc_option(struct xdr_stream *xdr,
+ struct gssx_option *opt)
+{
+ int err;
+
+ err = gssx_enc_buffer(xdr, &opt->option);
+ if (err)
+ return err;
+ err = gssx_enc_buffer(xdr, &opt->value);
+ return err;
+}
+
+static int gssx_dec_option(struct xdr_stream *xdr,
+ struct gssx_option *opt)
+{
+ int err;
+
+ err = gssx_dec_buffer(xdr, &opt->option);
+ if (err)
+ return err;
+ err = gssx_dec_buffer(xdr, &opt->value);
+ return err;
+}
+
+static int dummy_enc_opt_array(struct xdr_stream *xdr,
+ struct gssx_option_array *oa)
+{
+ __be32 *p;
+
+ if (oa->count != 0)
+ return -EINVAL;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return -ENOSPC;
+ *p = 0;
+
+ return 0;
+}
+
+static int dummy_dec_opt_array(struct xdr_stream *xdr,
+ struct gssx_option_array *oa)
+{
+ struct gssx_option dummy;
+ u32 count, i;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ count = be32_to_cpup(p++);
+ memset(&dummy, 0, sizeof(dummy));
+ for (i = 0; i < count; i++) {
+ gssx_dec_option(xdr, &dummy);
+ }
+
+ oa->count = 0;
+ oa->data = NULL;
+ return 0;
+}
+
+static int get_host_u32(struct xdr_stream *xdr, u32 *res)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EINVAL;
+ /* Contents of linux creds are all host-endian: */
+ memcpy(res, p, sizeof(u32));
+ return 0;
+}
+
+static int gssx_dec_linux_creds(struct xdr_stream *xdr,
+ struct svc_cred *creds)
+{
+ u32 length;
+ __be32 *p;
+ u32 tmp;
+ u32 N;
+ int i, err;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ length = be32_to_cpup(p);
+
+ if (length > (3 + NGROUPS_MAX) * sizeof(u32))
+ return -ENOSPC;
+
+ /* uid */
+ err = get_host_u32(xdr, &tmp);
+ if (err)
+ return err;
+ creds->cr_uid = make_kuid(&init_user_ns, tmp);
+
+ /* gid */
+ err = get_host_u32(xdr, &tmp);
+ if (err)
+ return err;
+ creds->cr_gid = make_kgid(&init_user_ns, tmp);
+
+ /* number of additional gid's */
+ err = get_host_u32(xdr, &tmp);
+ if (err)
+ return err;
+ N = tmp;
+ if ((3 + N) * sizeof(u32) != length)
+ return -EINVAL;
+ creds->cr_group_info = groups_alloc(N);
+ if (creds->cr_group_info == NULL)
+ return -ENOMEM;
+
+ /* gid's */
+ for (i = 0; i < N; i++) {
+ kgid_t kgid;
+ err = get_host_u32(xdr, &tmp);
+ if (err)
+ goto out_free_groups;
+ err = -EINVAL;
+ kgid = make_kgid(&init_user_ns, tmp);
+ if (!gid_valid(kgid))
+ goto out_free_groups;
+ GROUP_AT(creds->cr_group_info, i) = kgid;
+ }
+
+ return 0;
+out_free_groups:
+ groups_free(creds->cr_group_info);
+ return err;
+}
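
For reference, the linux_creds_v1 layout this decoder expects, reconstructed from the code above (not a published spec):

	u32 length;	/* XDR big-endian; must equal (3 + N) * 4 */
	u32 uid;	/* host-endian */
	u32 gid;	/* host-endian */
	u32 N;		/* host-endian; number of supplementary gids */
	u32 gids[N];	/* host-endian; each must map to a valid kgid */
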
+
+static int gssx_dec_option_array(struct xdr_stream *xdr,
+ struct gssx_option_array *oa)
+{
+ struct svc_cred *creds;
+ u32 count, i;
+ __be32 *p;
+ int err;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ count = be32_to_cpup(p++);
+ if (!count)
+ return 0;
+
+ /* we recognize only 1 currently: CREDS_VALUE */
+ oa->count = 1;
+
+ oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL);
+ if (!oa->data)
+ return -ENOMEM;
+
+ creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
+ if (!creds) {
+ kfree(oa->data);
+ return -ENOMEM;
+ }
+
+ oa->data[0].option.data = CREDS_VALUE;
+ oa->data[0].option.len = sizeof(CREDS_VALUE);
+ oa->data[0].value.data = (void *)creds;
+ oa->data[0].value.len = 0;
+
+ for (i = 0; i < count; i++) {
+ gssx_buffer dummy = { 0, NULL };
+ u32 length;
+
+ /* option buffer */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ length = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, length);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ if (length == sizeof(CREDS_VALUE) &&
+ memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) {
+			/* We have creds here; parse them */
+ err = gssx_dec_linux_creds(xdr, creds);
+ if (err)
+ return err;
+ oa->data[0].value.len = 1; /* presence */
+ } else {
+ /* consume uninteresting buffer */
+ err = gssx_dec_buffer(xdr, &dummy);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int gssx_dec_status(struct xdr_stream *xdr,
+ struct gssx_status *status)
+{
+ __be32 *p;
+ int err;
+
+ /* status->major_status */
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ p = xdr_decode_hyper(p, &status->major_status);
+
+ /* status->mech */
+ err = gssx_dec_buffer(xdr, &status->mech);
+ if (err)
+ return err;
+
+ /* status->minor_status */
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ p = xdr_decode_hyper(p, &status->minor_status);
+
+ /* status->major_status_string */
+ err = gssx_dec_buffer(xdr, &status->major_status_string);
+ if (err)
+ return err;
+
+ /* status->minor_status_string */
+ err = gssx_dec_buffer(xdr, &status->minor_status_string);
+ if (err)
+ return err;
+
+ /* status->server_ctx */
+ err = gssx_dec_buffer(xdr, &status->server_ctx);
+ if (err)
+ return err;
+
+ /* we assume we have no options for now, so simply consume them */
+ /* status->options */
+ err = dummy_dec_opt_array(xdr, &status->options);
+
+ return err;
+}
+
+static int gssx_enc_call_ctx(struct xdr_stream *xdr,
+ struct gssx_call_ctx *ctx)
+{
+ struct gssx_option opt;
+ __be32 *p;
+ int err;
+
+ /* ctx->locale */
+ err = gssx_enc_buffer(xdr, &ctx->locale);
+ if (err)
+ return err;
+
+ /* ctx->server_ctx */
+ err = gssx_enc_buffer(xdr, &ctx->server_ctx);
+ if (err)
+ return err;
+
+ /* we always want to ask for lucid contexts */
+ /* ctx->options */
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return -ENOSPC;
+	*p = cpu_to_be32(2);
+
+ /* we want a lucid_v1 context */
+ opt.option.data = LUCID_OPTION;
+ opt.option.len = sizeof(LUCID_OPTION);
+ opt.value.data = LUCID_VALUE;
+ opt.value.len = sizeof(LUCID_VALUE);
+	err = gssx_enc_option(xdr, &opt);
+	if (err)
+		return err;
+
+	/* ...and user creds */
+ opt.option.data = CREDS_OPTION;
+ opt.option.len = sizeof(CREDS_OPTION);
+ opt.value.data = CREDS_VALUE;
+ opt.value.len = sizeof(CREDS_VALUE);
+ err = gssx_enc_option(xdr, &opt);
+
+ return err;
+}
+
+static int gssx_dec_name_attr(struct xdr_stream *xdr,
+ struct gssx_name_attr *attr)
+{
+ int err;
+
+ /* attr->attr */
+ err = gssx_dec_buffer(xdr, &attr->attr);
+ if (err)
+ return err;
+
+ /* attr->value */
+ err = gssx_dec_buffer(xdr, &attr->value);
+ if (err)
+ return err;
+
+ /* attr->extensions */
+ err = dummy_dec_opt_array(xdr, &attr->extensions);
+
+ return err;
+}
+
+static int dummy_enc_nameattr_array(struct xdr_stream *xdr,
+ struct gssx_name_attr_array *naa)
+{
+ __be32 *p;
+
+ if (naa->count != 0)
+ return -EINVAL;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return -ENOSPC;
+ *p = 0;
+
+ return 0;
+}
+
+static int dummy_dec_nameattr_array(struct xdr_stream *xdr,
+ struct gssx_name_attr_array *naa)
+{
+ struct gssx_name_attr dummy = { .attr = {.len = 0} };
+ u32 count, i;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ count = be32_to_cpup(p++);
+ for (i = 0; i < count; i++) {
+ gssx_dec_name_attr(xdr, &dummy);
+ }
+
+ naa->count = 0;
+ naa->data = NULL;
+ return 0;
+}
+
+static struct xdr_netobj zero_netobj = {};
+
+static struct gssx_name_attr_array zero_name_attr_array = {};
+
+static struct gssx_option_array zero_option_array = {};
+
+static int gssx_enc_name(struct xdr_stream *xdr,
+ struct gssx_name *name)
+{
+ int err;
+
+ /* name->display_name */
+ err = gssx_enc_buffer(xdr, &name->display_name);
+ if (err)
+ return err;
+
+ /* name->name_type */
+ err = gssx_enc_buffer(xdr, &zero_netobj);
+ if (err)
+ return err;
+
+ /* name->exported_name */
+ err = gssx_enc_buffer(xdr, &zero_netobj);
+ if (err)
+ return err;
+
+ /* name->exported_composite_name */
+ err = gssx_enc_buffer(xdr, &zero_netobj);
+ if (err)
+ return err;
+
+ /* leave name_attributes empty for now, will add once we have any
+ * to pass up at all */
+ /* name->name_attributes */
+ err = dummy_enc_nameattr_array(xdr, &zero_name_attr_array);
+ if (err)
+ return err;
+
+ /* leave options empty for now, will add once we have any options
+ * to pass up at all */
+ /* name->extensions */
+ err = dummy_enc_opt_array(xdr, &zero_option_array);
+
+ return err;
+}
+
+
+static int gssx_dec_name(struct xdr_stream *xdr,
+ struct gssx_name *name)
+{
+ struct xdr_netobj dummy_netobj = { .len = 0 };
+ struct gssx_name_attr_array dummy_name_attr_array = { .count = 0 };
+ struct gssx_option_array dummy_option_array = { .count = 0 };
+ int err;
+
+ /* name->display_name */
+ err = gssx_dec_buffer(xdr, &name->display_name);
+ if (err)
+ return err;
+
+ /* name->name_type */
+ err = gssx_dec_buffer(xdr, &dummy_netobj);
+ if (err)
+ return err;
+
+ /* name->exported_name */
+ err = gssx_dec_buffer(xdr, &dummy_netobj);
+ if (err)
+ return err;
+
+ /* name->exported_composite_name */
+ err = gssx_dec_buffer(xdr, &dummy_netobj);
+ if (err)
+ return err;
+
+ /* we assume we have no attributes for now, so simply consume them */
+ /* name->name_attributes */
+ err = dummy_dec_nameattr_array(xdr, &dummy_name_attr_array);
+ if (err)
+ return err;
+
+ /* we assume we have no options for now, so simply consume them */
+ /* name->extensions */
+ err = dummy_dec_opt_array(xdr, &dummy_option_array);
+
+ return err;
+}
+
+static int dummy_enc_credel_array(struct xdr_stream *xdr,
+ struct gssx_cred_element_array *cea)
+{
+ __be32 *p;
+
+ if (cea->count != 0)
+ return -EINVAL;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return -ENOSPC;
+ *p = 0;
+
+ return 0;
+}
+
+static int gssx_enc_cred(struct xdr_stream *xdr,
+ struct gssx_cred *cred)
+{
+ int err;
+
+ /* cred->desired_name */
+ err = gssx_enc_name(xdr, &cred->desired_name);
+ if (err)
+ return err;
+
+ /* cred->elements */
+ err = dummy_enc_credel_array(xdr, &cred->elements);
+ if (err)
+ return err;
+
+ /* cred->cred_handle_reference */
+ err = gssx_enc_buffer(xdr, &cred->cred_handle_reference);
+ if (err)
+ return err;
+
+ /* cred->needs_release */
+ err = gssx_enc_bool(xdr, cred->needs_release);
+
+ return err;
+}
+
+static int gssx_enc_ctx(struct xdr_stream *xdr,
+ struct gssx_ctx *ctx)
+{
+ __be32 *p;
+ int err;
+
+ /* ctx->exported_context_token */
+ err = gssx_enc_buffer(xdr, &ctx->exported_context_token);
+ if (err)
+ return err;
+
+ /* ctx->state */
+ err = gssx_enc_buffer(xdr, &ctx->state);
+ if (err)
+ return err;
+
+ /* ctx->need_release */
+ err = gssx_enc_bool(xdr, ctx->need_release);
+ if (err)
+ return err;
+
+ /* ctx->mech */
+ err = gssx_enc_buffer(xdr, &ctx->mech);
+ if (err)
+ return err;
+
+ /* ctx->src_name */
+ err = gssx_enc_name(xdr, &ctx->src_name);
+ if (err)
+ return err;
+
+ /* ctx->targ_name */
+ err = gssx_enc_name(xdr, &ctx->targ_name);
+ if (err)
+ return err;
+
+ /* ctx->lifetime */
+ p = xdr_reserve_space(xdr, 8+8);
+ if (!p)
+ return -ENOSPC;
+ p = xdr_encode_hyper(p, ctx->lifetime);
+
+ /* ctx->ctx_flags */
+ p = xdr_encode_hyper(p, ctx->ctx_flags);
+
+ /* ctx->locally_initiated */
+ err = gssx_enc_bool(xdr, ctx->locally_initiated);
+ if (err)
+ return err;
+
+ /* ctx->open */
+ err = gssx_enc_bool(xdr, ctx->open);
+ if (err)
+ return err;
+
+ /* leave options empty for now, will add once we have any options
+ * to pass up at all */
+ /* ctx->options */
+ err = dummy_enc_opt_array(xdr, &ctx->options);
+
+ return err;
+}
+
+static int gssx_dec_ctx(struct xdr_stream *xdr,
+ struct gssx_ctx *ctx)
+{
+ __be32 *p;
+ int err;
+
+ /* ctx->exported_context_token */
+ err = gssx_dec_buffer(xdr, &ctx->exported_context_token);
+ if (err)
+ return err;
+
+ /* ctx->state */
+ err = gssx_dec_buffer(xdr, &ctx->state);
+ if (err)
+ return err;
+
+ /* ctx->need_release */
+ err = gssx_dec_bool(xdr, &ctx->need_release);
+ if (err)
+ return err;
+
+ /* ctx->mech */
+ err = gssx_dec_buffer(xdr, &ctx->mech);
+ if (err)
+ return err;
+
+ /* ctx->src_name */
+ err = gssx_dec_name(xdr, &ctx->src_name);
+ if (err)
+ return err;
+
+ /* ctx->targ_name */
+ err = gssx_dec_name(xdr, &ctx->targ_name);
+ if (err)
+ return err;
+
+ /* ctx->lifetime */
+ p = xdr_inline_decode(xdr, 8+8);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ p = xdr_decode_hyper(p, &ctx->lifetime);
+
+ /* ctx->ctx_flags */
+ p = xdr_decode_hyper(p, &ctx->ctx_flags);
+
+ /* ctx->locally_initiated */
+ err = gssx_dec_bool(xdr, &ctx->locally_initiated);
+ if (err)
+ return err;
+
+ /* ctx->open */
+ err = gssx_dec_bool(xdr, &ctx->open);
+ if (err)
+ return err;
+
+ /* we assume we have no options for now, so simply consume them */
+ /* ctx->options */
+ err = dummy_dec_opt_array(xdr, &ctx->options);
+
+ return err;
+}
+
+static int gssx_enc_cb(struct xdr_stream *xdr, struct gssx_cb *cb)
+{
+ __be32 *p;
+ int err;
+
+ /* cb->initiator_addrtype */
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return -ENOSPC;
+ p = xdr_encode_hyper(p, cb->initiator_addrtype);
+
+ /* cb->initiator_address */
+ err = gssx_enc_buffer(xdr, &cb->initiator_address);
+ if (err)
+ return err;
+
+ /* cb->acceptor_addrtype */
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return -ENOSPC;
+ p = xdr_encode_hyper(p, cb->acceptor_addrtype);
+
+ /* cb->acceptor_address */
+ err = gssx_enc_buffer(xdr, &cb->acceptor_address);
+ if (err)
+ return err;
+
+ /* cb->application_data */
+ err = gssx_enc_buffer(xdr, &cb->application_data);
+
+ return err;
+}
+
+void gssx_enc_accept_sec_context(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct gssx_arg_accept_sec_context *arg)
+{
+ int err;
+
+ err = gssx_enc_call_ctx(xdr, &arg->call_ctx);
+ if (err)
+ goto done;
+
+ /* arg->context_handle */
+ if (arg->context_handle)
+ err = gssx_enc_ctx(xdr, arg->context_handle);
+ else
+ err = gssx_enc_bool(xdr, 0);
+ if (err)
+ goto done;
+
+ /* arg->cred_handle */
+ if (arg->cred_handle)
+ err = gssx_enc_cred(xdr, arg->cred_handle);
+ else
+ err = gssx_enc_bool(xdr, 0);
+ if (err)
+ goto done;
+
+ /* arg->input_token */
+ err = gssx_enc_in_token(xdr, &arg->input_token);
+ if (err)
+ goto done;
+
+ /* arg->input_cb */
+ if (arg->input_cb)
+ err = gssx_enc_cb(xdr, arg->input_cb);
+ else
+ err = gssx_enc_bool(xdr, 0);
+ if (err)
+ goto done;
+
+ err = gssx_enc_bool(xdr, arg->ret_deleg_cred);
+ if (err)
+ goto done;
+
+ /* leave options empty for now, will add once we have any options
+ * to pass up at all */
+ /* arg->options */
+ err = dummy_enc_opt_array(xdr, &arg->options);
+
+ xdr_inline_pages(&req->rq_rcv_buf,
+ PAGE_SIZE/2 /* pretty arbitrary */,
+ arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
+done:
+ if (err)
+ dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err);
+}
+
+int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct gssx_res_accept_sec_context *res)
+{
+ u32 value_follows;
+ int err;
+
+ /* res->status */
+ err = gssx_dec_status(xdr, &res->status);
+ if (err)
+ return err;
+
+ /* res->context_handle */
+ err = gssx_dec_bool(xdr, &value_follows);
+ if (err)
+ return err;
+ if (value_follows) {
+ err = gssx_dec_ctx(xdr, res->context_handle);
+ if (err)
+ return err;
+ } else {
+ res->context_handle = NULL;
+ }
+
+ /* res->output_token */
+ err = gssx_dec_bool(xdr, &value_follows);
+ if (err)
+ return err;
+ if (value_follows) {
+ err = gssx_dec_buffer(xdr, res->output_token);
+ if (err)
+ return err;
+ } else {
+ res->output_token = NULL;
+ }
+
+ /* res->delegated_cred_handle */
+ err = gssx_dec_bool(xdr, &value_follows);
+ if (err)
+ return err;
+ if (value_follows) {
+ /* we do not support upcall servers sending this data. */
+ return -EINVAL;
+ }
+
+ /* res->options */
+ err = gssx_dec_option_array(xdr, &res->options);
+
+ return err;
+}
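
Every optional field in the reply follows the same XDR pattern: a boolean discriminant, then the body only when it is one. A decoder for a single optional buffer in the same style would look like this (sketch, not part of the patch):

	static int gssx_dec_optional_buffer(struct xdr_stream *xdr,
					    gssx_buffer *buf, bool *present)
	{
		u32 value_follows;
		int err;

		err = gssx_dec_bool(xdr, &value_follows);
		if (err)
			return err;
		*present = value_follows != 0;
		if (!*present)
			return 0;	/* field absent on the wire */
		return gssx_dec_buffer(xdr, buf);
	}
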
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.h b/net/sunrpc/auth_gss/gss_rpc_xdr.h
new file mode 100644
index 00000000000..685a688f3d8
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.h
@@ -0,0 +1,267 @@
+/*
+ * GSS Proxy upcall module
+ *
+ * Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _LINUX_GSS_RPC_XDR_H
+#define _LINUX_GSS_RPC_XDR_H
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+#define LUCID_OPTION "exported_context_type"
+#define LUCID_VALUE "linux_lucid_v1"
+#define CREDS_OPTION "exported_creds_type"
+#define CREDS_VALUE "linux_creds_v1"
+
+typedef struct xdr_netobj gssx_buffer;
+typedef struct xdr_netobj utf8string;
+typedef struct xdr_netobj gssx_OID;
+
+enum gssx_cred_usage {
+ GSSX_C_INITIATE = 1,
+ GSSX_C_ACCEPT = 2,
+ GSSX_C_BOTH = 3,
+};
+
+struct gssx_option {
+ gssx_buffer option;
+ gssx_buffer value;
+};
+
+struct gssx_option_array {
+ u32 count;
+ struct gssx_option *data;
+};
+
+struct gssx_status {
+ u64 major_status;
+ gssx_OID mech;
+ u64 minor_status;
+ utf8string major_status_string;
+ utf8string minor_status_string;
+ gssx_buffer server_ctx;
+ struct gssx_option_array options;
+};
+
+struct gssx_call_ctx {
+ utf8string locale;
+ gssx_buffer server_ctx;
+ struct gssx_option_array options;
+};
+
+struct gssx_name_attr {
+ gssx_buffer attr;
+ gssx_buffer value;
+ struct gssx_option_array extensions;
+};
+
+struct gssx_name_attr_array {
+ u32 count;
+ struct gssx_name_attr *data;
+};
+
+struct gssx_name {
+ gssx_buffer display_name;
+};
+typedef struct gssx_name gssx_name;
+
+struct gssx_cred_element {
+ gssx_name MN;
+ gssx_OID mech;
+ u32 cred_usage;
+ u64 initiator_time_rec;
+ u64 acceptor_time_rec;
+ struct gssx_option_array options;
+};
+
+struct gssx_cred_element_array {
+ u32 count;
+ struct gssx_cred_element *data;
+};
+
+struct gssx_cred {
+ gssx_name desired_name;
+ struct gssx_cred_element_array elements;
+ gssx_buffer cred_handle_reference;
+ u32 needs_release;
+};
+
+struct gssx_ctx {
+ gssx_buffer exported_context_token;
+ gssx_buffer state;
+ u32 need_release;
+ gssx_OID mech;
+ gssx_name src_name;
+ gssx_name targ_name;
+ u64 lifetime;
+ u64 ctx_flags;
+ u32 locally_initiated;
+ u32 open;
+ struct gssx_option_array options;
+};
+
+struct gssx_cb {
+ u64 initiator_addrtype;
+ gssx_buffer initiator_address;
+ u64 acceptor_addrtype;
+ gssx_buffer acceptor_address;
+ gssx_buffer application_data;
+};
+
+
+/* This structure is not defined in the protocol.
+ * It is used in the kernel to carry around a big buffer
+ * as a set of pages */
+struct gssp_in_token {
+ struct page **pages; /* Array of contiguous pages */
+ unsigned int page_base; /* Start of page data */
+ unsigned int page_len; /* Length of page data */
+};
+
+struct gssx_arg_accept_sec_context {
+ struct gssx_call_ctx call_ctx;
+ struct gssx_ctx *context_handle;
+ struct gssx_cred *cred_handle;
+ struct gssp_in_token input_token;
+ struct gssx_cb *input_cb;
+ u32 ret_deleg_cred;
+ struct gssx_option_array options;
+ struct page **pages;
+ unsigned int npages;
+};
+
+struct gssx_res_accept_sec_context {
+ struct gssx_status status;
+ struct gssx_ctx *context_handle;
+ gssx_buffer *output_token;
+ /* struct gssx_cred *delegated_cred_handle; not used in kernel */
+ struct gssx_option_array options;
+};
+
+
+
+#define gssx_enc_indicate_mechs NULL
+#define gssx_dec_indicate_mechs NULL
+#define gssx_enc_get_call_context NULL
+#define gssx_dec_get_call_context NULL
+#define gssx_enc_import_and_canon_name NULL
+#define gssx_dec_import_and_canon_name NULL
+#define gssx_enc_export_cred NULL
+#define gssx_dec_export_cred NULL
+#define gssx_enc_import_cred NULL
+#define gssx_dec_import_cred NULL
+#define gssx_enc_acquire_cred NULL
+#define gssx_dec_acquire_cred NULL
+#define gssx_enc_store_cred NULL
+#define gssx_dec_store_cred NULL
+#define gssx_enc_init_sec_context NULL
+#define gssx_dec_init_sec_context NULL
+void gssx_enc_accept_sec_context(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct gssx_arg_accept_sec_context *args);
+int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct gssx_res_accept_sec_context *res);
+#define gssx_enc_release_handle NULL
+#define gssx_dec_release_handle NULL
+#define gssx_enc_get_mic NULL
+#define gssx_dec_get_mic NULL
+#define gssx_enc_verify NULL
+#define gssx_dec_verify NULL
+#define gssx_enc_wrap NULL
+#define gssx_dec_wrap NULL
+#define gssx_enc_unwrap NULL
+#define gssx_dec_unwrap NULL
+#define gssx_enc_wrap_size_limit NULL
+#define gssx_dec_wrap_size_limit NULL
+
+/* unimplemented calls are set to 0 size */
+#define GSSX_ARG_indicate_mechs_sz 0
+#define GSSX_RES_indicate_mechs_sz 0
+#define GSSX_ARG_get_call_context_sz 0
+#define GSSX_RES_get_call_context_sz 0
+#define GSSX_ARG_import_and_canon_name_sz 0
+#define GSSX_RES_import_and_canon_name_sz 0
+#define GSSX_ARG_export_cred_sz 0
+#define GSSX_RES_export_cred_sz 0
+#define GSSX_ARG_import_cred_sz 0
+#define GSSX_RES_import_cred_sz 0
+#define GSSX_ARG_acquire_cred_sz 0
+#define GSSX_RES_acquire_cred_sz 0
+#define GSSX_ARG_store_cred_sz 0
+#define GSSX_RES_store_cred_sz 0
+#define GSSX_ARG_init_sec_context_sz 0
+#define GSSX_RES_init_sec_context_sz 0
+
+#define GSSX_default_in_call_ctx_sz (4 + 4 + 4 + \
+ 8 + sizeof(LUCID_OPTION) + sizeof(LUCID_VALUE) + \
+ 8 + sizeof(CREDS_OPTION) + sizeof(CREDS_VALUE))
+#define GSSX_default_in_ctx_hndl_sz (4 + 4+8 + 4 + 4 + 6*4 + 6*4 + 8 + 8 + \
+ 4 + 4 + 4)
+#define GSSX_default_in_cred_sz 4 /* we send in no cred_handle */
+#define GSSX_default_in_token_sz 4 /* does *not* include token data */
+#define GSSX_default_in_cb_sz 4 /* we do not use channel bindings */
+#define GSSX_ARG_accept_sec_context_sz (GSSX_default_in_call_ctx_sz + \
+ GSSX_default_in_ctx_hndl_sz + \
+ GSSX_default_in_cred_sz + \
+ GSSX_default_in_token_sz + \
+ GSSX_default_in_cb_sz + \
+ 4 /* no deleg creds boolean */ + \
+ 4) /* empty options */
+
+/* somewhat arbitrary numbers but large enough (we ignore some of the data
+ * sent down, but it is part of the protocol so we need enough space to take
+ * it in) */
+#define GSSX_default_status_sz (8 + 24 + 8 + 256 + 256 + 16 + 4)
+#define GSSX_max_output_handle_sz 128
+#define GSSX_max_oid_sz 16
+#define GSSX_max_princ_sz 256
+#define GSSX_default_ctx_sz (GSSX_max_output_handle_sz + \
+ 16 + 4 + GSSX_max_oid_sz + \
+ 2 * GSSX_max_princ_sz + \
+ 8 + 8 + 4 + 4 + 4)
+#define GSSX_max_output_token_sz 1024
+/* grouplist not included; we allocate separate pages for that: */
+#define GSSX_max_creds_sz (4 + 4 + 4 /* + NGROUPS_MAX*4 */)
+#define GSSX_RES_accept_sec_context_sz (GSSX_default_status_sz + \
+ GSSX_default_ctx_sz + \
+ GSSX_max_output_token_sz + \
+ 4 + GSSX_max_creds_sz)
+
+#define GSSX_ARG_release_handle_sz 0
+#define GSSX_RES_release_handle_sz 0
+#define GSSX_ARG_get_mic_sz 0
+#define GSSX_RES_get_mic_sz 0
+#define GSSX_ARG_verify_sz 0
+#define GSSX_RES_verify_sz 0
+#define GSSX_ARG_wrap_sz 0
+#define GSSX_RES_wrap_sz 0
+#define GSSX_ARG_unwrap_sz 0
+#define GSSX_RES_unwrap_sz 0
+#define GSSX_ARG_wrap_size_limit_sz 0
+#define GSSX_RES_wrap_size_limit_sz 0
+
+
+
+#endif /* _LINUX_GSS_RPC_XDR_H */
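
Summing the reply-side budget above, for reference:

	GSSX_default_status_sz = 8 + 24 + 8 + 256 + 256 + 16 + 4            =  572
	GSSX_default_ctx_sz    = 128 + 16 + 4 + 16 + 2*256 + 8+8+4+4+4      =  704
	GSSX_max_creds_sz      = 4 + 4 + 4                                  =   12
	GSSX_RES_accept_sec_context_sz = 572 + 704 + 1024 + 4 + 12          = 2316

so a full ACCEPT_SEC_CONTEXT reply is budgeted at a little over 2 KiB, not counting the separately-paged grouplist.
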
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 73e95738660..4ce5eccec1f 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -48,8 +48,8 @@
#include <linux/sunrpc/svcauth.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/cache.h>
-#include "../netns.h"
+#include "gss_rpc_upcall.h"
#ifdef RPC_DEBUG
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -182,12 +182,6 @@ static void rsi_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
-static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- return sunrpc_cache_pipe_upcall(cd, h, rsi_request);
-}
-
-
static int rsi_parse(struct cache_detail *cd,
char *mesg, int mlen)
{
@@ -275,7 +269,7 @@ static struct cache_detail rsi_cache_template = {
.hash_size = RSI_HASHMAX,
.name = "auth.rpcsec.init",
.cache_put = rsi_put,
- .cache_upcall = rsi_upcall,
+ .cache_request = rsi_request,
.cache_parse = rsi_parse,
.match = rsi_match,
.init = rsi_init,
@@ -383,8 +377,7 @@ rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
new->handle.data = tmp->handle.data;
tmp->handle.data = NULL;
new->mechctx = NULL;
- new->cred.cr_group_info = NULL;
- new->cred.cr_principal = NULL;
+ init_svc_cred(&new->cred);
}
static void
@@ -398,9 +391,7 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
memset(&new->seqdata, 0, sizeof(new->seqdata));
spin_lock_init(&new->seqdata.sd_lock);
new->cred = tmp->cred;
- tmp->cred.cr_group_info = NULL;
- new->cred.cr_principal = tmp->cred.cr_principal;
- tmp->cred.cr_principal = NULL;
+ init_svc_cred(&tmp->cred);
}
static struct cache_head *
@@ -418,6 +409,7 @@ static int rsc_parse(struct cache_detail *cd,
{
/* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */
char *buf = mesg;
+ int id;
int len, rv;
struct rsc rsci, *rscp = NULL;
time_t expiry;
@@ -444,7 +436,7 @@ static int rsc_parse(struct cache_detail *cd,
goto out;
/* uid, or NEGATIVE */
- rv = get_int(&mesg, &rsci.cred.cr_uid);
+ rv = get_int(&mesg, &id);
if (rv == -EINVAL)
goto out;
if (rv == -ENOENT)
@@ -452,9 +444,21 @@ static int rsc_parse(struct cache_detail *cd,
else {
int N, i;
+		/*
+		 * NOTE: we skip uid_valid()/gid_valid() checks here;
+		 * instead, -1 ids are later mapped to the
+		 * (export-specific) anonymous id by nfsd_setuser.
+		 *
+		 * (But supplementary gids get no such special
+		 * treatment, so they are checked for validity here.)
+		 */
+ /* uid */
+ rsci.cred.cr_uid = make_kuid(&init_user_ns, id);
+
/* gid */
- if (get_int(&mesg, &rsci.cred.cr_gid))
+ if (get_int(&mesg, &id))
goto out;
+ rsci.cred.cr_gid = make_kgid(&init_user_ns, id);
/* number of additional gid's */
if (get_int(&mesg, &N))
@@ -467,11 +471,10 @@ static int rsc_parse(struct cache_detail *cd,
/* gid's */
status = -EINVAL;
for (i=0; i<N; i++) {
- gid_t gid;
kgid_t kgid;
- if (get_int(&mesg, &gid))
+ if (get_int(&mesg, &id))
goto out;
- kgid = make_kgid(&init_user_ns, gid);
+ kgid = make_kgid(&init_user_ns, id);
if (!gid_valid(kgid))
goto out;
GROUP_AT(rsci.cred.cr_group_info, i) = kgid;
@@ -481,7 +484,7 @@ static int rsc_parse(struct cache_detail *cd,
len = qword_get(&mesg, buf, mlen);
if (len < 0)
goto out;
- gm = gss_mech_get_by_name(buf);
+ gm = rsci.cred.cr_gss_mech = gss_mech_get_by_name(buf);
status = -EOPNOTSUPP;
if (!gm)
goto out;
@@ -491,7 +494,8 @@ static int rsc_parse(struct cache_detail *cd,
len = qword_get(&mesg, buf, mlen);
if (len < 0)
goto out;
- status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, GFP_KERNEL);
+ status = gss_import_sec_context(buf, len, gm, &rsci.mechctx,
+ NULL, GFP_KERNEL);
if (status)
goto out;
@@ -499,8 +503,10 @@ static int rsc_parse(struct cache_detail *cd,
len = qword_get(&mesg, buf, mlen);
if (len > 0) {
rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL);
- if (!rsci.cred.cr_principal)
+ if (!rsci.cred.cr_principal) {
+ status = -ENOMEM;
goto out;
+ }
}
}
@@ -508,7 +514,6 @@ static int rsc_parse(struct cache_detail *cd,
rscp = rsc_update(cd, &rsci, rscp);
status = 0;
out:
- gss_mech_put(gm);
rsc_free(&rsci);
if (rscp)
cache_put(&rscp->h, cd);
@@ -817,13 +822,17 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
* The server uses base of head iovec as read pointer, while the
* client uses separate pointer. */
static int
-unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
+unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
{
int stat = -EINVAL;
u32 integ_len, maj_stat;
struct xdr_netobj mic;
struct xdr_buf integ_buf;
+ /* Did we already verify the signature on the original pass through? */
+ if (rqstp->rq_deferred)
+ return 0;
+
integ_len = svc_getnl(&buf->head[0]);
if (integ_len & 3)
return stat;
@@ -846,6 +855,8 @@ unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
goto out;
if (svc_getnl(&buf->head[0]) != seq)
goto out;
+ /* trim off the mic at the end before returning */
+ xdr_buf_trim(buf, mic.len + 4);
stat = 0;
out:
kfree(mic.data);
@@ -975,13 +986,10 @@ gss_write_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp,
}
static inline int
-gss_read_verf(struct rpc_gss_wire_cred *gc,
- struct kvec *argv, __be32 *authp,
- struct xdr_netobj *in_handle,
- struct xdr_netobj *in_token)
+gss_read_common_verf(struct rpc_gss_wire_cred *gc,
+ struct kvec *argv, __be32 *authp,
+ struct xdr_netobj *in_handle)
{
- struct xdr_netobj tmpobj;
-
/* Read the verifier; should be NULL: */
*authp = rpc_autherr_badverf;
if (argv->iov_len < 2 * 4)
@@ -997,6 +1005,23 @@ gss_read_verf(struct rpc_gss_wire_cred *gc,
if (dup_netobj(in_handle, &gc->gc_ctx))
return SVC_CLOSE;
*authp = rpc_autherr_badverf;
+
+ return 0;
+}
+
+static inline int
+gss_read_verf(struct rpc_gss_wire_cred *gc,
+ struct kvec *argv, __be32 *authp,
+ struct xdr_netobj *in_handle,
+ struct xdr_netobj *in_token)
+{
+ struct xdr_netobj tmpobj;
+ int res;
+
+ res = gss_read_common_verf(gc, argv, authp, in_handle);
+ if (res)
+ return res;
+
if (svc_safe_getnetobj(argv, &tmpobj)) {
kfree(in_handle->data);
return SVC_DENIED;
@@ -1009,6 +1034,40 @@ gss_read_verf(struct rpc_gss_wire_cred *gc,
return 0;
}
+/* OK, this depends heavily on a set of semantics in how rqstp is set
+ * up by svc_recv and pages are laid down by the server when reading a
+ * request. We are basically guaranteed that the token lies linearly
+ * across a set of pages, starting at iov_base in rq_arg.head[0],
+ * which happens to be the first of a set of pages stored in
+ * rq_pages[].
+ * rq_arg.head[0].iov_base provides the page_base to pass to the
+ * upcall.
+ */
+static inline int
+gss_read_proxy_verf(struct svc_rqst *rqstp,
+ struct rpc_gss_wire_cred *gc, __be32 *authp,
+ struct xdr_netobj *in_handle,
+ struct gssp_in_token *in_token)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ u32 inlen;
+ int res;
+
+ res = gss_read_common_verf(gc, argv, authp, in_handle);
+ if (res)
+ return res;
+
+ inlen = svc_getnl(argv);
+ if (inlen > (argv->iov_len + rqstp->rq_arg.page_len))
+ return SVC_DENIED;
+
+ in_token->pages = rqstp->rq_pages;
+ in_token->page_base = (ulong)argv->iov_base & ~PAGE_MASK;
+ in_token->page_len = inlen;
+
+ return 0;
+}
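
The masking keeps only the token's offset within its first page. For example, with 4 KiB pages (PAGE_MASK == ~0xfffUL), an iov_base ending in 0xa30 yields page_base == 0xa30: the token starts 0xa30 bytes into rq_pages[0] and, per the comment above, runs linearly across the following pages.
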
+
static inline int
gss_write_resv(struct kvec *resv, size_t size_limit,
struct xdr_netobj *out_handle, struct xdr_netobj *out_token,
@@ -1036,7 +1095,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit,
* the upcall results are available, write the verifier and result.
* Otherwise, drop the request pending an answer to the upcall.
*/
-static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
+static int svcauth_gss_legacy_init(struct svc_rqst *rqstp,
struct rpc_gss_wire_cred *gc, __be32 *authp)
{
struct kvec *argv = &rqstp->rq_arg.head[0];
@@ -1076,6 +1135,254 @@ out:
return ret;
}
+static int gss_proxy_save_rsc(struct cache_detail *cd,
+ struct gssp_upcall_data *ud,
+ uint64_t *handle)
+{
+ struct rsc rsci, *rscp = NULL;
+ static atomic64_t ctxhctr;
+ long long ctxh;
+ struct gss_api_mech *gm = NULL;
+ time_t expiry;
+ int status = -EINVAL;
+
+ memset(&rsci, 0, sizeof(rsci));
+ /* context handle */
+ status = -ENOMEM;
+ /* the handle needs to be just a unique id,
+ * use a static counter */
+ ctxh = atomic64_inc_return(&ctxhctr);
+
+ /* make a copy for the caller */
+ *handle = ctxh;
+
+ /* make a copy for the rsc cache */
+ if (dup_to_netobj(&rsci.handle, (char *)handle, sizeof(uint64_t)))
+ goto out;
+ rscp = rsc_lookup(cd, &rsci);
+ if (!rscp)
+ goto out;
+
+ /* creds */
+ if (!ud->found_creds) {
+		/* userspace seems buggy; we should always get at least a
+		 * mapping to nobody */
+ dprintk("RPC: No creds found!\n");
+ goto out;
+ } else {
+
+ /* steal creds */
+ rsci.cred = ud->creds;
+ memset(&ud->creds, 0, sizeof(struct svc_cred));
+
+ status = -EOPNOTSUPP;
+ /* get mech handle from OID */
+ gm = gss_mech_get_by_OID(&ud->mech_oid);
+ if (!gm)
+ goto out;
+ rsci.cred.cr_gss_mech = gm;
+
+		/* mech-specific data: */
+ status = gss_import_sec_context(ud->out_handle.data,
+ ud->out_handle.len,
+ gm, &rsci.mechctx,
+ &expiry, GFP_KERNEL);
+ if (status)
+ goto out;
+ }
+
+ rsci.h.expiry_time = expiry;
+ rscp = rsc_update(cd, &rsci, rscp);
+ status = 0;
+out:
+ rsc_free(&rsci);
+ if (rscp)
+ cache_put(&rscp->h, cd);
+ else
+ status = -ENOMEM;
+ return status;
+}
+
+static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
+ struct rpc_gss_wire_cred *gc, __be32 *authp)
+{
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ struct xdr_netobj cli_handle;
+ struct gssp_upcall_data ud;
+ uint64_t handle;
+ int status;
+ int ret;
+ struct net *net = rqstp->rq_xprt->xpt_net;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ memset(&ud, 0, sizeof(ud));
+ ret = gss_read_proxy_verf(rqstp, gc, authp,
+ &ud.in_handle, &ud.in_token);
+ if (ret)
+ return ret;
+
+ ret = SVC_CLOSE;
+
+ /* Perform synchronous upcall to gss-proxy */
+ status = gssp_accept_sec_context_upcall(net, &ud);
+ if (status)
+ goto out;
+
+ dprintk("RPC: svcauth_gss: gss major status = %d\n",
+ ud.major_status);
+
+ switch (ud.major_status) {
+ case GSS_S_CONTINUE_NEEDED:
+ cli_handle = ud.out_handle;
+ break;
+ case GSS_S_COMPLETE:
+ status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle);
+ if (status)
+ goto out;
+ cli_handle.data = (u8 *)&handle;
+ cli_handle.len = sizeof(handle);
+ break;
+ default:
+ ret = SVC_CLOSE;
+ goto out;
+ }
+
+ /* Got an answer to the upcall; use it: */
+ if (gss_write_init_verf(sn->rsc_cache, rqstp,
+ &cli_handle, &ud.major_status))
+ goto out;
+ if (gss_write_resv(resv, PAGE_SIZE,
+ &cli_handle, &ud.out_token,
+ ud.major_status, ud.minor_status))
+ goto out;
+
+ ret = SVC_COMPLETE;
+out:
+ gssp_free_upcall_data(&ud);
+ return ret;
+}
+
+/*
+ * Try to set the sn->use_gss_proxy variable to a new value. We only allow
+ * it to be changed if it's currently undefined (-1). If it's any other value
+ * then return -EBUSY unless the type wouldn't have changed anyway.
+ */
+static int set_gss_proxy(struct net *net, int type)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ int ret;
+
+ WARN_ON_ONCE(type != 0 && type != 1);
+ ret = cmpxchg(&sn->use_gss_proxy, -1, type);
+ if (ret != -1 && ret != type)
+ return -EBUSY;
+ return 0;
+}
+
+static bool use_gss_proxy(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ /* If use_gss_proxy is still undefined, then try to disable it */
+ if (sn->use_gss_proxy == -1)
+ set_gss_proxy(net, 0);
+ return sn->use_gss_proxy;
+}
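
use_gss_proxy is therefore a write-once latch: from -1, cmpxchg() lets the first caller pick 0 or 1, and afterwards set_gss_proxy() succeeds only when the value would not change (illustrative):

	/* after use_gss_proxy() has latched the value 0: */
	set_gss_proxy(net, 1);	/* -EBUSY: proxy can no longer be enabled */
	set_gss_proxy(net, 0);	/* 0: no change requested */

In particular, if a GSS init request arrives before userspace writes "1" to the proc file, the namespace stays on the legacy rpc_pipefs upcall for its lifetime.
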
+
+#ifdef CONFIG_PROC_FS
+
+static ssize_t write_gssp(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct net *net = PDE_DATA(file_inode(file));
+ char tbuf[20];
+ unsigned long i;
+ int res;
+
+ if (*ppos || count > sizeof(tbuf)-1)
+ return -EINVAL;
+ if (copy_from_user(tbuf, buf, count))
+ return -EFAULT;
+
+ tbuf[count] = 0;
+ res = kstrtoul(tbuf, 0, &i);
+ if (res)
+ return res;
+ if (i != 1)
+ return -EINVAL;
+ res = set_gssp_clnt(net);
+ if (res)
+ return res;
+ res = set_gss_proxy(net, 1);
+ if (res)
+ return res;
+ return count;
+}
+
+static ssize_t read_gssp(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct net *net = PDE_DATA(file_inode(file));
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ unsigned long p = *ppos;
+ char tbuf[10];
+ size_t len;
+
+ snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy);
+ len = strlen(tbuf);
+ if (p >= len)
+ return 0;
+ len -= p;
+ if (len > count)
+ len = count;
+ if (copy_to_user(buf, (void *)(tbuf+p), len))
+ return -EFAULT;
+ *ppos += len;
+ return len;
+}
+
+static const struct file_operations use_gss_proxy_ops = {
+ .open = nonseekable_open,
+ .write = write_gssp,
+ .read = read_gssp,
+};
+
+static int create_use_gss_proxy_proc_entry(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct proc_dir_entry **p = &sn->use_gssp_proc;
+
+ sn->use_gss_proxy = -1;
+ *p = proc_create_data("use-gss-proxy", S_IFREG|S_IRUSR|S_IWUSR,
+ sn->proc_net_rpc,
+ &use_gss_proxy_ops, net);
+ if (!*p)
+ return -ENOMEM;
+ init_gssp_clnt(sn);
+ return 0;
+}
+
+static void destroy_use_gss_proxy_proc_entry(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ if (sn->use_gssp_proc) {
+ remove_proc_entry("use-gss-proxy", sn->proc_net_rpc);
+ clear_gssp_clnt(sn);
+ }
+}
+#else /* CONFIG_PROC_FS */
+
+static int create_use_gss_proxy_proc_entry(struct net *net)
+{
+ return 0;
+}
+
+static void destroy_use_gss_proxy_proc_entry(struct net *net) {}
+
+#endif /* CONFIG_PROC_FS */
+
/*
* Accept an rpcsec packet.
* If context establishment, punt to user space
@@ -1142,7 +1449,10 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
switch (gc->gc_proc) {
case RPC_GSS_PROC_INIT:
case RPC_GSS_PROC_CONTINUE_INIT:
- return svcauth_gss_handle_init(rqstp, gc, authp);
+ if (use_gss_proxy(SVC_NET(rqstp)))
+ return svcauth_gss_proxy_init(rqstp, gc, authp);
+ else
+ return svcauth_gss_legacy_init(rqstp, gc, authp);
case RPC_GSS_PROC_DATA:
case RPC_GSS_PROC_DESTROY:
/* Look up the context, and check the verifier: */
@@ -1190,9 +1500,10 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
/* placeholders for length and seq. number: */
svc_putnl(resv, 0);
svc_putnl(resv, 0);
- if (unwrap_integ_data(&rqstp->rq_arg,
+ if (unwrap_integ_data(rqstp, &rqstp->rq_arg,
gc->gc_seq, rsci->mechctx))
goto garbage_args;
+ rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE;
break;
case RPC_GSS_SVC_PRIVACY:
/* placeholders for length and seq. number: */
@@ -1201,6 +1512,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
if (unwrap_priv_data(rqstp, &rqstp->rq_arg,
gc->gc_seq, rsci->mechctx))
goto garbage_args;
+ rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE * 2;
break;
default:
goto auth_err;
@@ -1208,7 +1520,9 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
svcdata->rsci = rsci;
cache_get(&rsci->h);
rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor(
- rsci->mechctx->mech_type, gc->gc_svc);
+ rsci->mechctx->mech_type,
+ GSS_C_QOP_DEFAULT,
+ gc->gc_svc);
ret = SVC_OK;
goto out;
}
@@ -1279,8 +1593,7 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
BUG_ON(integ_len % 4);
*p++ = htonl(integ_len);
*p++ = htonl(gc->gc_seq);
- if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
- integ_len))
+ if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len))
BUG();
if (resbuf->tail[0].iov_base == NULL) {
if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE)
@@ -1288,10 +1601,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
resbuf->tail[0].iov_base = resbuf->head[0].iov_base
+ resbuf->head[0].iov_len;
resbuf->tail[0].iov_len = 0;
- resv = &resbuf->tail[0];
- } else {
- resv = &resbuf->tail[0];
}
+ resv = &resbuf->tail[0];
mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic))
goto out_err;
@@ -1517,7 +1828,12 @@ gss_svc_init_net(struct net *net)
rv = rsi_cache_create_net(net);
if (rv)
goto out1;
+ rv = create_use_gss_proxy_proc_entry(net);
+ if (rv)
+ goto out2;
return 0;
+out2:
+ destroy_use_gss_proxy_proc_entry(net);
out1:
rsc_cache_destroy_net(net);
return rv;
@@ -1526,6 +1842,7 @@ out1:
void
gss_svc_shutdown_net(struct net *net)
{
+ destroy_use_gss_proxy_proc_entry(net);
rsi_cache_destroy_net(net);
rsc_cache_destroy_net(net);
}
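
The use-gss-proxy proc file created above acts as a per-network-namespace switch: the gss-proxy daemon writes "1" at startup so that RPCSEC_GSS context establishment is routed through svcauth_gss_proxy_init() instead of the legacy rsi upcall. A minimal userspace sketch of that handshake, assuming the conventional /proc/net/rpc location for the directory that sn->proc_net_rpc points at:

/*
 * Sketch: flip the per-netns use-gss-proxy switch from userspace.
 * The path is an assumption based on the usual rpc proc directory;
 * the kernel side parsing it is write_gssp() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/proc/net/rpc/use-gss-proxy"; /* assumed location */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1) {	/* "1" selects the gss-proxy path */
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
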
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index a5c36c01707..f0ebe07978a 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -18,7 +18,7 @@ static struct rpc_auth null_auth;
static struct rpc_cred null_cred;
static struct rpc_auth *
-nul_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
+nul_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
atomic_inc(&null_auth.au_count);
return &null_auth;
@@ -88,13 +88,13 @@ nul_validate(struct rpc_task *task, __be32 *p)
flavor = ntohl(*p++);
if (flavor != RPC_AUTH_NULL) {
printk("RPC: bad verf flavor: %u\n", flavor);
- return NULL;
+ return ERR_PTR(-EIO);
}
size = ntohl(*p++);
if (size != 0) {
printk("RPC: bad verf size: %u\n", size);
- return NULL;
+ return ERR_PTR(-EIO);
}
return p;
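
The validate callback above now reports failures as ERR_PTR(-EIO) rather than NULL, so rpc_verify_header() can propagate a real errno instead of treating every failure identically. A freestanding sketch of the pointer-encoded-errno convention this relies on, simplified from include/linux/err.h; error codes occupy the topmost 4095 values of the address space, so they never collide with valid pointers:

#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static void *ERR_PTR(long error)      { return (void *)error; }
static long PTR_ERR(const void *ptr)  { return (long)ptr; }
static int IS_ERR(const void *ptr)    { return IS_ERR_VALUE((unsigned long)ptr); }

/* toy validator mirroring the new nul_validate() error convention */
static void *validate(int bad_flavor)
{
	if (bad_flavor)
		return ERR_PTR(-5);	/* -EIO, as nul_validate() now returns */
	return "ok";
}

int main(void)
{
	void *p = validate(1);

	if (IS_ERR(p))
		printf("verifier rejected, errno %ld\n", -PTR_ERR(p));
	return 0;
}
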
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 52c5abdee21..d5d69236629 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -18,8 +18,8 @@
struct unx_cred {
struct rpc_cred uc_base;
- gid_t uc_gid;
- gid_t uc_gids[NFS_NGROUPS];
+ kgid_t uc_gid;
+ kgid_t uc_gids[NFS_NGROUPS];
};
#define uc_uid uc_base.cr_uid
@@ -33,7 +33,7 @@ static struct rpc_auth unix_auth;
static const struct rpc_credops unix_credops;
static struct rpc_auth *
-unx_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
+unx_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
dprintk("RPC: creating UNIX authenticator for client %p\n",
clnt);
@@ -65,7 +65,8 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
unsigned int i;
dprintk("RPC: allocating UNIX cred for uid %d gid %d\n",
- acred->uid, acred->gid);
+ from_kuid(&init_user_ns, acred->uid),
+ from_kgid(&init_user_ns, acred->gid));
if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
return ERR_PTR(-ENOMEM);
@@ -79,13 +80,10 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
groups = NFS_NGROUPS;
cred->uc_gid = acred->gid;
- for (i = 0; i < groups; i++) {
- gid_t gid;
- gid = from_kgid(&init_user_ns, GROUP_AT(acred->group_info, i));
- cred->uc_gids[i] = gid;
- }
+ for (i = 0; i < groups; i++)
+ cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
if (i < NFS_NGROUPS)
- cred->uc_gids[i] = NOGROUP;
+ cred->uc_gids[i] = INVALID_GID;
return &cred->uc_base;
}
@@ -123,21 +121,17 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
unsigned int i;
- if (cred->uc_uid != acred->uid || cred->uc_gid != acred->gid)
+ if (!uid_eq(cred->uc_uid, acred->uid) || !gid_eq(cred->uc_gid, acred->gid))
return 0;
if (acred->group_info != NULL)
groups = acred->group_info->ngroups;
if (groups > NFS_NGROUPS)
groups = NFS_NGROUPS;
- for (i = 0; i < groups ; i++) {
- gid_t gid;
- gid = from_kgid(&init_user_ns, GROUP_AT(acred->group_info, i));
- if (cred->uc_gids[i] != gid)
+ for (i = 0; i < groups ; i++)
+ if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i)))
return 0;
- }
- if (groups < NFS_NGROUPS &&
- cred->uc_gids[groups] != NOGROUP)
+ if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
return 0;
return 1;
}
@@ -163,11 +157,11 @@ unx_marshal(struct rpc_task *task, __be32 *p)
*/
p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
- *p++ = htonl((u32) cred->uc_uid);
- *p++ = htonl((u32) cred->uc_gid);
+ *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid));
+ *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid));
hold = p++;
- for (i = 0; i < 16 && cred->uc_gids[i] != (gid_t) NOGROUP; i++)
- *p++ = htonl((u32) cred->uc_gids[i]);
+ for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++)
+ *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i]));
*hold = htonl(p - hold - 1); /* gid array length */
*base = htonl((p - base - 1) << 2); /* cred length */
@@ -198,13 +192,13 @@ unx_validate(struct rpc_task *task, __be32 *p)
flavor != RPC_AUTH_UNIX &&
flavor != RPC_AUTH_SHORT) {
printk("RPC: bad verf flavor: %u\n", flavor);
- return NULL;
+ return ERR_PTR(-EIO);
}
size = ntohl(*p++);
if (size > RPC_MAX_AUTH_SIZE) {
printk("RPC: giant verf size: %u\n", size);
- return NULL;
+ return ERR_PTR(-EIO);
}
task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2;
p += (size >> 2);
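
The auth_unix conversion stores gids as kernel-internal kgid_t values and defers the wire translation to marshalling time, with INVALID_GID terminating short group lists where NOGROUP used to. A self-contained sketch of the resulting encode loop; kgid_t, gid_valid and the identity from_kgid below are simplified stand-ins for the kernel types:

#include <stdio.h>
#include <stdint.h>

typedef struct { uint32_t val; } kgid_t;		/* opaque internal gid */
#define INVALID_GID	((kgid_t){ (uint32_t)-1 })
#define NFS_NGROUPS	16

static int gid_valid(kgid_t g) { return g.val != (uint32_t)-1; }
/* stand-in for from_kgid(&init_user_ns, g): identity mapping here */
static uint32_t from_kgid(kgid_t g) { return g.val; }

int main(void)
{
	kgid_t gids[NFS_NGROUPS] = { {100}, {27}, INVALID_GID };
	int i;

	/* mirrors the unx_marshal() loop: stop at the first invalid gid */
	for (i = 0; i < NFS_NGROUPS && gid_valid(gids[i]); i++)
		printf("wire gid[%d] = %u\n", i, from_kgid(gids[i]));
	printf("%d gids encoded\n", i);
	return 0;
}
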
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 890a29912d5..9761a0da964 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -64,7 +64,6 @@ static void xprt_free_allocation(struct rpc_rqst *req)
free_page((unsigned long)xbufp->head[0].iov_base);
xbufp = &req->rq_snd_buf;
free_page((unsigned long)xbufp->head[0].iov_base);
- list_del(&req->rq_bc_pa_list);
kfree(req);
}
@@ -168,8 +167,10 @@ out_free:
/*
* Memory allocation failed, free the temporary list
*/
- list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list)
+ list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) {
+ list_del(&req->rq_bc_pa_list);
xprt_free_allocation(req);
+ }
dprintk("RPC: setup backchannel transport failed\n");
return -ENOMEM;
@@ -198,6 +199,7 @@ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs)
xprt_dec_alloc_count(xprt, max_reqs);
list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
dprintk("RPC: req=%p\n", req);
+ list_del(&req->rq_bc_pa_list);
xprt_free_allocation(req);
if (--max_reqs == 0)
break;
@@ -210,39 +212,23 @@ out:
}
EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
-/*
- * One or more rpc_rqst structure have been preallocated during the
- * backchannel setup. Buffer space for the send and private XDR buffers
- * has been preallocated as well. Use xprt_alloc_bc_request to allocate
- * to this request. Use xprt_free_bc_request to return it.
- *
- * We know that we're called in soft interrupt context, grab the spin_lock
- * since there is no need to grab the bottom half spin_lock.
- *
- * Return an available rpc_rqst, otherwise NULL if non are available.
- */
-struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt)
+static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
{
- struct rpc_rqst *req;
+ struct rpc_rqst *req = NULL;
dprintk("RPC: allocate a backchannel request\n");
- spin_lock(&xprt->bc_pa_lock);
- if (!list_empty(&xprt->bc_pa_list)) {
- req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
- rq_bc_pa_list);
- list_del(&req->rq_bc_pa_list);
- } else {
- req = NULL;
- }
- spin_unlock(&xprt->bc_pa_lock);
+ if (list_empty(&xprt->bc_pa_list))
+ goto not_found;
- if (req != NULL) {
- set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
- req->rq_reply_bytes_recvd = 0;
- req->rq_bytes_sent = 0;
- memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+ req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
+ rq_bc_pa_list);
+ req->rq_reply_bytes_recvd = 0;
+ req->rq_bytes_sent = 0;
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
sizeof(req->rq_private_buf));
- }
+ req->rq_xid = xid;
+ req->rq_connect_cookie = xprt->connect_cookie;
+not_found:
dprintk("RPC: backchannel req=%p\n", req);
return req;
}
@@ -257,10 +243,11 @@ void xprt_free_bc_request(struct rpc_rqst *req)
dprintk("RPC: free backchannel req=%p\n", req);
- smp_mb__before_clear_bit();
+ req->rq_connect_cookie = xprt->connect_cookie - 1;
+ smp_mb__before_atomic();
WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
if (!xprt_need_to_requeue(xprt)) {
/*
@@ -279,7 +266,57 @@ void xprt_free_bc_request(struct rpc_rqst *req)
* may be reused by a new callback request.
*/
spin_lock_bh(&xprt->bc_pa_lock);
- list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+ list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
spin_unlock_bh(&xprt->bc_pa_lock);
}
+/*
+ * One or more rpc_rqst structures have been preallocated during the
+ * backchannel setup. Buffer space for the send and private XDR buffers
+ * has been preallocated as well. Use xprt_lookup_bc_request to obtain
+ * one of these requests; use xprt_free_bc_request to return it.
+ *
+ * We know that we're called in soft interrupt context, so take the
+ * plain spin_lock: there is no need for the bottom-half variant.
+ *
+ * Return an available rpc_rqst, otherwise NULL if none are available.
+ */
+struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid)
+{
+ struct rpc_rqst *req;
+
+ spin_lock(&xprt->bc_pa_lock);
+ list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) {
+ if (req->rq_connect_cookie != xprt->connect_cookie)
+ continue;
+ if (req->rq_xid == xid)
+ goto found;
+ }
+ req = xprt_alloc_bc_request(xprt, xid);
+found:
+ spin_unlock(&xprt->bc_pa_lock);
+ return req;
+}
+
+/*
+ * Add a callback request to the callback list. The callback
+ * service sleeps on the sv_cb_waitq waiting for new
+ * requests. Wake it up after enqueuing the request.
+ */
+void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct svc_serv *bc_serv = xprt->bc_serv;
+
+ req->rq_private_buf.len = copied;
+ set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+
+ dprintk("RPC: add callback request to list\n");
+ spin_lock(&bc_serv->sv_cb_lock);
+ list_del(&req->rq_bc_pa_list);
+ list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
+ wake_up(&bc_serv->sv_cb_waitq);
+ spin_unlock(&bc_serv->sv_cb_lock);
+}
+
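
xprt_lookup_bc_request() changes the backchannel model: instead of handing out an arbitrary preallocated slot, the pool is first scanned for a request already bound to this connection's cookie and the incoming xid, so a call that arrives in several TCP segments lands in the same rpc_rqst. A userspace sketch of that lookup-then-allocate pattern; struct bc_req and lookup_bc_request are illustrative names, not kernel API:

#include <stdio.h>
#include <stdint.h>

struct bc_req {
	uint32_t xid;
	uint32_t connect_cookie;
	struct bc_req *next;
};

static struct bc_req *lookup_bc_request(struct bc_req *pool,
					uint32_t cookie, uint32_t xid)
{
	struct bc_req *req;

	for (req = pool; req; req = req->next) {
		if (req->connect_cookie != cookie)
			continue;		/* stale: older connection */
		if (req->xid == xid)
			return req;		/* partially received call */
	}
	/* miss: in the kernel this falls back to claiming a free slot */
	return NULL;
}

int main(void)
{
	struct bc_req b = { .xid = 7, .connect_cookie = 2, .next = NULL };
	struct bc_req a = { .xid = 3, .connect_cookie = 1, .next = &b };
	struct bc_req *hit = lookup_bc_request(&a, 2, 7);

	printf("hit xid %u\n", (unsigned)(hit ? hit->xid : 0));
	return 0;
}
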
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 9afa4393c21..06636214113 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -50,12 +50,6 @@ static void cache_init(struct cache_head *h)
h->last_refresh = now;
}
-static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
-{
- return (h->expiry_time < seconds_since_boot()) ||
- (detail->flush_time > h->last_refresh);
-}
-
struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
struct cache_head *key, int hash)
{
@@ -196,12 +190,12 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_update);
static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h)
{
- if (!cd->cache_upcall)
- return -EINVAL;
- return cd->cache_upcall(cd, h);
+ if (cd->cache_upcall)
+ return cd->cache_upcall(cd, h);
+ return sunrpc_cache_pipe_upcall(cd, h);
}
-static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h)
+static inline int cache_is_valid(struct cache_head *h)
{
if (!test_bit(CACHE_VALID, &h->flags))
return -EAGAIN;
@@ -227,16 +221,15 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
int rv;
write_lock(&detail->hash_lock);
- rv = cache_is_valid(detail, h);
- if (rv != -EAGAIN) {
- write_unlock(&detail->hash_lock);
- return rv;
+ rv = cache_is_valid(h);
+ if (rv == -EAGAIN) {
+ set_bit(CACHE_NEGATIVE, &h->flags);
+ cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
+ rv = -ENOENT;
}
- set_bit(CACHE_NEGATIVE, &h->flags);
- cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
write_unlock(&detail->hash_lock);
cache_fresh_unlocked(h, detail);
- return -ENOENT;
+ return rv;
}
/*
@@ -260,7 +253,7 @@ int cache_check(struct cache_detail *detail,
long refresh_age, age;
/* First decide return status as best we can */
- rv = cache_is_valid(detail, h);
+ rv = cache_is_valid(h);
/* now see if we want to start an upcall */
refresh_age = (h->expiry_time - h->last_refresh);
@@ -269,19 +262,17 @@ int cache_check(struct cache_detail *detail,
if (rqstp == NULL) {
if (rv == -EAGAIN)
rv = -ENOENT;
- } else if (rv == -EAGAIN || age > refresh_age/2) {
+ } else if (rv == -EAGAIN ||
+ (h->expiry_time != 0 && age > refresh_age/2)) {
dprintk("RPC: Want update, refage=%ld, age=%ld\n",
refresh_age, age);
if (!test_and_set_bit(CACHE_PENDING, &h->flags)) {
switch (cache_make_upcall(detail, h)) {
case -EINVAL:
- clear_bit(CACHE_PENDING, &h->flags);
- cache_revisit_request(h);
rv = try_to_negate_entry(detail, h);
break;
case -EAGAIN:
- clear_bit(CACHE_PENDING, &h->flags);
- cache_revisit_request(h);
+ cache_fresh_unlocked(h, detail);
break;
}
}
@@ -293,7 +284,7 @@ int cache_check(struct cache_detail *detail,
* Request was not deferred; handle it as best
* we can ourselves:
*/
- rv = cache_is_valid(detail, h);
+ rv = cache_is_valid(h);
if (rv == -EAGAIN)
rv = -ETIMEDOUT;
}
@@ -310,7 +301,7 @@ EXPORT_SYMBOL_GPL(cache_check);
* a current pointer into that list and into the table
* for that entry.
*
- * Each time clean_cache is called it finds the next non-empty entry
+ * Each time cache_clean is called it finds the next non-empty entry
* in the current table and walks the list in that entry
* looking for entries that can be removed.
*
@@ -383,7 +374,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
}
return;
out:
- printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
+ printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
}
EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
@@ -457,9 +448,8 @@ static int cache_clean(void)
current_index ++;
spin_unlock(&cache_list_lock);
if (ch) {
- if (test_and_clear_bit(CACHE_PENDING, &ch->flags))
- cache_dequeue(current_detail, ch);
- cache_revisit_request(ch);
+ set_bit(CACHE_CLEANED, &ch->flags);
+ cache_fresh_unlocked(ch, d);
cache_put(ch, d);
}
} else
@@ -629,7 +619,7 @@ static void cache_limit_defers(void)
/* Consider removing either the first or the last */
if (cache_defer_cnt > DFR_MAX) {
- if (net_random() & 1)
+ if (prandom_u32() & 1)
discard = list_entry(cache_defer_list.next,
struct cache_deferred_req, recent);
else
@@ -670,13 +660,13 @@ static void cache_revisit_request(struct cache_head *item)
{
struct cache_deferred_req *dreq;
struct list_head pending;
- struct hlist_node *lp, *tmp;
+ struct hlist_node *tmp;
int hash = DFR_HASH(item);
INIT_LIST_HEAD(&pending);
spin_lock(&cache_defer_lock);
- hlist_for_each_entry_safe(dreq, lp, tmp, &cache_defer_hash[hash], hash)
+ hlist_for_each_entry_safe(dreq, tmp, &cache_defer_hash[hash], hash)
if (dreq->item == item) {
__unhash_deferred_req(dreq);
list_add(&dreq->recent, &pending);
@@ -750,12 +740,24 @@ struct cache_reader {
int offset; /* if non-0, we have a refcnt on next request */
};
+static int cache_request(struct cache_detail *detail,
+ struct cache_request *crq)
+{
+ char *bp = crq->buf;
+ int len = PAGE_SIZE;
+
+ detail->cache_request(detail, crq->item, &bp, &len);
+ if (len < 0)
+ return -EAGAIN;
+ return PAGE_SIZE - len;
+}
+
static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
loff_t *ppos, struct cache_detail *cd)
{
struct cache_reader *rp = filp->private_data;
struct cache_request *rq;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int err;
if (count == 0)
@@ -784,6 +786,13 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
rq->readers++;
spin_unlock(&queue_lock);
+ if (rq->len == 0) {
+ err = cache_request(cd, rq);
+ if (err < 0)
+ goto out;
+ rq->len = err;
+ }
+
if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) {
err = -EAGAIN;
spin_lock(&queue_lock);
@@ -886,7 +895,7 @@ static ssize_t cache_write(struct file *filp, const char __user *buf,
struct cache_detail *cd)
{
struct address_space *mapping = filp->f_mapping;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
ssize_t ret = -EINVAL;
if (!cd->cache_parse)
@@ -967,8 +976,10 @@ static int cache_open(struct inode *inode, struct file *filp,
nonseekable_open(inode, filp);
if (filp->f_mode & FMODE_READ) {
rp = kmalloc(sizeof(*rp), GFP_KERNEL);
- if (!rp)
+ if (!rp) {
+ module_put(cd->owner);
return -ENOMEM;
+ }
rp->offset = 0;
rp->q.reader = 1;
atomic_inc(&cd->readers);
@@ -1015,23 +1026,32 @@ static int cache_release(struct inode *inode, struct file *filp,
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch)
{
- struct cache_queue *cq;
+ struct cache_queue *cq, *tmp;
+ struct cache_request *cr;
+ struct list_head dequeued;
+
+ INIT_LIST_HEAD(&dequeued);
spin_lock(&queue_lock);
- list_for_each_entry(cq, &detail->queue, list)
+ list_for_each_entry_safe(cq, tmp, &detail->queue, list)
if (!cq->reader) {
- struct cache_request *cr = container_of(cq, struct cache_request, q);
+ cr = container_of(cq, struct cache_request, q);
if (cr->item != ch)
continue;
+ if (test_bit(CACHE_PENDING, &ch->flags))
+ /* Lost a race and it is pending again */
+ break;
if (cr->readers != 0)
continue;
- list_del(&cr->q.list);
- spin_unlock(&queue_lock);
- cache_put(cr->item, detail);
- kfree(cr->buf);
- kfree(cr);
- return;
+ list_move(&cr->q.list, &dequeued);
}
spin_unlock(&queue_lock);
+ while (!list_empty(&dequeued)) {
+ cr = list_entry(dequeued.next, struct cache_request, q.list);
+ list_del(&cr->q.list);
+ cache_put(cr->item, detail);
+ kfree(cr->buf);
+ kfree(cr);
+ }
}
/*
@@ -1091,9 +1111,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen)
*bp++ = 'x';
len -= 2;
while (blen && len >= 2) {
- unsigned char c = *buf++;
- *bp++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1);
- *bp++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1);
+ bp = hex_byte_pack(bp, *buf++);
len -= 2;
blen--;
}
@@ -1140,22 +1158,23 @@ static bool cache_listeners_exist(struct cache_detail *detail)
*
* Each request is at most one page long.
*/
-int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h,
- void (*cache_request)(struct cache_detail *,
- struct cache_head *,
- char **,
- int *))
+int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
{
char *buf;
struct cache_request *crq;
- char *bp;
- int len;
+ int ret = 0;
+
+ if (!detail->cache_request)
+ return -EINVAL;
if (!cache_listeners_exist(detail)) {
warn_no_listener(detail);
return -EINVAL;
}
+ if (test_bit(CACHE_CLEANED, &h->flags))
+ /* Too late to make an upcall */
+ return -EAGAIN;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
@@ -1167,25 +1186,24 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h,
return -EAGAIN;
}
- bp = buf; len = PAGE_SIZE;
-
- cache_request(detail, h, &bp, &len);
-
- if (len < 0) {
- kfree(buf);
- kfree(crq);
- return -EAGAIN;
- }
crq->q.reader = 0;
crq->item = cache_get(h);
crq->buf = buf;
- crq->len = PAGE_SIZE - len;
+ crq->len = 0;
crq->readers = 0;
spin_lock(&queue_lock);
- list_add_tail(&crq->q.list, &detail->queue);
+ if (test_bit(CACHE_PENDING, &h->flags))
+ list_add_tail(&crq->q.list, &detail->queue);
+ else
+ /* Lost a race, no longer PENDING, so don't enqueue */
+ ret = -EAGAIN;
spin_unlock(&queue_lock);
wake_up(&queue_wait);
- return 0;
+ if (ret == -EAGAIN) {
+ kfree(buf);
+ kfree(crq);
+ }
+ return ret;
}
EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall);
@@ -1201,7 +1219,6 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall);
* key and content are both parsed by cache
*/
-#define isodigit(c) (isdigit(c) && c <= '7')
int qword_get(char **bpp, char *dest, int bufsize)
{
/* return bytes copied, or -1 on error */
@@ -1454,7 +1471,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
static ssize_t cache_read_procfs(struct file *filp, char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data;
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
return cache_read(filp, buf, count, ppos, cd);
}
@@ -1462,14 +1479,14 @@ static ssize_t cache_read_procfs(struct file *filp, char __user *buf,
static ssize_t cache_write_procfs(struct file *filp, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data;
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
return cache_write(filp, buf, count, ppos, cd);
}
static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait)
{
- struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data;
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
return cache_poll(filp, wait, cd);
}
@@ -1477,22 +1494,22 @@ static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait)
static long cache_ioctl_procfs(struct file *filp,
unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
- struct cache_detail *cd = PDE(inode)->data;
+ struct inode *inode = file_inode(filp);
+ struct cache_detail *cd = PDE_DATA(inode);
return cache_ioctl(inode, filp, cmd, arg, cd);
}
static int cache_open_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE(inode)->data;
+ struct cache_detail *cd = PDE_DATA(inode);
return cache_open(inode, filp, cd);
}
static int cache_release_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE(inode)->data;
+ struct cache_detail *cd = PDE_DATA(inode);
return cache_release(inode, filp, cd);
}
@@ -1510,14 +1527,14 @@ static const struct file_operations cache_file_operations_procfs = {
static int content_open_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE(inode)->data;
+ struct cache_detail *cd = PDE_DATA(inode);
return content_open(inode, filp, cd);
}
static int content_release_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE(inode)->data;
+ struct cache_detail *cd = PDE_DATA(inode);
return content_release(inode, filp, cd);
}
@@ -1531,14 +1548,14 @@ static const struct file_operations content_file_operations_procfs = {
static int open_flush_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE(inode)->data;
+ struct cache_detail *cd = PDE_DATA(inode);
return open_flush(inode, filp, cd);
}
static int release_flush_procfs(struct inode *inode, struct file *filp)
{
- struct cache_detail *cd = PDE(inode)->data;
+ struct cache_detail *cd = PDE_DATA(inode);
return release_flush(inode, filp, cd);
}
@@ -1546,7 +1563,7 @@ static int release_flush_procfs(struct inode *inode, struct file *filp)
static ssize_t read_flush_procfs(struct file *filp, char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data;
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
return read_flush(filp, buf, count, ppos, cd);
}
@@ -1555,7 +1572,7 @@ static ssize_t write_flush_procfs(struct file *filp,
const char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = PDE(filp->f_path.dentry->d_inode)->data;
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
return write_flush(filp, buf, count, ppos, cd);
}
@@ -1605,7 +1622,7 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
if (p == NULL)
goto out_nomem;
- if (cd->cache_upcall || cd->cache_parse) {
+ if (cd->cache_request || cd->cache_parse) {
p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
cd->u.procfs.proc_ent,
&cache_file_operations_procfs, cd);
@@ -1614,7 +1631,7 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
goto out_nomem;
}
if (cd->cache_show) {
- p = proc_create_data("content", S_IFREG|S_IRUSR|S_IWUSR,
+ p = proc_create_data("content", S_IFREG|S_IRUSR,
cd->u.procfs.proc_ent,
&content_file_operations_procfs, cd);
cd->u.procfs.content_ent = p;
@@ -1686,7 +1703,7 @@ EXPORT_SYMBOL_GPL(cache_destroy_net);
static ssize_t cache_read_pipefs(struct file *filp, char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private;
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
return cache_read(filp, buf, count, ppos, cd);
}
@@ -1694,14 +1711,14 @@ static ssize_t cache_read_pipefs(struct file *filp, char __user *buf,
static ssize_t cache_write_pipefs(struct file *filp, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private;
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
return cache_write(filp, buf, count, ppos, cd);
}
static unsigned int cache_poll_pipefs(struct file *filp, poll_table *wait)
{
- struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private;
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
return cache_poll(filp, wait, cd);
}
@@ -1709,7 +1726,7 @@ static unsigned int cache_poll_pipefs(struct file *filp, poll_table *wait)
static long cache_ioctl_pipefs(struct file *filp,
unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct cache_detail *cd = RPC_I(inode)->private;
return cache_ioctl(inode, filp, cmd, arg, cd);
@@ -1778,7 +1795,7 @@ static int release_flush_pipefs(struct inode *inode, struct file *filp)
static ssize_t read_flush_pipefs(struct file *filp, char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private;
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
return read_flush(filp, buf, count, ppos, cd);
}
@@ -1787,7 +1804,7 @@ static ssize_t write_flush_pipefs(struct file *filp,
const char __user *buf,
size_t count, loff_t *ppos)
{
- struct cache_detail *cd = RPC_I(filp->f_path.dentry->d_inode)->private;
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
return write_flush(filp, buf, count, ppos, cd);
}
@@ -1804,19 +1821,11 @@ int sunrpc_cache_register_pipefs(struct dentry *parent,
const char *name, umode_t umode,
struct cache_detail *cd)
{
- struct qstr q;
- struct dentry *dir;
- int ret = 0;
-
- q.name = name;
- q.len = strlen(name);
- q.hash = full_name_hash(q.name, q.len);
- dir = rpc_create_cache_dir(parent, &q, umode, cd);
- if (!IS_ERR(dir))
- cd->u.pipefs.dir = dir;
- else
- ret = PTR_ERR(dir);
- return ret;
+ struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+ cd->u.pipefs.dir = dir;
+ return 0;
}
EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
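
A theme of the cache.c changes above is deferred formatting: sunrpc_cache_pipe_upcall() now queues the request with crq->len == 0, and cache_read() renders the payload via cache_request() on first read, so an upcall that is never consumed costs no encode work. A small sketch of that lazy render-on-read pattern; the struct and helper names are illustrative:

#include <stdio.h>

struct crq {
	char buf[64];
	int len;	/* 0 until a reader forces formatting */
};

static int render(struct crq *c, const char *item)
{
	int n = snprintf(c->buf, sizeof(c->buf), "upcall %s\n", item);

	return (n < 0 || n >= (int)sizeof(c->buf)) ? -1 : n;
}

static int cache_read(struct crq *c, const char *item)
{
	if (c->len == 0) {		/* format lazily, as cache_read() does */
		int n = render(c, item);

		if (n < 0)
			return -1;	/* -EAGAIN in the kernel */
		c->len = n;
	}
	fwrite(c->buf, 1, c->len, stdout);
	return c->len;
}

int main(void)
{
	struct crq c = { .len = 0 };

	return cache_read(&c, "nfsd.export") < 0;
}
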
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 507b5e84fbd..2e6ab10734f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -25,14 +25,15 @@
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/slab.h>
+#include <linux/rcupdate.h>
#include <linux/utsname.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/un.h>
-#include <linux/rcupdate.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/sunrpc/metrics.h>
#include <linux/sunrpc/bc_xprt.h>
@@ -101,12 +102,7 @@ static void rpc_unregister_client(struct rpc_clnt *clnt)
static void __rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
{
- if (clnt->cl_dentry) {
- if (clnt->cl_auth && clnt->cl_auth->au_ops->pipes_destroy)
- clnt->cl_auth->au_ops->pipes_destroy(clnt->cl_auth);
- rpc_remove_client_dir(clnt->cl_dentry);
- }
- clnt->cl_dentry = NULL;
+ rpc_remove_client_dir(clnt);
}
static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
@@ -122,14 +118,12 @@ static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
}
static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
- struct rpc_clnt *clnt,
- const char *dir_name)
+ struct rpc_clnt *clnt)
{
static uint32_t clntid;
+ const char *dir_name = clnt->cl_program->pipe_dir_name;
char name[15];
- struct qstr q = { .name = name };
struct dentry *dir, *dentry;
- int error;
dir = rpc_d_lookup_sb(sb, dir_name);
if (dir == NULL) {
@@ -137,50 +131,52 @@ static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
return dir;
}
for (;;) {
- q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
+ snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
name[sizeof(name) - 1] = '\0';
- q.hash = full_name_hash(q.name, q.len);
- dentry = rpc_create_client_dir(dir, &q, clnt);
+ dentry = rpc_create_client_dir(dir, name, clnt);
if (!IS_ERR(dentry))
break;
- error = PTR_ERR(dentry);
- if (error != -EEXIST) {
- printk(KERN_INFO "RPC: Couldn't create pipefs entry"
- " %s/%s, error %d\n",
- dir_name, name, error);
- break;
- }
+ if (dentry == ERR_PTR(-EEXIST))
+ continue;
+ printk(KERN_INFO "RPC: Couldn't create pipefs entry"
+ " %s/%s, error %ld\n",
+ dir_name, name, PTR_ERR(dentry));
+ break;
}
dput(dir);
return dentry;
}
static int
-rpc_setup_pipedir(struct rpc_clnt *clnt, const char *dir_name)
+rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt)
{
- struct net *net = rpc_net_ns(clnt);
- struct super_block *pipefs_sb;
struct dentry *dentry;
- clnt->cl_dentry = NULL;
- if (dir_name == NULL)
- return 0;
- pipefs_sb = rpc_get_sb_net(net);
- if (!pipefs_sb)
- return 0;
- dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt, dir_name);
- rpc_put_sb_net(net);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- clnt->cl_dentry = dentry;
+ if (clnt->cl_program->pipe_dir_name != NULL) {
+ dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
return 0;
}
-static inline int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event)
+static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event)
{
- if (((event == RPC_PIPEFS_MOUNT) && clnt->cl_dentry) ||
- ((event == RPC_PIPEFS_UMOUNT) && !clnt->cl_dentry))
+ if (clnt->cl_program->pipe_dir_name == NULL)
return 1;
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ if (clnt->cl_pipedir_objects.pdh_dentry != NULL)
+ return 1;
+ if (atomic_read(&clnt->cl_count) == 0)
+ return 1;
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (clnt->cl_pipedir_objects.pdh_dentry == NULL)
+ return 1;
+ break;
+ }
return 0;
}
@@ -192,18 +188,11 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
switch (event) {
case RPC_PIPEFS_MOUNT:
- dentry = rpc_setup_pipedir_sb(sb, clnt,
- clnt->cl_program->pipe_dir_name);
+ dentry = rpc_setup_pipedir_sb(sb, clnt);
if (!dentry)
return -ENOENT;
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- clnt->cl_dentry = dentry;
- if (clnt->cl_auth->au_ops->pipes_create) {
- err = clnt->cl_auth->au_ops->pipes_create(clnt->cl_auth);
- if (err)
- __rpc_clnt_remove_pipedir(clnt);
- }
break;
case RPC_PIPEFS_UMOUNT:
__rpc_clnt_remove_pipedir(clnt);
@@ -236,12 +225,8 @@ static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event)
spin_lock(&sn->rpc_client_lock);
list_for_each_entry(clnt, &sn->all_clients, cl_clients) {
- if (clnt->cl_program->pipe_dir_name == NULL)
- continue;
if (rpc_clnt_skip_event(clnt, event))
continue;
- if (atomic_inc_not_zero(&clnt->cl_count) == 0)
- continue;
spin_unlock(&sn->rpc_client_lock);
return clnt;
}
@@ -258,7 +243,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
while ((clnt = rpc_get_client_for_event(sb->s_fs_info, event))) {
error = __rpc_pipefs_event(clnt, event, sb);
- rpc_release_client(clnt);
if (error)
break;
}
@@ -280,6 +264,26 @@ void rpc_clients_notifier_unregister(void)
return rpc_pipefs_notifier_unregister(&rpc_clients_block);
}
+static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ const struct rpc_timeout *timeout)
+{
+ struct rpc_xprt *old;
+
+ spin_lock(&clnt->cl_lock);
+ old = rcu_dereference_protected(clnt->cl_xprt,
+ lockdep_is_held(&clnt->cl_lock));
+
+ if (!xprt_bound(xprt))
+ clnt->cl_autobind = 1;
+
+ clnt->cl_timeout = timeout;
+ rcu_assign_pointer(clnt->cl_xprt, xprt);
+ spin_unlock(&clnt->cl_lock);
+
+ return old;
+}
+
static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename)
{
clnt->cl_nodelen = strlen(nodename);
@@ -288,12 +292,74 @@ static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename)
memcpy(clnt->cl_nodename, nodename, clnt->cl_nodelen);
}
-static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt *xprt)
+static int rpc_client_register(struct rpc_clnt *clnt,
+ rpc_authflavor_t pseudoflavor,
+ const char *client_name)
+{
+ struct rpc_auth_create_args auth_args = {
+ .pseudoflavor = pseudoflavor,
+ .target_name = client_name,
+ };
+ struct rpc_auth *auth;
+ struct net *net = rpc_net_ns(clnt);
+ struct super_block *pipefs_sb;
+ int err;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ err = rpc_setup_pipedir(pipefs_sb, clnt);
+ if (err)
+ goto out;
+ }
+
+ rpc_register_client(clnt);
+ if (pipefs_sb)
+ rpc_put_sb_net(net);
+
+ auth = rpcauth_create(&auth_args, clnt);
+ if (IS_ERR(auth)) {
+ dprintk("RPC: Couldn't create auth handle (flavor %u)\n",
+ pseudoflavor);
+ err = PTR_ERR(auth);
+ goto err_auth;
+ }
+ return 0;
+err_auth:
+ pipefs_sb = rpc_get_sb_net(net);
+ rpc_unregister_client(clnt);
+ __rpc_clnt_remove_pipedir(clnt);
+out:
+ if (pipefs_sb)
+ rpc_put_sb_net(net);
+ return err;
+}
+
+static DEFINE_IDA(rpc_clids);
+
+static int rpc_alloc_clid(struct rpc_clnt *clnt)
+{
+ int clid;
+
+ clid = ida_simple_get(&rpc_clids, 0, 0, GFP_KERNEL);
+ if (clid < 0)
+ return clid;
+ clnt->cl_clid = clid;
+ return 0;
+}
+
+static void rpc_free_clid(struct rpc_clnt *clnt)
+{
+ ida_simple_remove(&rpc_clids, clnt->cl_clid);
+}
+
+static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
+ struct rpc_xprt *xprt,
+ struct rpc_clnt *parent)
{
const struct rpc_program *program = args->program;
const struct rpc_version *version;
- struct rpc_clnt *clnt = NULL;
- struct rpc_auth *auth;
+ struct rpc_clnt *clnt = NULL;
+ const struct rpc_timeout *timeout;
int err;
/* sanity check the name before trying to print it */
@@ -303,10 +369,8 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
err = rpciod_up();
if (err)
goto out_no_rpciod;
- err = -EINVAL;
- if (!xprt)
- goto out_no_xprt;
+ err = -EINVAL;
if (args->version >= program->nrvers)
goto out_err;
version = program->version[args->version];
@@ -317,16 +381,19 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
if (!clnt)
goto out_err;
- clnt->cl_parent = clnt;
+ clnt->cl_parent = parent ? : clnt;
+
+ err = rpc_alloc_clid(clnt);
+ if (err)
+ goto out_no_clid;
- rcu_assign_pointer(clnt->cl_xprt, xprt);
clnt->cl_procinfo = version->procs;
clnt->cl_maxproc = version->nrprocs;
- clnt->cl_protname = program->name;
clnt->cl_prog = args->prognumber ? : program->number;
clnt->cl_vers = version->number;
clnt->cl_stats = program->stats;
clnt->cl_metrics = rpc_alloc_iostats(clnt);
+ rpc_init_pipe_dir_head(&clnt->cl_pipedir_objects);
err = -ENOMEM;
if (clnt->cl_metrics == NULL)
goto out_no_stats;
@@ -334,60 +401,75 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
INIT_LIST_HEAD(&clnt->cl_tasks);
spin_lock_init(&clnt->cl_lock);
- if (!xprt_bound(xprt))
- clnt->cl_autobind = 1;
-
- clnt->cl_timeout = xprt->timeout;
+ timeout = xprt->timeout;
if (args->timeout != NULL) {
memcpy(&clnt->cl_timeout_default, args->timeout,
sizeof(clnt->cl_timeout_default));
- clnt->cl_timeout = &clnt->cl_timeout_default;
+ timeout = &clnt->cl_timeout_default;
}
+ rpc_clnt_set_transport(clnt, xprt, timeout);
+
clnt->cl_rtt = &clnt->cl_rtt_default;
rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval);
- clnt->cl_principal = NULL;
- if (args->client_name) {
- clnt->cl_principal = kstrdup(args->client_name, GFP_KERNEL);
- if (!clnt->cl_principal)
- goto out_no_principal;
- }
atomic_set(&clnt->cl_count, 1);
- err = rpc_setup_pipedir(clnt, program->pipe_dir_name);
- if (err < 0)
- goto out_no_path;
-
- auth = rpcauth_create(args->authflavor, clnt);
- if (IS_ERR(auth)) {
- printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
- args->authflavor);
- err = PTR_ERR(auth);
- goto out_no_auth;
- }
-
/* save the nodename */
rpc_clnt_set_nodename(clnt, utsname()->nodename);
- rpc_register_client(clnt);
+
+ err = rpc_client_register(clnt, args->authflavor, args->client_name);
+ if (err)
+ goto out_no_path;
+ if (parent)
+ atomic_inc(&parent->cl_count);
return clnt;
-out_no_auth:
- rpc_clnt_remove_pipedir(clnt);
out_no_path:
- kfree(clnt->cl_principal);
-out_no_principal:
rpc_free_iostats(clnt->cl_metrics);
out_no_stats:
+ rpc_free_clid(clnt);
+out_no_clid:
kfree(clnt);
out_err:
- xprt_put(xprt);
-out_no_xprt:
rpciod_down();
out_no_rpciod:
+ xprt_put(xprt);
return ERR_PTR(err);
}
+struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
+ struct rpc_xprt *xprt)
+{
+ struct rpc_clnt *clnt = NULL;
+
+ clnt = rpc_new_client(args, xprt, NULL);
+ if (IS_ERR(clnt))
+ return clnt;
+
+ if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
+ int err = rpc_ping(clnt);
+ if (err != 0) {
+ rpc_shutdown_client(clnt);
+ return ERR_PTR(err);
+ }
+ }
+
+ clnt->cl_softrtry = 1;
+ if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
+ clnt->cl_softrtry = 0;
+
+ if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
+ clnt->cl_autobind = 1;
+ if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
+ clnt->cl_discrtry = 1;
+ if (!(args->flags & RPC_CLNT_CREATE_QUIET))
+ clnt->cl_chatty = 1;
+
+ return clnt;
+}
+EXPORT_SYMBOL_GPL(rpc_create_xprt);
+
/**
* rpc_create - create an RPC client and transport with one call
* @args: rpc_clnt create argument structure
@@ -401,7 +483,6 @@ out_no_rpciod:
struct rpc_clnt *rpc_create(struct rpc_create_args *args)
{
struct rpc_xprt *xprt;
- struct rpc_clnt *clnt;
struct xprt_create xprtargs = {
.net = args->net,
.ident = args->protocol,
@@ -413,6 +494,10 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
};
char servername[48];
+ if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS)
+ xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS;
+ if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT)
+ xprtargs.flags |= XPRT_CREATE_NO_IDLE_TIMEOUT;
/*
* If the caller chooses not to specify a hostname, whip
* up a string representation of the passed-in address.
@@ -461,30 +546,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
xprt->resvport = 0;
- clnt = rpc_new_client(args, xprt);
- if (IS_ERR(clnt))
- return clnt;
-
- if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
- int err = rpc_ping(clnt);
- if (err != 0) {
- rpc_shutdown_client(clnt);
- return ERR_PTR(err);
- }
- }
-
- clnt->cl_softrtry = 1;
- if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
- clnt->cl_softrtry = 0;
-
- if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
- clnt->cl_autobind = 1;
- if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
- clnt->cl_discrtry = 1;
- if (!(args->flags & RPC_CLNT_CREATE_QUIET))
- clnt->cl_chatty = 1;
-
- return clnt;
+ return rpc_create_xprt(args, xprt);
}
EXPORT_SYMBOL_GPL(rpc_create);
@@ -508,15 +570,12 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
goto out_err;
args->servername = xprt->servername;
- new = rpc_new_client(args, xprt);
+ new = rpc_new_client(args, xprt, clnt);
if (IS_ERR(new)) {
err = PTR_ERR(new);
- goto out_put;
+ goto out_err;
}
- atomic_inc(&clnt->cl_count);
- new->cl_parent = clnt;
-
/* Turn off autobind on clones */
new->cl_autobind = 0;
new->cl_softrtry = clnt->cl_softrtry;
@@ -524,8 +583,6 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
new->cl_chatty = clnt->cl_chatty;
return new;
-out_put:
- xprt_put(xprt);
out_err:
dprintk("RPC: %s: returned error %d\n", __func__, err);
return ERR_PTR(err);
@@ -545,7 +602,6 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt)
.prognumber = clnt->cl_prog,
.version = clnt->cl_vers,
.authflavor = clnt->cl_auth->au_flavor,
- .client_name = clnt->cl_principal,
};
return __rpc_clone_client(&args, clnt);
}
@@ -567,12 +623,85 @@ rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
.prognumber = clnt->cl_prog,
.version = clnt->cl_vers,
.authflavor = flavor,
- .client_name = clnt->cl_principal,
};
return __rpc_clone_client(&args, clnt);
}
EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth);
+/**
+ * rpc_switch_client_transport: switch the RPC transport on the fly
+ * @clnt: pointer to a struct rpc_clnt
+ * @args: pointer to the new transport arguments
+ * @timeout: pointer to the new timeout parameters
+ *
+ * This function allows the caller to switch the RPC transport for the
+ * rpc_clnt structure 'clnt' to allow it to connect to a mirrored NFS
+ * server, for instance. It assumes that the caller has ensured that
+ * there are no active RPC tasks by using some form of locking.
+ *
+ * Returns zero if "clnt" is now using the new xprt. Otherwise a
+ * negative errno is returned, and "clnt" continues to use the old
+ * xprt.
+ */
+int rpc_switch_client_transport(struct rpc_clnt *clnt,
+ struct xprt_create *args,
+ const struct rpc_timeout *timeout)
+{
+ const struct rpc_timeout *old_timeo;
+ rpc_authflavor_t pseudoflavor;
+ struct rpc_xprt *xprt, *old;
+ struct rpc_clnt *parent;
+ int err;
+
+ xprt = xprt_create_transport(args);
+ if (IS_ERR(xprt)) {
+ dprintk("RPC: failed to create new xprt for clnt %p\n",
+ clnt);
+ return PTR_ERR(xprt);
+ }
+
+ pseudoflavor = clnt->cl_auth->au_flavor;
+
+ old_timeo = clnt->cl_timeout;
+ old = rpc_clnt_set_transport(clnt, xprt, timeout);
+
+ rpc_unregister_client(clnt);
+ __rpc_clnt_remove_pipedir(clnt);
+
+ /*
+ * A new transport was created. "clnt" therefore
+ * becomes the root of a new cl_parent tree. clnt's
+ * children, if it has any, still point to the old xprt.
+ */
+ parent = clnt->cl_parent;
+ clnt->cl_parent = clnt;
+
+ /*
+ * The old rpc_auth cache cannot be re-used. GSS
+ * contexts in particular are between a single
+ * client and server.
+ */
+ err = rpc_client_register(clnt, pseudoflavor, NULL);
+ if (err)
+ goto out_revert;
+
+ synchronize_rcu();
+ if (parent != clnt)
+ rpc_release_client(parent);
+ xprt_put(old);
+ dprintk("RPC: replaced xprt for clnt %p\n", clnt);
+ return 0;
+
+out_revert:
+ rpc_clnt_set_transport(clnt, old, old_timeo);
+ clnt->cl_parent = parent;
+ rpc_client_register(clnt, pseudoflavor, NULL);
+ xprt_put(xprt);
+ dprintk("RPC: failed to switch xprt for clnt %p\n", clnt);
+ return err;
+}
+EXPORT_SYMBOL_GPL(rpc_switch_client_transport);
+
/*
* Kill all tasks for the given client.
* XXX: kill their descendants as well?
@@ -613,7 +742,7 @@ void rpc_shutdown_client(struct rpc_clnt *clnt)
might_sleep();
dprintk_rcu("RPC: shutting down %s client for %s\n",
- clnt->cl_protname,
+ clnt->cl_program->name,
rcu_dereference(clnt->cl_xprt)->servername);
while (!list_empty(&clnt->cl_tasks)) {
@@ -629,34 +758,35 @@ EXPORT_SYMBOL_GPL(rpc_shutdown_client);
/*
* Free an RPC client
*/
-static void
+static struct rpc_clnt *
rpc_free_client(struct rpc_clnt *clnt)
{
+ struct rpc_clnt *parent = NULL;
+
dprintk_rcu("RPC: destroying %s client for %s\n",
- clnt->cl_protname,
+ clnt->cl_program->name,
rcu_dereference(clnt->cl_xprt)->servername);
if (clnt->cl_parent != clnt)
- rpc_release_client(clnt->cl_parent);
- rpc_unregister_client(clnt);
+ parent = clnt->cl_parent;
rpc_clnt_remove_pipedir(clnt);
+ rpc_unregister_client(clnt);
rpc_free_iostats(clnt->cl_metrics);
- kfree(clnt->cl_principal);
clnt->cl_metrics = NULL;
xprt_put(rcu_dereference_raw(clnt->cl_xprt));
rpciod_down();
+ rpc_free_clid(clnt);
kfree(clnt);
+ return parent;
}
/*
* Free an RPC client
*/
-static void
+static struct rpc_clnt *
rpc_free_auth(struct rpc_clnt *clnt)
{
- if (clnt->cl_auth == NULL) {
- rpc_free_client(clnt);
- return;
- }
+ if (clnt->cl_auth == NULL)
+ return rpc_free_client(clnt);
/*
* Note: RPCSEC_GSS may need to send NULL RPC calls in order to
@@ -667,7 +797,8 @@ rpc_free_auth(struct rpc_clnt *clnt)
rpcauth_release(clnt->cl_auth);
clnt->cl_auth = NULL;
if (atomic_dec_and_test(&clnt->cl_count))
- rpc_free_client(clnt);
+ return rpc_free_client(clnt);
+ return NULL;
}
/*
@@ -678,11 +809,15 @@ rpc_release_client(struct rpc_clnt *clnt)
{
dprintk("RPC: rpc_release_client(%p)\n", clnt);
- if (list_empty(&clnt->cl_tasks))
- wake_up(&destroy_wait);
- if (atomic_dec_and_test(&clnt->cl_count))
- rpc_free_auth(clnt);
+ do {
+ if (list_empty(&clnt->cl_tasks))
+ wake_up(&destroy_wait);
+ if (!atomic_dec_and_test(&clnt->cl_count))
+ break;
+ clnt = rpc_free_auth(clnt);
+ } while (clnt != NULL);
}
+EXPORT_SYMBOL_GPL(rpc_release_client);
/**
* rpc_bind_new_program - bind a new RPC program to an existing client
@@ -703,7 +838,6 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
.prognumber = program->number,
.version = vers,
.authflavor = old->cl_auth->au_flavor,
- .client_name = old->cl_principal,
};
struct rpc_clnt *clnt;
int err;
@@ -745,6 +879,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
atomic_inc(&clnt->cl_count);
if (clnt->cl_softrtry)
task->tk_flags |= RPC_TASK_SOFT;
+ if (clnt->cl_noretranstimeo)
+ task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
if (sk_memalloc_socks()) {
struct rpc_xprt *xprt;
@@ -1196,6 +1332,21 @@ size_t rpc_max_payload(struct rpc_clnt *clnt)
EXPORT_SYMBOL_GPL(rpc_max_payload);
/**
+ * rpc_get_timeout - Get timeout for transport in units of HZ
+ * @clnt: RPC client to query
+ */
+unsigned long rpc_get_timeout(struct rpc_clnt *clnt)
+{
+ unsigned long ret;
+
+ rcu_read_lock();
+ ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval;
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_get_timeout);
+
+/**
* rpc_force_rebind - force transport to check that remote port is unchanged
* @clnt: client to rebind
*
@@ -1220,6 +1371,7 @@ rpc_restart_call_prepare(struct rpc_task *task)
if (RPC_ASSASSINATED(task))
return 0;
task->tk_action = call_start;
+ task->tk_status = 0;
if (task->tk_ops->rpc_call_prepare != NULL)
task->tk_action = rpc_prepare_task;
return 1;
@@ -1236,6 +1388,7 @@ rpc_restart_call(struct rpc_task *task)
if (RPC_ASSASSINATED(task))
return 0;
task->tk_action = call_start;
+ task->tk_status = 0;
return 1;
}
EXPORT_SYMBOL_GPL(rpc_restart_call);
@@ -1267,7 +1420,7 @@ call_start(struct rpc_task *task)
struct rpc_clnt *clnt = task->tk_client;
dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid,
- clnt->cl_protname, clnt->cl_vers,
+ clnt->cl_program->name, clnt->cl_vers,
rpc_proc_name(task),
(RPC_IS_ASYNC(task) ? "async" : "sync"));
@@ -1290,6 +1443,8 @@ call_reserve(struct rpc_task *task)
xprt_reserve(task);
}
+static void call_retry_reserve(struct rpc_task *task);
+
/*
* 1b. Grok the result of xprt_reserve()
*/
@@ -1331,7 +1486,7 @@ call_reserveresult(struct rpc_task *task)
case -ENOMEM:
rpc_delay(task, HZ >> 2);
case -EAGAIN: /* woken up; retry */
- task->tk_action = call_reserve;
+ task->tk_action = call_retry_reserve;
return;
case -EIO: /* probably a shutdown */
break;
@@ -1344,6 +1499,19 @@ call_reserveresult(struct rpc_task *task)
}
/*
+ * 1c. Retry reserving an RPC call slot
+ */
+static void
+call_retry_reserve(struct rpc_task *task)
+{
+ dprint_status(task);
+
+ task->tk_status = 0;
+ task->tk_action = call_reserveresult;
+ xprt_retry_reserve(task);
+}
+
+/*
* 2. Bind and/or refresh the credentials
*/
static void
@@ -1371,14 +1539,18 @@ call_refreshresult(struct rpc_task *task)
task->tk_action = call_refresh;
switch (status) {
case 0:
- if (rpcauth_uptodatecred(task))
+ if (rpcauth_uptodatecred(task)) {
task->tk_action = call_allocate;
- return;
+ return;
+ }
+ /* Use rate-limiting and a max number of retries if refresh
+ * had status 0 but failed to update the cred.
+ */
case -ETIMEDOUT:
rpc_delay(task, 3*HZ);
- case -EKEYEXPIRED:
case -EAGAIN:
status = -EACCES;
+ case -EKEYEXPIRED:
if (!task->tk_cred_retry)
break;
task->tk_cred_retry--;
@@ -1400,7 +1572,7 @@ call_allocate(struct rpc_task *task)
{
unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack;
struct rpc_rqst *req = task->tk_rqstp;
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt = req->rq_xprt;
struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
dprint_status(task);
@@ -1508,7 +1680,7 @@ rpc_xdr_encode(struct rpc_task *task)
static void
call_bind(struct rpc_task *task)
{
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
dprint_status(task);
@@ -1566,11 +1738,10 @@ call_bind_status(struct rpc_task *task)
case -EPROTONOSUPPORT:
dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
task->tk_pid);
- task->tk_status = 0;
- task->tk_action = call_bind;
- return;
+ goto retry_timeout;
case -ECONNREFUSED: /* connection problems */
case -ECONNRESET:
+ case -ECONNABORTED:
case -ENOTCONN:
case -EHOSTDOWN:
case -EHOSTUNREACH:
@@ -1593,6 +1764,7 @@ call_bind_status(struct rpc_task *task)
return;
retry_timeout:
+ task->tk_status = 0;
task->tk_action = call_timeout;
}
@@ -1602,7 +1774,7 @@ retry_timeout:
static void
call_connect(struct rpc_task *task)
{
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
dprintk("RPC: %5u call_connect xprt %p %s connected\n",
task->tk_pid, xprt,
@@ -1613,6 +1785,10 @@ call_connect(struct rpc_task *task)
task->tk_action = call_connect_status;
if (task->tk_status < 0)
return;
+ if (task->tk_flags & RPC_TASK_NOCONNECT) {
+ rpc_exit(task, -ENOTCONN);
+ return;
+ }
xprt_connect(task);
}
}
@@ -1628,22 +1804,29 @@ call_connect_status(struct rpc_task *task)
dprint_status(task);
- task->tk_status = 0;
- if (status >= 0 || status == -EAGAIN) {
- clnt->cl_stats->netreconn++;
- task->tk_action = call_transmit;
- return;
- }
-
trace_rpc_connect_status(task, status);
+ task->tk_status = 0;
switch (status) {
- /* if soft mounted, test if we've timed out */
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ECONNABORTED:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ if (RPC_IS_SOFTCONN(task))
+ break;
+ /* retry with existing socket, after a delay */
+ rpc_delay(task, 3*HZ);
+ case -EAGAIN:
+ /* Check for timeouts before looping back to call_bind */
case -ETIMEDOUT:
task->tk_action = call_timeout;
- break;
- default:
- rpc_exit(task, -EIO);
+ return;
+ case 0:
+ clnt->cl_stats->netreconn++;
+ task->tk_action = call_transmit;
+ return;
}
+ rpc_exit(task, status);
}
/*
@@ -1652,13 +1835,14 @@ call_connect_status(struct rpc_task *task)
static void
call_transmit(struct rpc_task *task)
{
+ int is_retrans = RPC_WAS_SENT(task);
+
dprint_status(task);
task->tk_action = call_status;
if (task->tk_status < 0)
return;
- task->tk_status = xprt_prepare_transmit(task);
- if (task->tk_status != 0)
+ if (!xprt_prepare_transmit(task))
return;
task->tk_action = call_transmit_status;
/* Encode here so that rpcsec_gss can use correct sequence number. */
@@ -1677,6 +1861,8 @@ call_transmit(struct rpc_task *task)
xprt_transmit(task);
if (task->tk_status < 0)
return;
+ if (is_retrans)
+ task->tk_client->cl_stats->rpcretrans++;
/*
* On success, ensure that we call xprt_end_transmit() before sleeping
* in order to allow access to the socket to other RPC requests.
@@ -1685,7 +1871,7 @@ call_transmit(struct rpc_task *task)
if (rpc_reply_expected(task))
return;
task->tk_action = rpc_exit_task;
- rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
+ rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task);
}
/*
@@ -1730,6 +1916,7 @@ call_transmit_status(struct rpc_task *task)
break;
}
case -ECONNRESET:
+ case -ECONNABORTED:
case -ENOTCONN:
case -EPIPE:
rpc_task_force_reencode(task);
@@ -1746,8 +1933,7 @@ call_bc_transmit(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
- task->tk_status = xprt_prepare_transmit(task);
- if (task->tk_status == -EAGAIN) {
+ if (!xprt_prepare_transmit(task)) {
/*
* Could not reserve the transport. Try again after the
* transport is released.
@@ -1784,7 +1970,7 @@ call_bc_transmit(struct rpc_task *task)
*/
printk(KERN_NOTICE "RPC: Could not send backchannel reply "
"error: %d\n", task->tk_status);
- xprt_conditional_disconnect(task->tk_xprt,
+ xprt_conditional_disconnect(req->rq_xprt,
req->rq_connect_cookie);
break;
default:
@@ -1828,6 +2014,10 @@ call_status(struct rpc_task *task)
case -EHOSTDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
+ if (RPC_IS_SOFTCONN(task)) {
+ rpc_exit(task, status);
+ break;
+ }
/*
* Delay any retries for 3 seconds, then handle as if it
* were a timeout.
@@ -1835,12 +2025,14 @@ call_status(struct rpc_task *task)
rpc_delay(task, 3*HZ);
case -ETIMEDOUT:
task->tk_action = call_timeout;
- if (task->tk_client->cl_discrtry)
- xprt_conditional_disconnect(task->tk_xprt,
+ if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
+ && task->tk_client->cl_discrtry)
+ xprt_conditional_disconnect(req->rq_xprt,
req->rq_connect_cookie);
break;
- case -ECONNRESET:
case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ECONNABORTED:
rpc_force_rebind(clnt);
rpc_delay(task, 3*HZ);
case -EPIPE:
@@ -1857,7 +2049,7 @@ call_status(struct rpc_task *task)
default:
if (clnt->cl_chatty)
printk("%s: RPC call returned error %d\n",
- clnt->cl_protname, -status);
+ clnt->cl_program->name, -status);
rpc_exit(task, status);
}
}
@@ -1888,7 +2080,7 @@ call_timeout(struct rpc_task *task)
if (clnt->cl_chatty) {
rcu_read_lock();
printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
- clnt->cl_protname,
+ clnt->cl_program->name,
rcu_dereference(clnt->cl_xprt)->servername);
rcu_read_unlock();
}
@@ -1904,7 +2096,7 @@ call_timeout(struct rpc_task *task)
if (clnt->cl_chatty) {
rcu_read_lock();
printk(KERN_NOTICE "%s: server %s not responding, still trying\n",
- clnt->cl_protname,
+ clnt->cl_program->name,
rcu_dereference(clnt->cl_xprt)->servername);
rcu_read_unlock();
}
@@ -1917,7 +2109,6 @@ call_timeout(struct rpc_task *task)
rpcauth_invalcred(task);
retry:
- clnt->cl_stats->rpcretrans++;
task->tk_action = call_bind;
task->tk_status = 0;
}
@@ -1939,7 +2130,7 @@ call_decode(struct rpc_task *task)
if (clnt->cl_chatty) {
rcu_read_lock();
printk(KERN_NOTICE "%s: server %s OK\n",
- clnt->cl_protname,
+ clnt->cl_program->name,
rcu_dereference(clnt->cl_xprt)->servername);
rcu_read_unlock();
}
@@ -1960,11 +2151,10 @@ call_decode(struct rpc_task *task)
if (req->rq_rcv_buf.len < 12) {
if (!RPC_IS_SOFT(task)) {
task->tk_action = call_bind;
- clnt->cl_stats->rpcretrans++;
goto out_retry;
}
dprintk("RPC: %s: too small RPC reply size (%d bytes)\n",
- clnt->cl_protname, task->tk_status);
+ clnt->cl_program->name, task->tk_status);
task->tk_action = call_timeout;
goto out_retry;
}
@@ -1991,7 +2181,7 @@ out_retry:
if (task->tk_rqstp == req) {
req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0;
if (task->tk_client->cl_discrtry)
- xprt_conditional_disconnect(task->tk_xprt,
+ xprt_conditional_disconnect(req->rq_xprt,
req->rq_connect_cookie);
}
}
@@ -2005,7 +2195,7 @@ rpc_encode_header(struct rpc_task *task)
/* FIXME: check buffer size? */
- p = xprt_skip_transport_header(task->tk_xprt, p);
+ p = xprt_skip_transport_header(req->rq_xprt, p);
*p++ = req->rq_xid; /* XID */
*p++ = htonl(RPC_CALL); /* CALL */
*p++ = htonl(RPC_VERSION); /* RPC version */
@@ -2036,7 +2226,8 @@ rpc_verify_header(struct rpc_task *task)
dprintk("RPC: %5u %s: XDR representation not a multiple of"
" 4 bytes: 0x%x\n", task->tk_pid, __func__,
task->tk_rqstp->rq_rcv_buf.len);
- goto out_eio;
+ error = -EIO;
+ goto out_err;
}
if ((len -= 3) < 0)
goto out_overflow;
@@ -2045,6 +2236,7 @@ rpc_verify_header(struct rpc_task *task)
if ((n = ntohl(*p++)) != RPC_REPLY) {
dprintk("RPC: %5u %s: not an RPC reply: %x\n",
task->tk_pid, __func__, n);
+ error = -EIO;
goto out_garbage;
}
@@ -2063,7 +2255,8 @@ rpc_verify_header(struct rpc_task *task)
dprintk("RPC: %5u %s: RPC call rejected, "
"unknown error: %x\n",
task->tk_pid, __func__, n);
- goto out_eio;
+ error = -EIO;
+ goto out_err;
}
if (--len < 0)
goto out_overflow;
@@ -2108,9 +2301,11 @@ rpc_verify_header(struct rpc_task *task)
task->tk_pid, __func__, n);
goto out_err;
}
- if (!(p = rpcauth_checkverf(task, p))) {
- dprintk("RPC: %5u %s: auth check failed\n",
- task->tk_pid, __func__);
+ p = rpcauth_checkverf(task, p);
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
+ dprintk("RPC: %5u %s: auth check failed with %d\n",
+ task->tk_pid, __func__, error);
goto out_garbage; /* bad verifier, retry */
}
len = p - (__be32 *)iov->iov_base - 1;
@@ -2163,8 +2358,6 @@ out_garbage:
out_retry:
return ERR_PTR(-EAGAIN);
}
-out_eio:
- error = -EIO;
out_err:
rpc_exit(task, error);
dprintk("RPC: %5u %s: call failed with error %d\n", task->tk_pid,
@@ -2236,7 +2429,7 @@ static void rpc_show_task(const struct rpc_clnt *clnt,
printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%ps q:%s\n",
task->tk_pid, task->tk_flags, task->tk_status,
clnt, task->tk_rqstp, task->tk_timeout, task->tk_ops,
- clnt->cl_protname, clnt->cl_vers, rpc_proc_name(task),
+ clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task),
task->tk_action, rpc_waitq);
}
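
rpc_switch_client_transport() above follows an install-then-revert discipline: the new xprt and timeout are swapped in first, the client is re-registered, and on failure the old transport, parent pointer and registration are restored so callers observe no change. A compact sketch of that idiom; struct client, reregister and switch_transport are illustrative stand-ins:

#include <stdio.h>

struct client { int xprt; };

static int reregister(struct client *c)
{
	return c->xprt == 0 ? -1 : 0;	/* pretend 0 is a bad transport */
}

static int switch_transport(struct client *c, int new_xprt)
{
	int old = c->xprt;

	c->xprt = new_xprt;		/* install new state up front */
	if (reregister(c) != 0) {
		c->xprt = old;		/* revert: caller sees no change */
		return -1;
	}
	return 0;			/* old transport can now be released */
}

int main(void)
{
	struct client c = { .xprt = 1 };

	printf("switch to 2: %d (xprt=%d)\n", switch_transport(&c, 2), c.xprt);
	printf("switch to 0: %d (xprt=%d)\n", switch_transport(&c, 0), c.xprt);
	return 0;
}
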
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index ce7bd449173..df582687653 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -14,6 +14,7 @@ struct sunrpc_net {
struct cache_detail *rsi_cache;
struct super_block *pipefs_sb;
+ struct rpc_pipe *gssd_dummy;
struct mutex pipefs_sb_lock;
struct list_head all_clients;
@@ -23,6 +24,14 @@ struct sunrpc_net {
struct rpc_clnt *rpcb_local_clnt4;
spinlock_t rpcb_clnt_lock;
unsigned int rpcb_users;
+ unsigned int rpcb_is_af_local : 1;
+
+ struct mutex gssp_lock;
+ struct rpc_clnt *gssp_clnt;
+ int use_gss_proxy;
+ int pipe_version;
+ atomic_t pipe_users;
+ struct proc_dir_entry *use_gssp_proc;
};
extern int sunrpc_net_id;
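
The new struct sunrpc_net members keep the gss-proxy client, its toggle and the proc entry per network namespace, retrieved with net_generic(net, sunrpc_net_id), so containers can choose the upcall mechanism independently. A toy model of that per-namespace lookup, with net_generic reduced to a plain struct member for illustration:

#include <stdio.h>

struct sunrpc_net {
	int use_gss_proxy;	/* -1 = unset, 0 = legacy, 1 = gss-proxy */
	int pipe_version;
};

struct net { struct sunrpc_net rpc; };	/* net_generic() analogue */

static struct sunrpc_net *net_generic(struct net *net)
{
	return &net->rpc;
}

int main(void)
{
	struct net init_net  = { .rpc = { .use_gss_proxy = -1 } };
	struct net container = { .rpc = { .use_gss_proxy = 1 } };

	/* two namespaces, two independent toggles */
	printf("init: %d, container: %d\n",
	       net_generic(&init_net)->use_gss_proxy,
	       net_generic(&container)->use_gss_proxy);
	return 0;
}
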
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index fd10981ea79..b1855489856 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -17,6 +17,7 @@
#include <linux/fsnotify.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
+#include <linux/utsname.h>
#include <asm/ioctls.h>
#include <linux/poll.h>
@@ -38,7 +39,7 @@
#define NET_NAME(net) ((net == &init_net) ? " (init_net)" : "")
static struct file_system_type rpc_pipe_fs_type;
-
+static const struct rpc_pipe_ops gssd_dummy_pipe_ops;
static struct kmem_cache *rpc_inode_cachep __read_mostly;
@@ -284,7 +285,7 @@ out:
static ssize_t
rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct rpc_pipe *pipe;
struct rpc_pipe_msg *msg;
int res = 0;
@@ -328,7 +329,7 @@ out_unlock:
static ssize_t
rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int res;
mutex_lock(&inode->i_mutex);
@@ -342,7 +343,7 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of
static unsigned int
rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct rpc_inode *rpci = RPC_I(inode);
unsigned int mask = POLLOUT | POLLWRNORM;
@@ -360,7 +361,7 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
static long
rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct rpc_pipe *pipe;
int len;
@@ -406,7 +407,7 @@ rpc_show_info(struct seq_file *m, void *v)
rcu_read_lock();
seq_printf(m, "RPC server: %s\n",
rcu_dereference(clnt->cl_xprt)->servername);
- seq_printf(m, "service: %s (%d) version %d\n", clnt->cl_protname,
+ seq_printf(m, "service: %s (%d) version %d\n", clnt->cl_program->name,
clnt->cl_prog, clnt->cl_vers);
seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO));
@@ -468,15 +469,6 @@ struct rpc_filelist {
umode_t mode;
};
-static int rpc_delete_dentry(const struct dentry *dentry)
-{
- return 1;
-}
-
-static const struct dentry_operations rpc_dentry_operations = {
- .d_delete = rpc_delete_dentry,
-};
-
static struct inode *
rpc_get_inode(struct super_block *sb, umode_t mode)
{
@@ -516,8 +508,8 @@ static int __rpc_create_common(struct inode *dir, struct dentry *dentry,
d_add(dentry, inode);
return 0;
out_err:
- printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n",
- __FILE__, __func__, dentry->d_name.name);
+ printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %pd\n",
+ __FILE__, __func__, dentry);
dput(dentry);
return -ENOMEM;
}
@@ -653,20 +645,17 @@ static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
}
static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
- struct qstr *name)
+ const char *name)
{
- struct dentry *dentry;
-
- dentry = d_lookup(parent, name);
+ struct qstr q = QSTR_INIT(name, strlen(name));
+ struct dentry *dentry = d_hash_and_lookup(parent, &q);
if (!dentry) {
- dentry = d_alloc(parent, name);
+ dentry = d_alloc(parent, &q);
if (!dentry)
return ERR_PTR(-ENOMEM);
}
- if (dentry->d_inode == NULL) {
- d_set_d_op(dentry, &rpc_dentry_operations);
+ if (dentry->d_inode == NULL)
return dentry;
- }
dput(dentry);
return ERR_PTR(-EEXIST);
}
@@ -686,8 +675,7 @@ static void __rpc_depopulate(struct dentry *parent,
for (i = start; i < eof; i++) {
name.name = files[i].name;
name.len = strlen(files[i].name);
- name.hash = full_name_hash(name.name, name.len);
- dentry = d_lookup(parent, &name);
+ dentry = d_hash_and_lookup(parent, &name);
if (dentry == NULL)
continue;
@@ -729,12 +717,7 @@ static int rpc_populate(struct dentry *parent,
mutex_lock(&dir->i_mutex);
for (i = start; i < eof; i++) {
- struct qstr q;
-
- q.name = files[i].name;
- q.len = strlen(files[i].name);
- q.hash = full_name_hash(q.name, q.len);
- dentry = __rpc_lookup_create_exclusive(parent, &q);
+ dentry = __rpc_lookup_create_exclusive(parent, files[i].name);
err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_bad;
@@ -761,13 +744,13 @@ static int rpc_populate(struct dentry *parent,
out_bad:
__rpc_depopulate(parent, files, start, eof);
mutex_unlock(&dir->i_mutex);
- printk(KERN_WARNING "%s: %s failed to populate directory %s\n",
- __FILE__, __func__, parent->d_name.name);
+ printk(KERN_WARNING "%s: %s failed to populate directory %pd\n",
+ __FILE__, __func__, parent);
return err;
}
static struct dentry *rpc_mkdir_populate(struct dentry *parent,
- struct qstr *name, umode_t mode, void *private,
+ const char *name, umode_t mode, void *private,
int (*populate)(struct dentry *, void *), void *args_populate)
{
struct dentry *dentry;
@@ -830,7 +813,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
* responses to upcalls. They will result in calls to @msg->downcall.
*
* The @private argument passed here will be available to all these methods
- * from the file pointer, via RPC_I(file->f_dentry->d_inode)->private.
+ * from the file pointer, via RPC_I(file_inode(file))->private.
*/
struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
void *private, struct rpc_pipe *pipe)
@@ -838,7 +821,6 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
struct dentry *dentry;
struct inode *dir = parent->d_inode;
umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR;
- struct qstr q;
int err;
if (pipe->ops->upcall == NULL)
@@ -846,12 +828,8 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
if (pipe->ops->downcall == NULL)
umode &= ~S_IWUGO;
- q.name = name;
- q.len = strlen(name);
- q.hash = full_name_hash(q.name, q.len),
-
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
- dentry = __rpc_lookup_create_exclusive(parent, &q);
+ dentry = __rpc_lookup_create_exclusive(parent, name);
if (IS_ERR(dentry))
goto out;
err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops,
@@ -863,8 +841,8 @@ out:
return dentry;
out_err:
dentry = ERR_PTR(err);
- printk(KERN_WARNING "%s: %s() failed to create pipe %s/%s (errno = %d)\n",
- __FILE__, __func__, parent->d_name.name, name,
+ printk(KERN_WARNING "%s: %s() failed to create pipe %pd/%s (errno = %d)\n",
+ __FILE__, __func__, parent, name,
err);
goto out;
}
@@ -895,6 +873,159 @@ rpc_unlink(struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(rpc_unlink);
+/**
+ * rpc_init_pipe_dir_head - initialise a struct rpc_pipe_dir_head
+ * @pdh: pointer to struct rpc_pipe_dir_head
+ */
+void rpc_init_pipe_dir_head(struct rpc_pipe_dir_head *pdh)
+{
+ INIT_LIST_HEAD(&pdh->pdh_entries);
+ pdh->pdh_dentry = NULL;
+}
+EXPORT_SYMBOL_GPL(rpc_init_pipe_dir_head);
+
+/**
+ * rpc_init_pipe_dir_object - initialise a struct rpc_pipe_dir_object
+ * @pdo: pointer to struct rpc_pipe_dir_object
+ * @pdo_ops: pointer to const struct rpc_pipe_dir_object_ops
+ * @pdo_data: pointer to caller-defined data
+ */
+void rpc_init_pipe_dir_object(struct rpc_pipe_dir_object *pdo,
+ const struct rpc_pipe_dir_object_ops *pdo_ops,
+ void *pdo_data)
+{
+ INIT_LIST_HEAD(&pdo->pdo_head);
+ pdo->pdo_ops = pdo_ops;
+ pdo->pdo_data = pdo_data;
+}
+EXPORT_SYMBOL_GPL(rpc_init_pipe_dir_object);
+
+static int
+rpc_add_pipe_dir_object_locked(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ struct rpc_pipe_dir_object *pdo)
+{
+ int ret = 0;
+
+ if (pdh->pdh_dentry)
+ ret = pdo->pdo_ops->create(pdh->pdh_dentry, pdo);
+ if (ret == 0)
+ list_add_tail(&pdo->pdo_head, &pdh->pdh_entries);
+ return ret;
+}
+
+static void
+rpc_remove_pipe_dir_object_locked(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ struct rpc_pipe_dir_object *pdo)
+{
+ if (pdh->pdh_dentry)
+ pdo->pdo_ops->destroy(pdh->pdh_dentry, pdo);
+ list_del_init(&pdo->pdo_head);
+}
+
+/**
+ * rpc_add_pipe_dir_object - associate an rpc_pipe_dir_object with a directory
+ * @net: pointer to struct net
+ * @pdh: pointer to struct rpc_pipe_dir_head
+ * @pdo: pointer to struct rpc_pipe_dir_object
+ *
+ */
+int
+rpc_add_pipe_dir_object(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ struct rpc_pipe_dir_object *pdo)
+{
+ int ret = 0;
+
+ if (list_empty(&pdo->pdo_head)) {
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ mutex_lock(&sn->pipefs_sb_lock);
+ ret = rpc_add_pipe_dir_object_locked(net, pdh, pdo);
+ mutex_unlock(&sn->pipefs_sb_lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_add_pipe_dir_object);
+
+/**
+ * rpc_remove_pipe_dir_object - remove a rpc_pipe_dir_object from a directory
+ * @net: pointer to struct net
+ * @pdh: pointer to struct rpc_pipe_dir_head
+ * @pdo: pointer to struct rpc_pipe_dir_object
+ *
+ */
+void
+rpc_remove_pipe_dir_object(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ struct rpc_pipe_dir_object *pdo)
+{
+ if (!list_empty(&pdo->pdo_head)) {
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ mutex_lock(&sn->pipefs_sb_lock);
+ rpc_remove_pipe_dir_object_locked(net, pdh, pdo);
+ mutex_unlock(&sn->pipefs_sb_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(rpc_remove_pipe_dir_object);
+
+/**
+ * rpc_find_or_alloc_pipe_dir_object - find or allocate a pipe dir object
+ * @net: pointer to struct net
+ * @pdh: pointer to struct rpc_pipe_dir_head
+ * @match: match a struct rpc_pipe_dir_object against @data
+ * @alloc: allocate a new struct rpc_pipe_dir_object
+ * @data: user-defined data for match() and alloc()
+ *
+ */
+struct rpc_pipe_dir_object *
+rpc_find_or_alloc_pipe_dir_object(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ int (*match)(struct rpc_pipe_dir_object *, void *),
+ struct rpc_pipe_dir_object *(*alloc)(void *),
+ void *data)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_pipe_dir_object *pdo;
+
+ mutex_lock(&sn->pipefs_sb_lock);
+ list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head) {
+ if (!match(pdo, data))
+ continue;
+ goto out;
+ }
+ pdo = alloc(data);
+ if (!pdo)
+ goto out;
+ rpc_add_pipe_dir_object_locked(net, pdh, pdo);
+out:
+ mutex_unlock(&sn->pipefs_sb_lock);
+ return pdo;
+}
+EXPORT_SYMBOL_GPL(rpc_find_or_alloc_pipe_dir_object);
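	/*
	 * Illustrative caller sketch for the pipe-dir-object API above (not
	 * part of this patch; my_pdo, my_match, my_alloc and my_pdo_ops are
	 * invented names, and my_pdo_ops is assumed to be a filled-in
	 * rpc_pipe_dir_object_ops with .create/.destroy callbacks). Both
	 * match() and alloc() run under pipefs_sb_lock, a mutex, so alloc()
	 * may sleep but must not call back into the add/remove helpers.
	 */
	struct my_pdo {
		struct rpc_pipe_dir_object pdo;
		int key;
	};

	static int my_match(struct rpc_pipe_dir_object *pdo, void *data)
	{
		struct my_pdo *p = container_of(pdo, struct my_pdo, pdo);

		return p->key == *(int *)data;
	}

	static struct rpc_pipe_dir_object *my_alloc(void *data)
	{
		struct my_pdo *p = kzalloc(sizeof(*p), GFP_KERNEL);

		if (!p)
			return NULL;
		p->key = *(int *)data;
		rpc_init_pipe_dir_object(&p->pdo, &my_pdo_ops, p);
		return &p->pdo;
	}

	/* Then, e.g.:
	 *	pdo = rpc_find_or_alloc_pipe_dir_object(net,
	 *			&clnt->cl_pipedir_objects,
	 *			my_match, my_alloc, &key);
	 */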
+
+static void
+rpc_create_pipe_dir_objects(struct rpc_pipe_dir_head *pdh)
+{
+ struct rpc_pipe_dir_object *pdo;
+ struct dentry *dir = pdh->pdh_dentry;
+
+ list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head)
+ pdo->pdo_ops->create(dir, pdo);
+}
+
+static void
+rpc_destroy_pipe_dir_objects(struct rpc_pipe_dir_head *pdh)
+{
+ struct rpc_pipe_dir_object *pdo;
+ struct dentry *dir = pdh->pdh_dentry;
+
+ list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head)
+ pdo->pdo_ops->destroy(dir, pdo);
+}
+
enum {
RPCAUTH_info,
RPCAUTH_EOF
@@ -922,8 +1053,8 @@ static void rpc_clntdir_depopulate(struct dentry *dentry)
/**
* rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs
- * @dentry: dentry from the rpc_pipefs root to the new directory
- * @name: &struct qstr for the name
+ * @dentry: the parent of the new directory
+ * @name: the name of the new directory
* @rpc_client: rpc client to associate with this directory
*
* This creates a directory at the given @path associated with
@@ -932,19 +1063,32 @@ static void rpc_clntdir_depopulate(struct dentry *dentry)
* later be created using rpc_mkpipe().
*/
struct dentry *rpc_create_client_dir(struct dentry *dentry,
- struct qstr *name,
+ const char *name,
struct rpc_clnt *rpc_client)
{
- return rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL,
+ struct dentry *ret;
+
+ ret = rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL,
rpc_clntdir_populate, rpc_client);
+ if (!IS_ERR(ret)) {
+ rpc_client->cl_pipedir_objects.pdh_dentry = ret;
+ rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
+ }
+ return ret;
}
/**
* rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir()
- * @dentry: dentry for the pipe
+ * @rpc_client: rpc_client for the pipe
*/
-int rpc_remove_client_dir(struct dentry *dentry)
+int rpc_remove_client_dir(struct rpc_clnt *rpc_client)
{
+ struct dentry *dentry = rpc_client->cl_pipedir_objects.pdh_dentry;
+
+ if (dentry == NULL)
+ return 0;
+ rpc_destroy_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
+ rpc_client->cl_pipedir_objects.pdh_dentry = NULL;
return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate);
}
@@ -978,7 +1122,7 @@ static void rpc_cachedir_depopulate(struct dentry *dentry)
rpc_depopulate(dentry, cache_pipefs_files, 0, 3);
}
-struct dentry *rpc_create_cache_dir(struct dentry *parent, struct qstr *name,
+struct dentry *rpc_create_cache_dir(struct dentry *parent, const char *name,
umode_t umode, struct cache_detail *cd)
{
return rpc_mkdir_populate(parent, name, umode, NULL,
@@ -1013,6 +1157,7 @@ enum {
RPCAUTH_nfsd4_cb,
RPCAUTH_cache,
RPCAUTH_nfsd,
+ RPCAUTH_gssd,
RPCAUTH_RootEOF
};
@@ -1049,6 +1194,10 @@ static const struct rpc_filelist files[] = {
.name = "nfsd",
.mode = S_IFDIR | S_IRUGO | S_IXUGO,
},
+ [RPCAUTH_gssd] = {
+ .name = "gssd",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
};
/*
@@ -1058,17 +1207,28 @@ struct dentry *rpc_d_lookup_sb(const struct super_block *sb,
const unsigned char *dir_name)
{
struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name));
-
- dir.hash = full_name_hash(dir.name, dir.len);
- return d_lookup(sb->s_root, &dir);
+ return d_hash_and_lookup(sb->s_root, &dir);
}
EXPORT_SYMBOL_GPL(rpc_d_lookup_sb);
-void rpc_pipefs_init_net(struct net *net)
+int rpc_pipefs_init_net(struct net *net)
{
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ sn->gssd_dummy = rpc_mkpipe_data(&gssd_dummy_pipe_ops, 0);
+ if (IS_ERR(sn->gssd_dummy))
+ return PTR_ERR(sn->gssd_dummy);
+
mutex_init(&sn->pipefs_sb_lock);
+ sn->pipe_version = -1;
+ return 0;
+}
+
+void rpc_pipefs_exit_net(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ rpc_destroy_pipe_data(sn->gssd_dummy);
}
/*
@@ -1098,11 +1258,134 @@ void rpc_put_sb_net(const struct net *net)
}
EXPORT_SYMBOL_GPL(rpc_put_sb_net);
+static const struct rpc_filelist gssd_dummy_clnt_dir[] = {
+ [0] = {
+ .name = "clntXX",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+};
+
+static ssize_t
+dummy_downcall(struct file *filp, const char __user *src, size_t len)
+{
+ return -EINVAL;
+}
+
+static const struct rpc_pipe_ops gssd_dummy_pipe_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = dummy_downcall,
+};
+
+/*
+ * Here we present a bogus "info" file to keep rpc.gssd happy. We don't expect
+ * that it will ever use this info to handle an upcall, but rpc.gssd expects
+ * that this file will be there and have a certain format.
+ */
+static int
+rpc_show_dummy_info(struct seq_file *m, void *v)
+{
+ seq_printf(m, "RPC server: %s\n", utsname()->nodename);
+ seq_printf(m, "service: foo (1) version 0\n");
+ seq_printf(m, "address: 127.0.0.1\n");
+ seq_printf(m, "protocol: tcp\n");
+ seq_printf(m, "port: 0\n");
+ return 0;
+}
+
+static int
+rpc_dummy_info_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rpc_show_dummy_info, NULL);
+}
+
+static const struct file_operations rpc_dummy_info_operations = {
+ .owner = THIS_MODULE,
+ .open = rpc_dummy_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct rpc_filelist gssd_dummy_info_file[] = {
+ [0] = {
+ .name = "info",
+ .i_fop = &rpc_dummy_info_operations,
+ .mode = S_IFREG | S_IRUSR,
+ },
+};
+
+/**
+ * rpc_gssd_dummy_populate - create a dummy gssd pipe
+ * @root: root of the rpc_pipefs filesystem
+ * @pipe_data: pipe data created when netns is initialized
+ *
+ * Create a dummy set of directories and a pipe that gssd can hold open to
+ * indicate that it is up and running.
+ */
+static struct dentry *
+rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
+{
+ int ret = 0;
+ struct dentry *gssd_dentry;
+ struct dentry *clnt_dentry = NULL;
+ struct dentry *pipe_dentry = NULL;
+ struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name,
+ strlen(files[RPCAUTH_gssd].name));
+
+ /* We should never get this far if "gssd" doesn't exist */
+ gssd_dentry = d_hash_and_lookup(root, &q);
+ if (!gssd_dentry)
+ return ERR_PTR(-ENOENT);
+
+ ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL);
+ if (ret) {
+ pipe_dentry = ERR_PTR(ret);
+ goto out;
+ }
+
+ q.name = gssd_dummy_clnt_dir[0].name;
+ q.len = strlen(gssd_dummy_clnt_dir[0].name);
+ clnt_dentry = d_hash_and_lookup(gssd_dentry, &q);
+ if (!clnt_dentry) {
+ pipe_dentry = ERR_PTR(-ENOENT);
+ goto out;
+ }
+
+ ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL);
+ if (ret) {
+ __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
+ pipe_dentry = ERR_PTR(ret);
+ goto out;
+ }
+
+ pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data);
+ if (IS_ERR(pipe_dentry)) {
+ __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1);
+ __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
+ }
+out:
+ dput(clnt_dentry);
+ dput(gssd_dentry);
+ return pipe_dentry;
+}
+
+static void
+rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
+{
+ struct dentry *clnt_dir = pipe_dentry->d_parent;
+ struct dentry *gssd_dir = clnt_dir->d_parent;
+
+ __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry);
+ __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
+ __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
+ dput(pipe_dentry);
+}
+
static int
rpc_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
- struct dentry *root;
+ struct dentry *root, *gssd_dentry;
struct net *net = data;
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int err;
@@ -1111,6 +1394,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = RPCAUTH_GSSMAGIC;
sb->s_op = &s_ops;
+ sb->s_d_op = &simple_dentry_operations;
sb->s_time_gran = 1;
inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
@@ -1119,8 +1403,16 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL))
return -ENOMEM;
+
+ gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy);
+ if (IS_ERR(gssd_dentry)) {
+ __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF);
+ return PTR_ERR(gssd_dentry);
+ }
+
dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n",
net, NET_NAME(net));
+ mutex_lock(&sn->pipefs_sb_lock);
sn->pipefs_sb = sb;
err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
RPC_PIPEFS_MOUNT,
@@ -1128,17 +1420,30 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_depopulate;
sb->s_fs_info = get_net(net);
+ mutex_unlock(&sn->pipefs_sb_lock);
return 0;
err_depopulate:
+ rpc_gssd_dummy_depopulate(gssd_dentry);
blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
RPC_PIPEFS_UMOUNT,
sb);
sn->pipefs_sb = NULL;
__rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF);
+ mutex_unlock(&sn->pipefs_sb_lock);
return err;
}
+bool
+gssd_running(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_pipe *pipe = sn->gssd_dummy;
+
+ return pipe->nreaders || pipe->nwriters;
+}
+EXPORT_SYMBOL_GPL(gssd_running);
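	/*
	 * Usage sketch (illustrative, not part of this patch): since the
	 * dummy pipe's reader/writer counts reflect whether rpc.gssd holds
	 * it open, callers can fail fast or log a hint instead of waiting
	 * out an upcall timeout.
	 */
	if (!gssd_running(net))
		printk(KERN_WARNING "RPC: gssd appears not to be running\n");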
+
static struct dentry *
rpc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
@@ -1157,12 +1462,12 @@ static void rpc_kill_sb(struct super_block *sb)
goto out;
}
sn->pipefs_sb = NULL;
- mutex_unlock(&sn->pipefs_sb_lock);
dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n",
net, NET_NAME(net));
blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
RPC_PIPEFS_UMOUNT,
sb);
+ mutex_unlock(&sn->pipefs_sb_lock);
put_net(net);
out:
kill_litter_super(sb);
@@ -1174,6 +1479,8 @@ static struct file_system_type rpc_pipe_fs_type = {
.mount = rpc_mount,
.kill_sb = rpc_kill_sb,
};
+MODULE_ALIAS_FS("rpc_pipefs");
+MODULE_ALIAS("rpc_pipefs");
static void
init_once(void *foo)
@@ -1218,6 +1525,3 @@ void unregister_rpc_pipefs(void)
kmem_cache_destroy(rpc_inode_cachep);
unregister_filesystem(&rpc_pipe_fs_type);
}
-
-/* Make 'mount -t rpc_pipefs ...' autoload this module. */
-MODULE_ALIAS("rpc_pipefs");
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 795a0f4e920..1891a1022c1 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -26,6 +26,7 @@
#include <net/ipv6.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/xprtsock.h>
@@ -203,13 +204,15 @@ void rpcb_put_local(struct net *net)
}
static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt,
- struct rpc_clnt *clnt4)
+ struct rpc_clnt *clnt4,
+ bool is_af_local)
{
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
/* Protected by rpcb_create_local_mutex */
sn->rpcb_local_clnt = clnt;
sn->rpcb_local_clnt4 = clnt4;
+ sn->rpcb_is_af_local = is_af_local ? 1 : 0;
smp_wmb();
sn->rpcb_users = 1;
dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: "
@@ -237,6 +240,14 @@ static int rpcb_create_local_unix(struct net *net)
.program = &rpcb_program,
.version = RPCBVERS_2,
.authflavor = RPC_AUTH_NULL,
+ /*
+ * We turn off the idle timeout to prevent the kernel
+ * from automatically disconnecting the socket.
+ * Otherwise, we'd have to cache the mount namespace
+ * of the caller and somehow pass that to the socket
+ * reconnect code.
+ */
+ .flags = RPC_CLNT_CREATE_NO_IDLE_TIMEOUT,
};
struct rpc_clnt *clnt, *clnt4;
int result = 0;
@@ -262,7 +273,7 @@ static int rpcb_create_local_unix(struct net *net)
clnt4 = NULL;
}
- rpcb_set_local(net, clnt, clnt4);
+ rpcb_set_local(net, clnt, clnt4, true);
out:
return result;
@@ -314,7 +325,7 @@ static int rpcb_create_local_net(struct net *net)
clnt4 = NULL;
}
- rpcb_set_local(net, clnt, clnt4);
+ rpcb_set_local(net, clnt, clnt4, false);
out:
return result;
@@ -375,13 +386,16 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *hostname,
return rpc_create(&args);
}
-static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg)
+static int rpcb_register_call(struct sunrpc_net *sn, struct rpc_clnt *clnt, struct rpc_message *msg, bool is_set)
{
- int result, error = 0;
+ int flags = RPC_TASK_NOCONNECT;
+ int error, result = 0;
+ if (is_set || !sn->rpcb_is_af_local)
+ flags = RPC_TASK_SOFTCONN;
msg->rpc_resp = &result;
- error = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);
+ error = rpc_call_sync(clnt, msg, flags);
if (error < 0) {
dprintk("RPC: failed to contact local rpcbind "
"server (errno %d).\n", -error);
@@ -438,16 +452,19 @@ int rpcb_register(struct net *net, u32 prog, u32 vers, int prot, unsigned short
.rpc_argp = &map,
};
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ bool is_set = false;
dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
"rpcbind\n", (port ? "" : "un"),
prog, vers, prot, port);
msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET];
- if (port)
+ if (port != 0) {
msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET];
+ is_set = true;
+ }
- return rpcb_register_call(sn->rpcb_local_clnt, &msg);
+ return rpcb_register_call(sn, sn->rpcb_local_clnt, &msg, is_set);
}
/*
@@ -460,6 +477,7 @@ static int rpcb_register_inet4(struct sunrpc_net *sn,
const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
struct rpcbind_args *map = msg->rpc_argp;
unsigned short port = ntohs(sin->sin_port);
+ bool is_set = false;
int result;
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
@@ -470,10 +488,12 @@ static int rpcb_register_inet4(struct sunrpc_net *sn,
map->r_addr, map->r_netid);
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
- if (port)
+ if (port != 0) {
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
+ is_set = true;
+ }
- result = rpcb_register_call(sn->rpcb_local_clnt4, msg);
+ result = rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, is_set);
kfree(map->r_addr);
return result;
}
@@ -488,6 +508,7 @@ static int rpcb_register_inet6(struct sunrpc_net *sn,
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;
struct rpcbind_args *map = msg->rpc_argp;
unsigned short port = ntohs(sin6->sin6_port);
+ bool is_set = false;
int result;
map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
@@ -498,10 +519,12 @@ static int rpcb_register_inet6(struct sunrpc_net *sn,
map->r_addr, map->r_netid);
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
- if (port)
+ if (port != 0) {
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
+ is_set = true;
+ }
- result = rpcb_register_call(sn->rpcb_local_clnt4, msg);
+ result = rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, is_set);
kfree(map->r_addr);
return result;
}
@@ -518,7 +541,7 @@ static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn,
map->r_addr = "";
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
- return rpcb_register_call(sn->rpcb_local_clnt4, msg);
+ return rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, false);
}
/**
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index bfa31714581..c0365c14b85 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -98,9 +98,25 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
}
+static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue)
+{
+ struct list_head *q = &queue->tasks[queue->priority];
+ struct rpc_task *task;
+
+ if (!list_empty(q)) {
+ task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
+ if (task->tk_owner == queue->owner)
+ list_move_tail(&task->u.tk_wait.list, q);
+ }
+}
+
static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
{
- queue->priority = priority;
+ if (queue->priority != priority) {
+ /* Fairness: rotate the list when changing priority */
+ rpc_rotate_queue_owner(queue);
+ queue->priority = priority;
+ }
}
static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid)
@@ -164,6 +180,8 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
task->tk_waitqueue = queue;
queue->qlen++;
+ /* barrier matches the read in rpc_wake_up_task_queue_locked() */
+ smp_wmb();
rpc_set_queued(task);
dprintk("RPC: %5u added to queue %p \"%s\"\n",
@@ -236,11 +254,11 @@ static int rpc_wait_bit_killable(void *word)
{
if (fatal_signal_pending(current))
return -ERESTARTSYS;
- freezable_schedule();
+ freezable_schedule_unsafe();
return 0;
}
-#ifdef RPC_DEBUG
+#if defined(RPC_DEBUG) || defined(RPC_TRACEPOINTS)
static void rpc_task_set_debuginfo(struct rpc_task *task)
{
static atomic_t rpc_pid;
@@ -306,11 +324,17 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
* Note: If the task is ASYNC, and is being made runnable after sitting on an
* rpc_wait_queue, this must be called with the queue spinlock held to protect
* the wait queue operation.
+ * Note the ordering of rpc_test_and_set_running() and rpc_clear_queued(),
+ * which is needed to ensure that __rpc_execute() doesn't loop (due to the
+ * lockless RPC_IS_QUEUED() test) before we've had a chance to test
+ * the RPC_TASK_RUNNING flag.
*/
static void rpc_make_runnable(struct rpc_task *task)
{
+ bool need_wakeup = !rpc_test_and_set_running(task);
+
rpc_clear_queued(task);
- if (rpc_test_and_set_running(task))
+ if (!need_wakeup)
return;
if (RPC_IS_ASYNC(task)) {
INIT_WORK(&task->u.tk_work, rpc_async_schedule);
@@ -414,23 +438,12 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
*/
static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
{
- if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue)
- __rpc_do_wake_up_task(queue, task);
-}
-
-/*
- * Tests whether rpc queue is empty
- */
-int rpc_queue_empty(struct rpc_wait_queue *queue)
-{
- int res;
-
- spin_lock_bh(&queue->lock);
- res = queue->qlen;
- spin_unlock_bh(&queue->lock);
- return res == 0;
+ if (RPC_IS_QUEUED(task)) {
+ smp_rmb();
+ if (task->tk_waitqueue == queue)
+ __rpc_do_wake_up_task(queue, task);
+ }
}
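	/*
	 * Ordering note (editorial sketch): the smp_rmb() above pairs with
	 * the smp_wmb() added in __rpc_add_wait_queue():
	 *
	 *   enqueue: task->tk_waitqueue = queue; smp_wmb(); rpc_set_queued(task);
	 *   wake-up: if (RPC_IS_QUEUED(task)) { smp_rmb(); read tk_waitqueue; }
	 *
	 * Without the pair, a lockless waker could observe the QUEUED bit
	 * while still reading a stale tk_waitqueue pointer and so skip a
	 * wake-up it owes.
	 */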
-EXPORT_SYMBOL_GPL(rpc_queue_empty);
/*
* Wake up a task on a specific queue
@@ -624,7 +637,8 @@ static void __rpc_queue_timer_fn(unsigned long ptr)
static void __rpc_atrun(struct rpc_task *task)
{
- task->tk_status = 0;
+ if (task->tk_status == -ETIMEDOUT)
+ task->tk_status = 0;
}
/*
@@ -777,7 +791,6 @@ static void __rpc_execute(struct rpc_task *task)
task->tk_flags |= RPC_TASK_KILLED;
rpc_exit(task, -ERESTARTSYS);
}
- rpc_set_running(task);
dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
}
@@ -798,9 +811,11 @@ static void __rpc_execute(struct rpc_task *task)
*/
void rpc_execute(struct rpc_task *task)
{
+ bool is_async = RPC_IS_ASYNC(task);
+
rpc_set_active(task);
rpc_make_runnable(task);
- if (!RPC_IS_ASYNC(task))
+ if (!is_async)
__rpc_execute(task);
}
@@ -817,7 +832,8 @@ static void rpc_async_schedule(struct work_struct *work)
* @size: requested byte size
*
* To prevent rpciod from hanging, this allocator never sleeps,
- * returning NULL if the request cannot be serviced immediately.
+ * returning NULL and suppressing the allocation-failure warning if the
+ * request cannot be serviced immediately.
* The caller can arrange to sleep in a way that is safe for rpciod.
*
* Most requests are 'small' (under 2KiB) and can be serviced from a
@@ -830,7 +846,7 @@ static void rpc_async_schedule(struct work_struct *work)
void *rpc_malloc(struct rpc_task *task, size_t size)
{
struct rpc_buffer *buf;
- gfp_t gfp = GFP_NOWAIT;
+ gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
if (RPC_IS_SWAPPER(task))
gfp |= __GFP_MEMALLOC;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 0a648c502fc..2df87f78e51 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -173,7 +173,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
return -1;
if (csum_fold(desc.csum))
return -1;
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
return 0;
no_checksum:
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index bc2068ee795..54530490944 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -64,7 +64,7 @@ static int rpc_proc_show(struct seq_file *seq, void *v) {
static int rpc_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, rpc_proc_show, PDE(inode)->data);
+ return single_open(file, rpc_proc_show, PDE_DATA(inode));
}
static const struct file_operations rpc_proc_fops = {
@@ -188,7 +188,7 @@ void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS);
seq_printf(seq, "p/v: %u/%u (%s)\n",
- clnt->cl_prog, clnt->cl_vers, clnt->cl_protname);
+ clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name);
rcu_read_lock();
xprt = rcu_dereference(clnt->cl_xprt);
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 14c9f6d1c5f..f2b7cb540e6 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -43,6 +43,19 @@ static inline int rpc_reply_expected(struct rpc_task *task)
(task->tk_msg.rpc_proc->p_decode != NULL);
}
+static inline int sock_is_loopback(struct sock *sk)
+{
+ struct dst_entry *dst;
+ int loopback = 0;
+ rcu_read_lock();
+ dst = rcu_dereference(sk->sk_dst_cache);
+ if (dst && dst->dev &&
+ (dst->dev->features & NETIF_F_LOOPBACK))
+ loopback = 1;
+ rcu_read_unlock();
+ return loopback;
+}
+
int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
struct page *headpage, unsigned long headoffset,
struct page *tailpage, unsigned long tailoffset);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 3d6498af9ad..cd30120de9e 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -44,12 +44,17 @@ static __net_init int sunrpc_init_net(struct net *net)
if (err)
goto err_unixgid;
- rpc_pipefs_init_net(net);
+ err = rpc_pipefs_init_net(net);
+ if (err)
+ goto err_pipefs;
+
INIT_LIST_HEAD(&sn->all_clients);
spin_lock_init(&sn->rpc_client_lock);
spin_lock_init(&sn->rpcb_clnt_lock);
return 0;
+err_pipefs:
+ unix_gid_cache_destroy(net);
err_unixgid:
ip_map_cache_destroy(net);
err_ipmap:
@@ -60,6 +65,7 @@ err_proc:
static __net_exit void sunrpc_exit_net(struct net *net)
{
+ rpc_pipefs_exit_net(net);
unix_gid_cache_destroy(net);
ip_map_cache_destroy(net);
rpc_proc_exit(net);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index dbf12ac5ecb..5de6801cd92 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -515,15 +515,6 @@ EXPORT_SYMBOL_GPL(svc_create_pooled);
void svc_shutdown_net(struct svc_serv *serv, struct net *net)
{
- /*
- * The set of xprts (contained in the sv_tempsocks and
- * sv_permsocks lists) is now constant, since it is modified
- * only by accepting new sockets (done by service threads in
- * svc_recv) or aging old ones (done by sv_temptimer), or
- * configuration changes (excluded by whatever locking the
- * caller is using--nfsd_mutex in the case of nfsd). So it's
- * safe to traverse those lists and shut everything down:
- */
svc_close_net(serv, net);
if (serv->sv_shutdown)
@@ -749,7 +740,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
__module_get(serv->sv_module);
task = kthread_create_on_node(serv->sv_function, rqstp,
- node, serv->sv_name);
+ node, "%s", serv->sv_name);
if (IS_ERR(task)) {
error = PTR_ERR(task);
module_put(serv->sv_module);
@@ -925,9 +916,6 @@ static int __svc_register(struct net *net, const char *progname,
#endif
}
- if (error < 0)
- printk(KERN_WARNING "svc: failed to register %sv%u RPC "
- "service (errno %d).\n", progname, version, -error);
return error;
}
@@ -946,6 +934,7 @@ int svc_register(const struct svc_serv *serv, struct net *net,
const unsigned short port)
{
struct svc_program *progp;
+ struct svc_version *vers;
unsigned int i;
int error = 0;
@@ -955,7 +944,8 @@ int svc_register(const struct svc_serv *serv, struct net *net,
for (progp = serv->sv_program; progp; progp = progp->pg_next) {
for (i = 0; i < progp->pg_nvers; i++) {
- if (progp->pg_vers[i] == NULL)
+ vers = progp->pg_vers[i];
+ if (vers == NULL)
continue;
dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
@@ -964,16 +954,26 @@ int svc_register(const struct svc_serv *serv, struct net *net,
proto == IPPROTO_UDP? "udp" : "tcp",
port,
family,
- progp->pg_vers[i]->vs_hidden?
- " (but not telling portmap)" : "");
+ vers->vs_hidden ?
+ " (but not telling portmap)" : "");
- if (progp->pg_vers[i]->vs_hidden)
+ if (vers->vs_hidden)
continue;
error = __svc_register(net, progp->pg_name, progp->pg_prog,
i, family, proto, port);
- if (error < 0)
+
+ if (vers->vs_rpcb_optnl) {
+ error = 0;
+ continue;
+ }
+
+ if (error < 0) {
+ printk(KERN_WARNING "svc: failed to register "
+ "%sv%u RPC service (errno %d).\n",
+ progp->pg_name, i, -error);
break;
+ }
}
}
@@ -1042,6 +1042,7 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net)
/*
* dprintk the given error with the address of the client that caused it.
*/
+#ifdef RPC_DEBUG
static __printf(2, 3)
void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
{
@@ -1058,6 +1059,9 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
va_end(args);
}
+#else
+static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {}
+#endif
/*
* Common routine for processing the RPC request.
@@ -1109,8 +1113,6 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
rqstp->rq_vers = vers = svc_getnl(argv); /* version number */
rqstp->rq_proc = proc = svc_getnl(argv); /* procedure number */
- progp = serv->sv_program;
-
for (progp = serv->sv_program; progp; progp = progp->pg_next)
if (prog == progp->pg_prog)
break;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b8e47fac731..b4737fbdec1 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -499,7 +499,8 @@ void svc_wake_up(struct svc_serv *serv)
rqstp->rq_xprt = NULL;
*/
wake_up(&rqstp->rq_wait);
- }
+ } else
+ pool->sp_task_pending = 1;
spin_unlock_bh(&pool->sp_lock);
}
}
@@ -570,7 +571,7 @@ static void svc_check_conn_limits(struct svc_serv *serv)
}
}
-int svc_alloc_arg(struct svc_rqst *rqstp)
+static int svc_alloc_arg(struct svc_rqst *rqstp)
{
struct svc_serv *serv = rqstp->rq_server;
struct xdr_buf *arg;
@@ -596,6 +597,7 @@ int svc_alloc_arg(struct svc_rqst *rqstp)
}
rqstp->rq_pages[i] = p;
}
+ rqstp->rq_page_end = &rqstp->rq_pages[i];
rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
/* Make arg->head point to first page and arg->pages point to rest */
@@ -611,7 +613,7 @@ int svc_alloc_arg(struct svc_rqst *rqstp)
return 0;
}
-struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
+static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
{
struct svc_xprt *xprt;
struct svc_pool *pool = rqstp->rq_pool;
@@ -634,7 +636,13 @@ struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
* long for cache updates.
*/
rqstp->rq_chandle.thread_wait = 1*HZ;
+ pool->sp_task_pending = 0;
} else {
+ if (pool->sp_task_pending) {
+ pool->sp_task_pending = 0;
+ spin_unlock_bh(&pool->sp_lock);
+ return ERR_PTR(-EAGAIN);
+ }
/* No data pending. Go to sleep */
svc_thread_enqueue(pool, rqstp);
@@ -684,7 +692,7 @@ struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
return xprt;
}
-void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
+static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
{
spin_lock_bh(&serv->sv_lock);
set_bit(XPT_TEMP, &newxpt->xpt_flags);
@@ -723,6 +731,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
newxpt = xprt->xpt_ops->xpo_accept(xprt);
if (newxpt)
svc_add_new_temp_xprt(serv, newxpt);
+ else
+ module_put(xprt->xpt_class->xcl_owner);
} else if (xprt->xpt_ops->xpo_has_wspace(xprt)) {
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
@@ -786,7 +796,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
clear_bit(XPT_OLD, &xprt->xpt_flags);
- rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
+ rqstp->rq_secure = xprt->xpt_ops->xpo_secure_port(rqstp);
rqstp->rq_chandle.defer = svc_defer;
if (serv->sv_stats)
@@ -856,7 +866,6 @@ static void svc_age_temp_xprts(unsigned long closure)
struct svc_serv *serv = (struct svc_serv *)closure;
struct svc_xprt *xprt;
struct list_head *le, *next;
- LIST_HEAD(to_be_aged);
dprintk("svc_age_temp_xprts\n");
@@ -877,25 +886,15 @@ static void svc_age_temp_xprts(unsigned long closure)
if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
test_bit(XPT_BUSY, &xprt->xpt_flags))
continue;
- svc_xprt_get(xprt);
- list_move(le, &to_be_aged);
+ list_del_init(le);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
set_bit(XPT_DETACHED, &xprt->xpt_flags);
- }
- spin_unlock_bh(&serv->sv_lock);
-
- while (!list_empty(&to_be_aged)) {
- le = to_be_aged.next;
- /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */
- list_del_init(le);
- xprt = list_entry(le, struct svc_xprt, xpt_list);
-
dprintk("queuing xprt %p for closing\n", xprt);
/* a thread will dequeue and close it soon */
svc_xprt_enqueue(xprt);
- svc_xprt_put(xprt);
}
+ spin_unlock_bh(&serv->sv_lock);
mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
}
@@ -959,21 +958,24 @@ void svc_close_xprt(struct svc_xprt *xprt)
}
EXPORT_SYMBOL_GPL(svc_close_xprt);
-static void svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net)
+static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net)
{
struct svc_xprt *xprt;
+ int ret = 0;
spin_lock(&serv->sv_lock);
list_for_each_entry(xprt, xprt_list, xpt_list) {
if (xprt->xpt_net != net)
continue;
+ ret++;
set_bit(XPT_CLOSE, &xprt->xpt_flags);
- set_bit(XPT_BUSY, &xprt->xpt_flags);
+ svc_xprt_enqueue(xprt);
}
spin_unlock(&serv->sv_lock);
+ return ret;
}
-static void svc_clear_pools(struct svc_serv *serv, struct net *net)
+static struct svc_xprt *svc_dequeue_net(struct svc_serv *serv, struct net *net)
{
struct svc_pool *pool;
struct svc_xprt *xprt;
@@ -988,42 +990,46 @@ static void svc_clear_pools(struct svc_serv *serv, struct net *net)
if (xprt->xpt_net != net)
continue;
list_del_init(&xprt->xpt_ready);
+ spin_unlock_bh(&pool->sp_lock);
+ return xprt;
}
spin_unlock_bh(&pool->sp_lock);
}
+ return NULL;
}
-static void svc_clear_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net)
+static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
{
struct svc_xprt *xprt;
- struct svc_xprt *tmp;
- LIST_HEAD(victims);
-
- spin_lock(&serv->sv_lock);
- list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
- if (xprt->xpt_net != net)
- continue;
- list_move(&xprt->xpt_list, &victims);
- }
- spin_unlock(&serv->sv_lock);
- list_for_each_entry_safe(xprt, tmp, &victims, xpt_list)
+ while ((xprt = svc_dequeue_net(serv, net))) {
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
svc_delete_xprt(xprt);
+ }
}
+/*
+ * Server threads may still be running (especially in the case where the
+ * service is still running in other network namespaces).
+ *
+ * So we shut down sockets the same way we would on a running server, by
+ * setting XPT_CLOSE, enqueuing, and letting a thread pick it up to do
+ * the close. In the case there are no such other threads,
+ * svc_clean_up_xprts() does a simple version of a
+ * server's main event loop, and in the case where there are other
+ * threads, we may need to wait a little while and then check again to
+ * see if they're done.
+ */
void svc_close_net(struct svc_serv *serv, struct net *net)
{
- svc_close_list(serv, &serv->sv_tempsocks, net);
- svc_close_list(serv, &serv->sv_permsocks, net);
+ int delay = 0;
- svc_clear_pools(serv, net);
- /*
- * At this point the sp_sockets lists will stay empty, since
- * svc_xprt_enqueue will not add new entries without taking the
- * sp_lock and checking XPT_BUSY.
- */
- svc_clear_list(serv, &serv->sv_tempsocks, net);
- svc_clear_list(serv, &serv->sv_permsocks, net);
+ while (svc_close_list(serv, &serv->sv_permsocks, net) +
+ svc_close_list(serv, &serv->sv_tempsocks, net)) {
+
+ svc_clean_up_xprts(serv, net);
+ msleep(delay++);
+ }
}
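	/*
	 * Design note (editorial): the first pass through the loop calls
	 * msleep(0), which costs roughly one jiffy, and the delay then
	 * grows linearly, so shutdown stays cheap when no other threads
	 * hold the transports but backs off politely when they do.
	 */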
/*
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 7963569fc04..79c0f3459b5 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -54,6 +54,8 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
}
spin_unlock(&authtab_lock);
+ rqstp->rq_auth_slack = 0;
+
rqstp->rq_authop = aops;
return aops->accept(rqstp, authp);
}
@@ -138,13 +140,12 @@ auth_domain_lookup(char *name, struct auth_domain *new)
{
struct auth_domain *hp;
struct hlist_head *head;
- struct hlist_node *np;
head = &auth_domain_table[hash_str(name, DN_HASHBITS)];
spin_lock(&auth_domain_lock);
- hlist_for_each_entry(hp, np, head, hash) {
+ hlist_for_each_entry(hp, head, hash) {
if (strcmp(hp->name, name)==0) {
kref_get(&hp->ref);
spin_unlock(&auth_domain_lock);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 4d012920373..621ca7b4a15 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -6,6 +6,7 @@
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/svcauth.h>
#include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/addr.h>
#include <linux/err.h>
#include <linux/seq_file.h>
#include <linux/hash.h>
@@ -17,7 +18,6 @@
#include <linux/user_namespace.h>
#define RPCDBG_FACILITY RPCDBG_AUTH
-#include <linux/sunrpc/clnt.h>
#include "netns.h"
@@ -157,11 +157,6 @@ static void ip_map_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
-static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- return sunrpc_cache_pipe_upcall(cd, h, ip_map_request);
-}
-
static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr);
static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
@@ -352,13 +347,13 @@ ip_map_cached_get(struct svc_xprt *xprt)
spin_lock(&xprt->xpt_lock);
ipm = xprt->xpt_auth_cache;
if (ipm != NULL) {
- if (!cache_valid(&ipm->h)) {
+ sn = net_generic(xprt->xpt_net, sunrpc_net_id);
+ if (cache_is_expired(sn->ip_map_cache, &ipm->h)) {
/*
* The entry has been invalidated since it was
* remembered, e.g. by a second mount from the
* same IP address.
*/
- sn = net_generic(xprt->xpt_net, sunrpc_net_id);
xprt->xpt_auth_cache = NULL;
spin_unlock(&xprt->xpt_lock);
cache_put(&ipm->h, sn->ip_map_cache);
@@ -415,10 +410,15 @@ svcauth_unix_info_release(struct svc_xprt *xpt)
struct unix_gid {
struct cache_head h;
- uid_t uid;
+ kuid_t uid;
struct group_info *gi;
};
+static int unix_gid_hash(kuid_t uid)
+{
+ return hash_long(from_kuid(&init_user_ns, uid), GID_HASHBITS);
+}
+
static void unix_gid_put(struct kref *kref)
{
struct cache_head *item = container_of(kref, struct cache_head, ref);
@@ -433,7 +433,7 @@ static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew)
{
struct unix_gid *orig = container_of(corig, struct unix_gid, h);
struct unix_gid *new = container_of(cnew, struct unix_gid, h);
- return orig->uid == new->uid;
+ return uid_eq(orig->uid, new->uid);
}
static void unix_gid_init(struct cache_head *cnew, struct cache_head *citem)
{
@@ -465,23 +465,19 @@ static void unix_gid_request(struct cache_detail *cd,
char tuid[20];
struct unix_gid *ug = container_of(h, struct unix_gid, h);
- snprintf(tuid, 20, "%u", ug->uid);
+ snprintf(tuid, 20, "%u", from_kuid(&init_user_ns, ug->uid));
qword_add(bpp, blen, tuid);
(*bpp)[-1] = '\n';
}
-static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- return sunrpc_cache_pipe_upcall(cd, h, unix_gid_request);
-}
-
-static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, uid_t uid);
+static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid);
static int unix_gid_parse(struct cache_detail *cd,
char *mesg, int mlen)
{
/* uid expiry Ngid gid0 gid1 ... gidN-1 */
- int uid;
+ int id;
+ kuid_t uid;
int gids;
int rv;
int i;
@@ -493,9 +489,10 @@ static int unix_gid_parse(struct cache_detail *cd,
return -EINVAL;
mesg[mlen-1] = 0;
- rv = get_int(&mesg, &uid);
+ rv = get_int(&mesg, &id);
if (rv)
return -EINVAL;
+ uid = make_kuid(&init_user_ns, id);
ug.uid = uid;
expiry = get_expiry(&mesg);
@@ -530,7 +527,7 @@ static int unix_gid_parse(struct cache_detail *cd,
ug.h.expiry_time = expiry;
ch = sunrpc_cache_update(cd,
&ug.h, &ugp->h,
- hash_long(uid, GID_HASHBITS));
+ unix_gid_hash(uid));
if (!ch)
err = -ENOMEM;
else {
@@ -549,7 +546,7 @@ static int unix_gid_show(struct seq_file *m,
struct cache_detail *cd,
struct cache_head *h)
{
- struct user_namespace *user_ns = current_user_ns();
+ struct user_namespace *user_ns = &init_user_ns;
struct unix_gid *ug;
int i;
int glen;
@@ -565,7 +562,7 @@ static int unix_gid_show(struct seq_file *m,
else
glen = 0;
- seq_printf(m, "%u %d:", ug->uid, glen);
+ seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen);
for (i = 0; i < glen; i++)
seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i)));
seq_printf(m, "\n");
@@ -577,7 +574,7 @@ static struct cache_detail unix_gid_cache_template = {
.hash_size = GID_HASHMAX,
.name = "auth.unix.gid",
.cache_put = unix_gid_put,
- .cache_upcall = unix_gid_upcall,
+ .cache_request = unix_gid_request,
.cache_parse = unix_gid_parse,
.cache_show = unix_gid_show,
.match = unix_gid_match,
@@ -615,20 +612,20 @@ void unix_gid_cache_destroy(struct net *net)
cache_destroy_net(cd, net);
}
-static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, uid_t uid)
+static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid)
{
struct unix_gid ug;
struct cache_head *ch;
ug.uid = uid;
- ch = sunrpc_cache_lookup(cd, &ug.h, hash_long(uid, GID_HASHBITS));
+ ch = sunrpc_cache_lookup(cd, &ug.h, unix_gid_hash(uid));
if (ch)
return container_of(ch, struct unix_gid, h);
else
return NULL;
}
-static struct group_info *unix_gid_find(uid_t uid, struct svc_rqst *rqstp)
+static struct group_info *unix_gid_find(kuid_t uid, struct svc_rqst *rqstp)
{
struct unix_gid *ug;
struct group_info *gi;
@@ -750,8 +747,8 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
}
/* Signal that mapping to nobody uid/gid is required */
- cred->cr_uid = (uid_t) -1;
- cred->cr_gid = (gid_t) -1;
+ cred->cr_uid = INVALID_UID;
+ cred->cr_gid = INVALID_GID;
cred->cr_group_info = groups_alloc(0);
if (cred->cr_group_info == NULL)
return SVC_CLOSE; /* kmalloc failure - client must retry */
@@ -811,9 +808,15 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
goto badcred;
argv->iov_base = (void*)((__be32*)argv->iov_base + slen); /* skip machname */
argv->iov_len -= slen*4;
-
- cred->cr_uid = svc_getnl(argv); /* uid */
- cred->cr_gid = svc_getnl(argv); /* gid */
+ /*
+ * Note: we skip uid_valid()/gid_valid() checks here for
+ * backwards compatibility with clients that use -1 IDs.
+ * Instead, -1 uid or gid is later mapped to the
+ * (export-specific) anonymous id by nfsd_setuser.
+ * Supplementary GIDs will be left alone.
+ */
+ cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */
+ cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */
slen = svc_getnl(argv); /* gids length */
if (slen > 16 || (len -= (slen + 2)*4) < 0)
goto badcred;
@@ -822,8 +825,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
return SVC_CLOSE;
for (i = 0; i < slen; i++) {
kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
- if (!gid_valid(kgid))
- goto badcred;
GROUP_AT(cred->cr_group_info, i) = kgid;
}
if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
@@ -874,7 +875,7 @@ static struct cache_detail ip_map_cache_template = {
.hash_size = IP_HASHMAX,
.name = "auth.unix.ip",
.cache_put = ip_map_put,
- .cache_upcall = ip_map_upcall,
+ .cache_request = ip_map_request,
.cache_parse = ip_map_parse,
.cache_show = ip_map_show,
.match = ip_map_match,
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0a148c9d2a5..b507cd327d9 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -60,7 +60,7 @@
static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
int flags);
-static void svc_udp_data_ready(struct sock *, int);
+static void svc_udp_data_ready(struct sock *);
static int svc_udp_recvfrom(struct svc_rqst *);
static int svc_udp_sendto(struct svc_rqst *);
static void svc_sock_detach(struct svc_xprt *);
@@ -291,12 +291,14 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
&inet_sk(sk)->inet_rcv_saddr,
inet_sk(sk)->inet_num);
break;
+#if IS_ENABLED(CONFIG_IPV6)
case PF_INET6:
len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
proto_name,
- &inet6_sk(sk)->rcv_saddr,
+ &sk->sk_v6_rcv_saddr,
inet_sk(sk)->inet_num);
break;
+#endif
default:
len = snprintf(buf, remaining, "*unknown-%d*\n",
sk->sk_family);
@@ -398,17 +400,23 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
release_sock(sock->sk);
#endif
}
+
+static int svc_sock_secure_port(struct svc_rqst *rqstp)
+{
+ return svc_port_is_privileged(svc_addr(rqstp));
+}
+
/*
* INET callback when data has been received on the socket.
*/
-static void svc_udp_data_ready(struct sock *sk, int count)
+static void svc_udp_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
- dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
- svsk, sk, count,
+ dprintk("svc: socket %p(inet %p), busy=%d\n",
+ svsk, sk,
test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
@@ -442,7 +450,7 @@ static void svc_tcp_write_space(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock)
+ if (sk_stream_is_writeable(sk) && sock)
clear_bit(SOCK_NOSPACE, &sock->flags);
svc_write_space(sk);
}
@@ -465,7 +473,7 @@ static int svc_udp_get_dest_address4(struct svc_rqst *rqstp,
}
/*
- * See net/ipv6/datagram.c : datagram_recv_ctl
+ * See net/ipv6/datagram.c : ip6_datagram_recv_ctl
*/
static int svc_udp_get_dest_address6(struct svc_rqst *rqstp,
struct cmsghdr *cmh)
@@ -676,6 +684,7 @@ static struct svc_xprt_ops svc_udp_ops = {
.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
.xpo_has_wspace = svc_udp_has_wspace,
.xpo_accept = svc_udp_accept,
+ .xpo_secure_port = svc_sock_secure_port,
};
static struct svc_xprt_class svc_udp_class = {
@@ -729,7 +738,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
* A data_ready event on a listening socket means there's a connection
* pending. Do not use state_change as a substitute for it.
*/
-static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
+static void svc_tcp_listen_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
wait_queue_head_t *wq;
@@ -781,7 +790,7 @@ static void svc_tcp_state_change(struct sock *sk)
wake_up_interruptible_all(wq);
}
-static void svc_tcp_data_ready(struct sock *sk, int count)
+static void svc_tcp_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
wait_queue_head_t *wq = sk_sleep(sk);
@@ -840,8 +849,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
* tell us anything. For now just warn about unpriv connections.
*/
if (!svc_port_is_privileged(sin)) {
- dprintk(KERN_WARNING
- "%s: connect from unprivileged port: %s\n",
+ dprintk("%s: connect from unprivileged port: %s\n",
serv->sv_name,
__svc_print_addr(sin, buf, sizeof(buf)));
}
@@ -865,6 +873,10 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
}
svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
+ if (sock_is_loopback(newsock->sk))
+ set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
+ else
+ clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
if (serv->sv_stats)
serv->sv_stats->nettcpconn++;
@@ -917,7 +929,10 @@ static void svc_tcp_clear_pages(struct svc_sock *svsk)
len = svsk->sk_datalen;
npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < npages; i++) {
- BUG_ON(svsk->sk_pages[i] == NULL);
+ if (svsk->sk_pages[i] == NULL) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
put_page(svsk->sk_pages[i]);
svsk->sk_pages[i] = NULL;
}
@@ -1092,8 +1107,10 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
goto err_noclose;
}
- if (svc_sock_reclen(svsk) < 8)
+ if (svsk->sk_datalen < 8) {
+ svsk->sk_datalen = 0;
goto err_delete; /* client is nuts. */
+ }
rqstp->rq_arg.len = svsk->sk_datalen;
rqstp->rq_arg.page_base = 0;
@@ -1105,6 +1122,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_xprt_ctxt = NULL;
rqstp->rq_prot = IPPROTO_TCP;
+ rqstp->rq_local = !!test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags);
p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
calldir = p[1];
@@ -1188,7 +1206,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
return 1;
required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
- if (sk_stream_wspace(svsk->sk_sk) >= required)
+ if (sk_stream_wspace(svsk->sk_sk) >= required ||
+ (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
+ atomic_read(&xprt->xpt_reserved) == 0))
return 1;
set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
return 0;
@@ -1225,6 +1245,7 @@ static struct svc_xprt_ops svc_tcp_bc_ops = {
.xpo_detach = svc_bc_tcp_sock_detach,
.xpo_free = svc_bc_sock_free,
.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
+ .xpo_secure_port = svc_sock_secure_port,
};
static struct svc_xprt_class svc_tcp_bc_class = {
@@ -1263,6 +1284,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
+ .xpo_secure_port = svc_sock_secure_port,
};
static struct svc_xprt_class svc_tcp_class = {
@@ -1388,6 +1410,22 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
return svsk;
}
+bool svc_alien_sock(struct net *net, int fd)
+{
+ int err;
+ struct socket *sock = sockfd_lookup(fd, &err);
+ bool ret = false;
+
+ if (!sock)
+ goto out;
+ if (sock_net(sock->sk) != net)
+ ret = true;
+ sockfd_put(sock);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(svc_alien_sock);
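	/*
	 * Caller sketch (illustrative; the error handling is invented,
	 * svc_addsock() is the existing API): reject a listener fd that was
	 * opened in a foreign network namespace before installing it.
	 */
	if (svc_alien_sock(net, fd))
		return -EINVAL;
	err = svc_addsock(serv, fd, buf, sizeof(buf));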
+
/**
* svc_addsock - add a listener socket to an RPC service
* @serv: pointer to RPC service to which to add a new listener
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index af7d339add9..c99c58e2ee6 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -40,7 +40,7 @@ EXPORT_SYMBOL_GPL(nlm_debug);
#ifdef RPC_DEBUG
static struct ctl_table_header *sunrpc_table_header;
-static ctl_table sunrpc_table[];
+static struct ctl_table sunrpc_table[];
void
rpc_register_sysctl(void)
@@ -58,7 +58,7 @@ rpc_unregister_sysctl(void)
}
}
-static int proc_do_xprt(ctl_table *table, int write,
+static int proc_do_xprt(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[256];
@@ -73,7 +73,7 @@ static int proc_do_xprt(ctl_table *table, int write,
}
static int
-proc_dodebug(ctl_table *table, int write,
+proc_dodebug(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
char tmpbuf[20], c, *s;
@@ -135,7 +135,7 @@ done:
}
-static ctl_table debug_table[] = {
+static struct ctl_table debug_table[] = {
{
.procname = "rpc_debug",
.data = &rpc_debug,
@@ -173,7 +173,7 @@ static ctl_table debug_table[] = {
{ }
};
-static ctl_table sunrpc_table[] = {
+static struct ctl_table sunrpc_table[] = {
{
.procname = "sunrpc",
.mode = 0555,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 56055632f15..23fb4e75e24 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -207,10 +207,13 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
pgfrom_base -= copy;
vto = kmap_atomic(*pgto);
- vfrom = kmap_atomic(*pgfrom);
- memmove(vto + pgto_base, vfrom + pgfrom_base, copy);
+ if (*pgto != *pgfrom) {
+ vfrom = kmap_atomic(*pgfrom);
+ memcpy(vto + pgto_base, vfrom + pgfrom_base, copy);
+ kunmap_atomic(vfrom);
+ } else
+ memmove(vto + pgto_base, vto + pgfrom_base, copy);
flush_dcache_page(*pgto);
- kunmap_atomic(vfrom);
kunmap_atomic(vto);
} while ((len -= copy) != 0);
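The split above matters on HIGHMEM configurations: kmap_atomic() can return two different virtual addresses for the same page, so the old memmove() saw two non-overlapping virtual ranges, degenerated into a plain copy, and corrupted physically overlapping data. The rule, restated as a sketch of the new logic:

	if (*pgto == *pgfrom) {
		/* Same page, one mapping: ranges may overlap, use memmove. */
		memmove(vto + pgto_base, vto + pgfrom_base, copy);
	} else {
		/* Distinct pages can never overlap: memcpy is safe and cheaper. */
		memcpy(vto + pgto_base, vfrom + pgfrom_base, copy);
	}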
@@ -459,6 +462,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
struct kvec *iov = buf->head;
int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
+ xdr_set_scratch_buffer(xdr, NULL, 0);
BUG_ON(scratch_len < 0);
xdr->buf = buf;
xdr->iov = iov;
@@ -479,6 +483,73 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
EXPORT_SYMBOL_GPL(xdr_init_encode);
/**
+ * xdr_commit_encode - Ensure all data is written to buffer
+ * @xdr: pointer to xdr_stream
+ *
+ * We handle encoding across page boundaries by giving the caller a
+ * temporary location to write to, then later copying the data into
+ * place; xdr_commit_encode does that copying.
+ *
+ * Normally the caller doesn't need to call this directly, as the
+ * following xdr_reserve_space will do it. But an explicit call may be
+ * required at the end of encoding, or any other time when the xdr_buf
+ * data might be read.
+ */
+void xdr_commit_encode(struct xdr_stream *xdr)
+{
+ int shift = xdr->scratch.iov_len;
+ void *page;
+
+ if (shift == 0)
+ return;
+ page = page_address(*xdr->page_ptr);
+ memcpy(xdr->scratch.iov_base, page, shift);
+ memmove(page, page + shift, (void *)xdr->p - page);
+ xdr->scratch.iov_len = 0;
+}
+EXPORT_SYMBOL_GPL(xdr_commit_encode);
+
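A minimal usage sketch of the deferred-copy contract (the opcode and length below are illustrative; xdr_reserve_space() and xdr_commit_encode() are the interfaces from this patch):

	/* Encode two words that may straddle a page boundary, then make
	 * sure the bytes are in their final location before the buffer
	 * is read or transmitted.
	 */
	__be32 *p = xdr_reserve_space(xdr, 2 * sizeof(__be32));
	if (!p)
		return -ENOSPC;			/* totally out of space */
	*p++ = cpu_to_be32(OP_EXAMPLE);		/* hypothetical opcode */
	*p = cpu_to_be32(payload_len);		/* hypothetical length */
	xdr_commit_encode(xdr);			/* copy straddled bytes home */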
+__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *p;
+ int space_left;
+ int frag1bytes, frag2bytes;
+
+ if (nbytes > PAGE_SIZE)
+ return NULL; /* Bigger buffers require special handling */
+ if (xdr->buf->len + nbytes > xdr->buf->buflen)
+ return NULL; /* Sorry, we're totally out of space */
+ frag1bytes = (xdr->end - xdr->p) << 2;
+ frag2bytes = nbytes - frag1bytes;
+ if (xdr->iov)
+ xdr->iov->iov_len += frag1bytes;
+ else
+ xdr->buf->page_len += frag1bytes;
+ xdr->page_ptr++;
+ xdr->iov = NULL;
+ /*
+ * If the last encode didn't end exactly on a page boundary, the
+ * next one will straddle boundaries. Encode into the next
+ * page, then copy it back later in xdr_commit_encode. We use
+ * the "scratch" iov to track any temporarily unused fragment of
+ * space at the end of the previous buffer:
+ */
+ xdr->scratch.iov_base = xdr->p;
+ xdr->scratch.iov_len = frag1bytes;
+ p = page_address(*xdr->page_ptr);
+ /*
+ * Note this is where the next encode will start after we've
+ * shifted this one back:
+ */
+ xdr->p = (void *)p + frag2bytes;
+ space_left = xdr->buf->buflen - xdr->buf->len;
+ xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE);
+ xdr->buf->page_len += frag2bytes;
+ xdr->buf->len += nbytes;
+ return p;
+}
+
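A worked example of the split, assuming 4 KB pages: if 12 bytes remain before xdr->end and the caller reserves 20, then frag1bytes = 12 and frag2bytes = 8. The caller is handed the start of the next page and writes all 20 bytes there contiguously, while the scratch iov records the 12-byte hole at the end of the previous page. A later xdr_commit_encode() copies the first 12 bytes back into that hole and shifts everything encoded after them to the top of the page.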
+/**
* xdr_reserve_space - Reserve buffer space for sending
* @xdr: pointer to xdr_stream
* @nbytes: number of bytes to reserve
@@ -492,20 +563,122 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
__be32 *p = xdr->p;
__be32 *q;
+ xdr_commit_encode(xdr);
/* align nbytes on the next 32-bit boundary */
nbytes += 3;
nbytes &= ~3;
q = p + (nbytes >> 2);
if (unlikely(q > xdr->end || q < p))
- return NULL;
+ return xdr_get_next_encode_buffer(xdr, nbytes);
xdr->p = q;
- xdr->iov->iov_len += nbytes;
+ if (xdr->iov)
+ xdr->iov->iov_len += nbytes;
+ else
+ xdr->buf->page_len += nbytes;
xdr->buf->len += nbytes;
return p;
}
EXPORT_SYMBOL_GPL(xdr_reserve_space);
/**
+ * xdr_truncate_encode - truncate an encode buffer
+ * @xdr: pointer to xdr_stream
+ * @len: new length of buffer
+ *
+ * Truncates the xdr stream, so that xdr->buf->len == len,
+ * and xdr->p points at offset len from the start of the buffer, and
+ * head, tail, and page lengths are adjusted to correspond.
+ *
+ * If this means moving xdr->p to a different buffer, we assume that
+ * the end pointer should be set to the end of the current page,
+ * except in the case of the head buffer when we assume the head
+ * buffer's current length represents the end of the available buffer.
+ *
+ * This is *not* safe to use on a buffer that already has inlined page
+ * cache pages (as in a zero-copy server read reply), except for the
+ * simple case of truncating from one position in the tail to another.
+ *
+ */
+void xdr_truncate_encode(struct xdr_stream *xdr, size_t len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ struct kvec *head = buf->head;
+ struct kvec *tail = buf->tail;
+ int fraglen;
+ int new, old;
+
+ if (len > buf->len) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ xdr_commit_encode(xdr);
+
+ fraglen = min_t(int, buf->len - len, tail->iov_len);
+ tail->iov_len -= fraglen;
+ buf->len -= fraglen;
+ if (tail->iov_len && buf->len == len) {
+ xdr->p = tail->iov_base + tail->iov_len;
+ /* xdr->end, xdr->iov should be set already */
+ return;
+ }
+ WARN_ON_ONCE(fraglen);
+ fraglen = min_t(int, buf->len - len, buf->page_len);
+ buf->page_len -= fraglen;
+ buf->len -= fraglen;
+
+ new = buf->page_base + buf->page_len;
+ old = new + fraglen;
+ xdr->page_ptr -= (old >> PAGE_SHIFT) - (new >> PAGE_SHIFT);
+
+ if (buf->page_len && buf->len == len) {
+ xdr->p = page_address(*xdr->page_ptr);
+ xdr->end = (void *)xdr->p + PAGE_SIZE;
+ xdr->p = (void *)xdr->p + (new % PAGE_SIZE);
+ /* xdr->iov should already be NULL */
+ return;
+ }
+ if (fraglen) {
+ xdr->end = head->iov_base + head->iov_len;
+ xdr->page_ptr--;
+ }
+ /* (otherwise assume xdr->end is already set) */
+ head->iov_len = len;
+ buf->len = len;
+ xdr->p = head->iov_base + head->iov_len;
+ xdr->iov = buf->head;
+}
+EXPORT_SYMBOL(xdr_truncate_encode);
+
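A hedged sketch of the rollback pattern this enables (encode_entry() is a hypothetical per-entry encoder; only xdr_truncate_encode() comes from this patch):

	unsigned int start = xdr->buf->len;	/* length before this entry */

	if (encode_entry(xdr, entry) < 0) {
		/* Drop the partially-encoded entry and end the reply here. */
		xdr_truncate_encode(xdr, start);
		return 0;
	}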
+/**
+ * xdr_restrict_buflen - decrease available buffer space
+ * @xdr: pointer to xdr_stream
+ * @newbuflen: new maximum number of bytes available
+ *
+ * Adjust our idea of how much space is available in the buffer.
+ * If we've already used too much space in the buffer, returns -1.
+ * If the available space is already smaller than newbuflen, returns 0
+ * and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen
+ * and ensures xdr->end is set at most offset newbuflen from the start
+ * of the buffer.
+ */
+int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen)
+{
+ struct xdr_buf *buf = xdr->buf;
+ int left_in_this_buf = (void *)xdr->end - (void *)xdr->p;
+ int end_offset = buf->len + left_in_this_buf;
+
+ if (newbuflen < 0 || newbuflen < buf->len)
+ return -1;
+ if (newbuflen > buf->buflen)
+ return 0;
+ if (newbuflen < end_offset)
+ xdr->end = (void *)xdr->end + newbuflen - end_offset;
+ buf->buflen = newbuflen;
+ return 0;
+}
+EXPORT_SYMBOL(xdr_restrict_buflen);
+
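A sketch of the expected use, assuming a session-negotiated reply cap (maxresp_sz and the error code are illustrative):

	/* Refuse to let the reply grow past the client's advertised maximum. */
	if (xdr_restrict_buflen(xdr, maxresp_sz) < 0)
		return -EMSGSIZE;	/* already past the cap */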
+/**
* xdr_write_pages - Insert a list of pages into an XDR buffer for sending
* @xdr: pointer to xdr_stream
* @pages: list of pages
@@ -830,8 +1003,20 @@ xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
}
EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
-/* Sets subbuf to the portion of buf of length len beginning base bytes
- * from the start of buf. Returns -1 if base of length are out of bounds. */
+/**
+ * xdr_buf_subsegment - set subbuf to a portion of buf
+ * @buf: an xdr buffer
+ * @subbuf: the result buffer
+ * @base: beginning of range in bytes
+ * @len: length of range in bytes
+ *
+ * sets @subbuf to an xdr buffer representing the portion of @buf of
+ * length @len starting at offset @base.
+ *
+ * @buf and @subbuf may be pointers to the same struct xdr_buf.
+ *
+ * Returns -1 if base or length are out of bounds.
+ */
int
xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
unsigned int base, unsigned int len)
@@ -844,9 +1029,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
len -= subbuf->head[0].iov_len;
base = 0;
} else {
- subbuf->head[0].iov_base = NULL;
- subbuf->head[0].iov_len = 0;
base -= buf->head[0].iov_len;
+ subbuf->head[0].iov_len = 0;
}
if (base < buf->page_len) {
@@ -868,9 +1052,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
len -= subbuf->tail[0].iov_len;
base = 0;
} else {
- subbuf->tail[0].iov_base = NULL;
- subbuf->tail[0].iov_len = 0;
base -= buf->tail[0].iov_len;
+ subbuf->tail[0].iov_len = 0;
}
if (base || len)
@@ -879,6 +1062,47 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
}
EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
+/**
+ * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
+ * @buf: buf to be trimmed
+ * @len: number of bytes to reduce "buf" by
+ *
+ * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note
+ * that it's possible that we'll trim less than that amount if the xdr_buf is
+ * too small, or if (for instance) it's all in the head and the parser has
+ * already read too far into it.
+ */
+void xdr_buf_trim(struct xdr_buf *buf, unsigned int len)
+{
+ size_t cur;
+ unsigned int trim = len;
+
+ if (buf->tail[0].iov_len) {
+ cur = min_t(size_t, buf->tail[0].iov_len, trim);
+ buf->tail[0].iov_len -= cur;
+ trim -= cur;
+ if (!trim)
+ goto fix_len;
+ }
+
+ if (buf->page_len) {
+ cur = min_t(unsigned int, buf->page_len, trim);
+ buf->page_len -= cur;
+ trim -= cur;
+ if (!trim)
+ goto fix_len;
+ }
+
+ if (buf->head[0].iov_len) {
+ cur = min_t(size_t, buf->head[0].iov_len, trim);
+ buf->head[0].iov_len -= cur;
+ trim -= cur;
+ }
+fix_len:
+ buf->len -= (len - trim);
+}
+EXPORT_SYMBOL_GPL(xdr_buf_trim);
+
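A sketch of the kind of caller this serves, assuming a GSS integrity path that has just verified a trailing checksum (mic_len and the extra length word are illustrative):

	/* Drop the verifier trailer; xdr_buf_trim() shortens tail, then
	 * pages, then head, and reduces buf->len by the amount trimmed.
	 */
	xdr_buf_trim(buf, mic_len + 4);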
static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
{
unsigned int this_len;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 33811db8788..c3b2b3369e5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -71,24 +71,6 @@ static void xprt_destroy(struct rpc_xprt *xprt);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
-/*
- * The transport code maintains an estimate on the maximum number of out-
- * standing RPC requests, using a smoothed version of the congestion
- * avoidance implemented in 44BSD. This is basically the Van Jacobson
- * congestion algorithm: If a retransmit occurs, the congestion window is
- * halved; otherwise, it is incremented by 1/cwnd when
- *
- * - a reply is received and
- * - a full number of requests are outstanding and
- * - the congestion window hasn't been updated recently.
- */
-#define RPC_CWNDSHIFT (8U)
-#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT)
-#define RPC_INITCWND RPC_CWNDSCALE
-#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT)
-
-#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
-
/**
* xprt_register_transport - register a transport implementation
* @transport: transport to register
@@ -205,10 +187,8 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
goto out_sleep;
}
xprt->snd_task = task;
- if (req != NULL) {
- req->rq_bytes_sent = 0;
+ if (req != NULL)
req->rq_ntrans++;
- }
return 1;
@@ -232,9 +212,9 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
{
xprt->snd_task = NULL;
if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(XPRT_LOCKED, &xprt->state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
} else
queue_work(rpciod_workqueue, &xprt->task_cleanup);
}
@@ -263,7 +243,6 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
}
if (__xprt_get_cong(xprt, task)) {
xprt->snd_task = task;
- req->rq_bytes_sent = 0;
req->rq_ntrans++;
return 1;
}
@@ -300,10 +279,8 @@ static bool __xprt_lock_write_func(struct rpc_task *task, void *data)
req = task->tk_rqstp;
xprt->snd_task = task;
- if (req) {
- req->rq_bytes_sent = 0;
+ if (req)
req->rq_ntrans++;
- }
return true;
}
@@ -329,7 +306,6 @@ static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data)
}
if (__xprt_get_cong(xprt, task)) {
xprt->snd_task = task;
- req->rq_bytes_sent = 0;
req->rq_ntrans++;
return true;
}
@@ -358,6 +334,11 @@ out_unlock:
void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
{
if (xprt->snd_task == task) {
+ if (task != NULL) {
+ struct rpc_rqst *req = task->tk_rqstp;
+ if (req != NULL)
+ req->rq_bytes_sent = 0;
+ }
xprt_clear_locked(xprt);
__xprt_lock_write_next(xprt);
}
@@ -375,6 +356,11 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
{
if (xprt->snd_task == task) {
+ if (task != NULL) {
+ struct rpc_rqst *req = task->tk_rqstp;
+ if (req != NULL)
+ req->rq_bytes_sent = 0;
+ }
xprt_clear_locked(xprt);
__xprt_lock_write_next_cong(xprt);
}
@@ -430,21 +416,31 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
*/
void xprt_release_rqst_cong(struct rpc_task *task)
{
- __xprt_put_cong(task->tk_xprt, task->tk_rqstp);
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ __xprt_put_cong(req->rq_xprt, req);
}
EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
/**
* xprt_adjust_cwnd - adjust transport congestion window
+ * @xprt: pointer to xprt
* @task: recently completed RPC request used to adjust window
* @result: result code of completed RPC request
*
- * We use a time-smoothed congestion estimator to avoid heavy oscillation.
+ * The transport code maintains an estimate on the maximum number of out-
+ * standing RPC requests, using a smoothed version of the congestion
+ * avoidance implemented in 44BSD. This is basically the Van Jacobson
+ * congestion algorithm: If a retransmit occurs, the congestion window is
+ * halved; otherwise, it is incremented by 1/cwnd when
+ *
+ * - a reply is received and
+ * - a full number of requests are outstanding and
+ * - the congestion window hasn't been updated recently.
*/
-void xprt_adjust_cwnd(struct rpc_task *task, int result)
+void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result)
{
struct rpc_rqst *req = task->tk_rqstp;
- struct rpc_xprt *xprt = task->tk_xprt;
unsigned long cwnd = xprt->cwnd;
if (result >= 0 && cwnd <= xprt->cong) {
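A worked instance of the fixed-point arithmetic described above, assuming the usual RPC_CWNDSCALE of 256: with cwnd = 4 * RPC_CWNDSCALE = 1024 (a four-request window), each reply adds roughly 256 * 256 / 1024 = 64, so four replies grow the window by one request; a retransmit halves cwnd to 512, and the floor and ceiling are RPC_CWNDSCALE and RPC_MAXCWND respectively.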
@@ -485,13 +481,17 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
* xprt_wait_for_buffer_space - wait for transport output buffer to clear
* @task: task to be put to sleep
* @action: function pointer to be executed after wait
+ *
+ * Note that we only set the timer for the case of RPC_IS_SOFT(), since
+ * we don't in general want to force a socket disconnection due to
+ * an incomplete RPC call transmission.
*/
void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- task->tk_timeout = req->rq_timeout;
+ task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
rpc_sleep_on(&xprt->pending, task, action);
}
EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
@@ -695,7 +695,7 @@ out_abort:
*/
void xprt_connect(struct rpc_task *task)
{
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid,
xprt, (xprt_connected(xprt) ? "is" : "is not"));
@@ -722,13 +722,13 @@ void xprt_connect(struct rpc_task *task)
if (xprt_test_and_set_connecting(xprt))
return;
xprt->stat.connect_start = jiffies;
- xprt->ops->connect(task);
+ xprt->ops->connect(xprt, task);
}
}
static void xprt_connect_status(struct rpc_task *task)
{
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
if (task->tk_status == 0) {
xprt->stat.connect_count++;
@@ -739,6 +739,11 @@ static void xprt_connect_status(struct rpc_task *task)
}
switch (task->tk_status) {
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ECONNABORTED:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
case -EAGAIN:
dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
break;
@@ -832,7 +837,7 @@ static void xprt_timer(struct rpc_task *task)
spin_lock_bh(&xprt->transport_lock);
if (!req->rq_reply_bytes_recvd) {
if (xprt->ops->timer)
- xprt->ops->timer(task);
+ xprt->ops->timer(xprt, task);
} else
task->tk_status = 0;
spin_unlock_bh(&xprt->transport_lock);
@@ -848,24 +853,36 @@ static inline int xprt_has_timer(struct rpc_xprt *xprt)
* @task: RPC task about to send a request
*
*/
-int xprt_prepare_transmit(struct rpc_task *task)
+bool xprt_prepare_transmit(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- int err = 0;
+ bool ret = false;
dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
spin_lock_bh(&xprt->transport_lock);
- if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent) {
- err = req->rq_reply_bytes_recvd;
+ if (!req->rq_bytes_sent) {
+ if (req->rq_reply_bytes_recvd) {
+ task->tk_status = req->rq_reply_bytes_recvd;
+ goto out_unlock;
+ }
+ if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
+ && xprt_connected(xprt)
+ && req->rq_connect_cookie == xprt->connect_cookie) {
+ xprt->ops->set_retrans_timeout(task);
+ rpc_sleep_on(&xprt->pending, task, xprt_timer);
+ goto out_unlock;
+ }
+ }
+ if (!xprt->ops->reserve_xprt(xprt, task)) {
+ task->tk_status = -EAGAIN;
goto out_unlock;
}
- if (!xprt->ops->reserve_xprt(xprt, task))
- err = -EAGAIN;
+ ret = true;
out_unlock:
spin_unlock_bh(&xprt->transport_lock);
- return err;
+ return ret;
}
void xprt_end_transmit(struct rpc_task *task)
@@ -906,7 +923,6 @@ void xprt_transmit(struct rpc_task *task)
} else if (!req->rq_bytes_sent)
return;
- req->rq_connect_cookie = xprt->connect_cookie;
req->rq_xtime = ktime_get();
status = xprt->ops->send_request(task);
if (status != 0) {
@@ -932,16 +948,46 @@ void xprt_transmit(struct rpc_task *task)
/* Don't race with disconnect */
if (!xprt_connected(xprt))
task->tk_status = -ENOTCONN;
- else if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) {
+ else {
/*
* Sleep on the pending queue since
* we're expecting a reply.
*/
- rpc_sleep_on(&xprt->pending, task, xprt_timer);
+ if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task))
+ rpc_sleep_on(&xprt->pending, task, xprt_timer);
+ req->rq_connect_cookie = xprt->connect_cookie;
}
spin_unlock_bh(&xprt->transport_lock);
}
+static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ set_bit(XPRT_CONGESTED, &xprt->state);
+ rpc_sleep_on(&xprt->backlog, task, NULL);
+}
+
+static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+{
+ if (rpc_wake_up_next(&xprt->backlog) == NULL)
+ clear_bit(XPRT_CONGESTED, &xprt->state);
+}
+
+static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ bool ret = false;
+
+ if (!test_bit(XPRT_CONGESTED, &xprt->state))
+ goto out;
+ spin_lock(&xprt->reserve_lock);
+ if (test_bit(XPRT_CONGESTED, &xprt->state)) {
+ rpc_sleep_on(&xprt->backlog, task, NULL);
+ ret = true;
+ }
+ spin_unlock(&xprt->reserve_lock);
+out:
+ return ret;
+}
+
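The shape of the check above is the usual double-checked sleep: the lockless read keeps the uncongested fast path cheap, and the re-test under reserve_lock closes the race with a waker that clears XPRT_CONGESTED between the two tests. A generic sketch of the pattern (not new kernel API):

	if (!test_bit(FLAG, &state))
		return false;			/* uncontended fast path */
	spin_lock(&lock);
	if (test_bit(FLAG, &state))		/* re-check under the waker's lock */
		sleep_on_queue();		/* hypothetical */
	spin_unlock(&lock);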
static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt, gfp_t gfp_flags)
{
struct rpc_rqst *req = ERR_PTR(-EAGAIN);
@@ -986,7 +1032,7 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
task->tk_status = -ENOMEM;
break;
case -EAGAIN:
- rpc_sleep_on(&xprt->backlog, task, NULL);
+ xprt_add_backlog(xprt, task);
dprintk("RPC: waiting for request slot\n");
default:
task->tk_status = -EAGAIN;
@@ -1022,7 +1068,7 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
memset(req, 0, sizeof(*req)); /* mark unused */
list_add(&req->rq_list, &xprt->free);
}
- rpc_wake_up_next(&xprt->backlog);
+ xprt_wake_up_backlog(xprt);
spin_unlock(&xprt->reserve_lock);
}
@@ -1053,11 +1099,9 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
for (i = 0; i < num_prealloc; i++) {
req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
if (!req)
- break;
+ goto out_free;
list_add(&req->rq_list, &xprt->free);
}
- if (i < num_prealloc)
- goto out_free;
if (max_alloc > num_prealloc)
xprt->max_reqs = max_alloc;
else
@@ -1086,12 +1130,13 @@ EXPORT_SYMBOL_GPL(xprt_free);
* xprt_reserve - allocate an RPC request slot
* @task: RPC task requesting a slot allocation
*
- * If no more slots are available, place the task on the transport's
+ * If the transport is marked as being congested, or if no more
+ * slots are available, place the task on the transport's
* backlog queue.
*/
void xprt_reserve(struct rpc_task *task)
{
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt;
task->tk_status = 0;
if (task->tk_rqstp != NULL)
@@ -1099,7 +1144,36 @@ void xprt_reserve(struct rpc_task *task)
task->tk_timeout = 0;
task->tk_status = -EAGAIN;
+ rcu_read_lock();
+ xprt = rcu_dereference(task->tk_client->cl_xprt);
+ if (!xprt_throttle_congested(xprt, task))
+ xprt->ops->alloc_slot(xprt, task);
+ rcu_read_unlock();
+}
+
+/**
+ * xprt_retry_reserve - allocate an RPC request slot
+ * @task: RPC task requesting a slot allocation
+ *
+ * If no more slots are available, place the task on the transport's
+ * backlog queue.
+ * Note that the only difference from xprt_reserve is that we now
+ * ignore the value of the XPRT_CONGESTED flag.
+ */
+void xprt_retry_reserve(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt;
+
+ task->tk_status = 0;
+ if (task->tk_rqstp != NULL)
+ return;
+
+ task->tk_timeout = 0;
+ task->tk_status = -EAGAIN;
+ rcu_read_lock();
+ xprt = rcu_dereference(task->tk_client->cl_xprt);
xprt->ops->alloc_slot(xprt, task);
+ rcu_read_unlock();
}
static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
@@ -1109,7 +1183,7 @@ static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
static inline void xprt_init_xid(struct rpc_xprt *xprt)
{
- xprt->xid = net_random();
+ xprt->xid = prandom_u32();
}
static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
@@ -1122,6 +1196,12 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
req->rq_xprt = xprt;
req->rq_buffer = NULL;
req->rq_xid = xprt_alloc_xid(xprt);
+ req->rq_connect_cookie = xprt->connect_cookie - 1;
+ req->rq_bytes_sent = 0;
+ req->rq_snd_buf.len = 0;
+ req->rq_snd_buf.buflen = 0;
+ req->rq_rcv_buf.len = 0;
+ req->rq_rcv_buf.buflen = 0;
req->rq_release_snd_buf = NULL;
xprt_reset_majortimeo(req);
dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
@@ -1236,6 +1316,8 @@ found:
-PTR_ERR(xprt));
goto out;
}
+ if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)
+ xprt->idle_timeout = 0;
INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
if (xprt_has_timer(xprt))
setup_timer(&xprt->timer, xprt_init_autodisconnect,
@@ -1291,15 +1373,3 @@ void xprt_put(struct rpc_xprt *xprt)
if (atomic_dec_and_test(&xprt->count))
xprt_destroy(xprt);
}
-
-/**
- * xprt_get - return a reference to an RPC transport.
- * @xprt: pointer to the transport
- *
- */
-struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
-{
- if (atomic_inc_not_zero(&xprt->count))
- return xprt;
- return NULL;
-}
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 5a8f268bdd3..da5136fd569 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,8 +1,8 @@
-obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
+obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
xprtrdma-y := transport.o rpc_rdma.o verbs.o
-obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
+obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
svcrdma-y := svc_rdma.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 558fbab574f..693966d3f33 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -78,8 +78,7 @@ static const char transfertypes[][12] = {
* elements. Segments are then coalesced when registered, if possible
* within the selected memreg mode.
*
- * Note, this routine is never called if the connection's memory
- * registration strategy is 0 (bounce buffers).
+ * Returns positive number of segments converted, or a negative errno.
*/
static int
@@ -102,10 +101,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
page_base = xdrbuf->page_base & ~PAGE_MASK;
p = 0;
while (len && n < nsegs) {
+ if (!ppages[p]) {
+ /* alloc the pagelist for receiving buffer */
+ ppages[p] = alloc_page(GFP_ATOMIC);
+ if (!ppages[p])
+ return -ENOMEM;
+ }
seg[n].mr_page = ppages[p];
seg[n].mr_offset = (void *)(unsigned long) page_base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
- BUG_ON(seg[n].mr_len > PAGE_SIZE);
+ if (seg[n].mr_len > PAGE_SIZE)
+ return -EIO;
len -= seg[n].mr_len;
++n;
++p;
@@ -114,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
/* Message overflows the seg array */
if (len && n == nsegs)
- return 0;
+ return -EIO;
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
@@ -123,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
return n;
if (n == nsegs)
/* Tail remains, but we're out of segments */
- return 0;
+ return -EIO;
seg[n].mr_page = NULL;
seg[n].mr_offset = xdrbuf->tail[0].iov_base;
seg[n].mr_len = xdrbuf->tail[0].iov_len;
@@ -164,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* Reply chunk (a counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
*/
-static unsigned int
+static ssize_t
rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
{
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt);
- int nsegs, nchunks = 0;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+ int n, nsegs, nchunks = 0;
unsigned int pos;
struct rpcrdma_mr_seg *seg = req->rl_segments;
struct rpcrdma_read_chunk *cur_rchunk = NULL;
@@ -198,12 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
pos = target->head[0].iov_len;
nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
- if (nsegs == 0)
- return 0;
+ if (nsegs < 0)
+ return nsegs;
do {
- /* bind/register the memory, then build chunk from result. */
- int n = rpcrdma_register_external(seg, nsegs,
+ n = rpcrdma_register_external(seg, nsegs,
cur_wchunk != NULL, r_xprt);
if (n <= 0)
goto out;
@@ -248,10 +255,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
/* success. all failures return above */
req->rl_nchunks = nchunks;
- BUG_ON(nchunks == 0);
- BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
- && (nchunks > 3));
-
/*
* finish off header. If write, marshal discrim and nchunks.
*/
@@ -278,8 +281,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
out:
for (pos = 0; nchunks--;)
pos += rpcrdma_deregister_external(
- &req->rl_segments[pos], r_xprt, NULL);
- return 0;
+ &req->rl_segments[pos], r_xprt);
+ return n;
}
/*
@@ -361,16 +364,19 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
* [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
* [2] -- optional padding.
* [3] -- if padded, header only in [1] and data here.
+ *
+ * Returns zero on success, otherwise a negative errno.
*/
int
rpcrdma_marshal_req(struct rpc_rqst *rqst)
{
- struct rpc_xprt *xprt = rqst->rq_task->tk_xprt;
+ struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
char *base;
- size_t hdrlen, rpclen, padlen;
+ size_t rpclen, padlen;
+ ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
@@ -441,14 +447,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/* The following simplification is not true forever */
if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
wtype = rpcrdma_noch;
- BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
-
- if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
- (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
- /* forced to "pure inline"? */
- dprintk("RPC: %s: too much data (%d/%d) for inline\n",
- __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
- return -1;
+ if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+ dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
+ __func__);
+ return -EIO;
}
hdrlen = 28; /*sizeof *headerp;*/
@@ -474,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- BUG_ON(wtype != rpcrdma_noch);
-
+ if (wtype != rpcrdma_noch) {
+ dprintk("RPC: %s: invalid chunk list\n",
+ __func__);
+ return -EIO;
+ }
} else {
headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
@@ -492,8 +497,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* on receive. Therefore, we request a reply chunk
* for non-writes wherever feasible and efficient.
*/
- if (wtype == rpcrdma_noch &&
- r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+ if (wtype == rpcrdma_noch)
wtype = rpcrdma_replych;
}
}
@@ -511,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
hdrlen = rpcrdma_create_chunks(rqst,
&rqst->rq_rcv_buf, headerp, wtype);
}
-
- if (hdrlen == 0)
- return -1;
+ if (hdrlen < 0)
+ return hdrlen;
dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
@@ -649,9 +652,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
break;
page_base = 0;
}
- rqst->rq_rcv_buf.page_len = olen - copy_len;
- } else
- rqst->rq_rcv_buf.page_len = 0;
+ }
if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
curlen = copy_len;
@@ -682,15 +683,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
rqst->rq_private_buf = rqst->rq_rcv_buf;
}
-/*
- * This function is called when an async event is posted to
- * the connection which changes the connection state. All it
- * does at this point is mark the connection up/down, the rpc
- * timers do the rest.
- */
void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
+rpcrdma_connect_worker(struct work_struct *work)
{
+ struct rpcrdma_ep *ep =
+ container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
struct rpc_xprt *xprt = ep->rep_xprt;
spin_lock_bh(&xprt->transport_lock);
@@ -707,13 +704,15 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
}
/*
- * This function is called when memory window unbind which we are waiting
- * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
*/
-static void
-rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
{
- wake_up(&rep->rr_unbind);
+ schedule_delayed_work(&ep->rep_connect_worker, 0);
}
/*
@@ -730,7 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpc_xprt *xprt = rep->rr_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
__be32 *iptr;
- int i, rdmalen, status;
+ int rdmalen, status;
+ unsigned long cwnd;
/* Check status. If bad, signal disconnect and return rep to pool */
if (rep->rr_len == ~0U) {
@@ -785,6 +785,7 @@ repost:
/* from here on, the reply is no longer an orphan */
req->rl_reply = rep;
+ xprt->reestablish_timeout = 0;
/* check for expected message types */
/* The order of some of these tests is important. */
@@ -859,26 +860,10 @@ badheader:
break;
}
- /* If using mw bind, start the deregister process now. */
- /* (Note: if mr_free(), cannot perform it here, in tasklet context) */
- if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
- case RPCRDMA_MEMWINDOWS:
- for (i = 0; req->rl_nchunks-- > 1;)
- i += rpcrdma_deregister_external(
- &req->rl_segments[i], r_xprt, NULL);
- /* Optionally wait (not here) for unbinds to complete */
- rep->rr_func = rpcrdma_unbind_func;
- (void) rpcrdma_deregister_external(&req->rl_segments[i],
- r_xprt, rep);
- break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- for (i = 0; req->rl_nchunks--;)
- i += rpcrdma_deregister_external(&req->rl_segments[i],
- r_xprt, NULL);
- break;
- default:
- break;
- }
+ cwnd = xprt->cwnd;
+ xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
+ if (xprt->cwnd > cwnd)
+ xprt_release_rqst_cong(rqst->rq_task);
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 8343737e85f..c1b6270262c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -84,7 +84,7 @@ struct workqueue_struct *svc_rdma_wq;
 * resets the associated statistic to zero. Any read returns its
* current value.
*/
-static int read_reset_stat(ctl_table *table, int write,
+static int read_reset_stat(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -119,7 +119,7 @@ static int read_reset_stat(ctl_table *table, int write,
}
static struct ctl_table_header *svcrdma_table_header;
-static ctl_table svcrdma_parm_table[] = {
+static struct ctl_table svcrdma_parm_table[] = {
{
.procname = "max_requests",
.data = &svcrdma_max_requests,
@@ -214,7 +214,7 @@ static ctl_table svcrdma_parm_table[] = {
{ },
};
-static ctl_table svcrdma_table[] = {
+static struct ctl_table svcrdma_table[] = {
{
.procname = "svc_rdma",
.mode = 0555,
@@ -223,7 +223,7 @@ static ctl_table svcrdma_table[] = {
{ },
};
-static ctl_table svcrdma_root_table[] = {
+static struct ctl_table svcrdma_root_table[] = {
{
.procname = "sunrpc",
.mode = 0555,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 8d2edddf48c..65b146297f5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -98,6 +98,7 @@ void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
*/
static u32 *decode_write_list(u32 *va, u32 *vaend)
{
+ unsigned long start, end;
int nchunks;
struct rpcrdma_write_array *ary =
@@ -113,9 +114,12 @@ static u32 *decode_write_list(u32 *va, u32 *vaend)
return NULL;
}
nchunks = ntohl(ary->wc_nchunks);
- if (((unsigned long)&ary->wc_array[0] +
- (sizeof(struct rpcrdma_write_chunk) * nchunks)) >
- (unsigned long)vaend) {
+
+ start = (unsigned long)&ary->wc_array[0];
+ end = (unsigned long)vaend;
+ if (nchunks < 0 ||
+ nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
+ (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
ary, nchunks, vaend);
return NULL;
@@ -129,6 +133,7 @@ static u32 *decode_write_list(u32 *va, u32 *vaend)
static u32 *decode_reply_array(u32 *va, u32 *vaend)
{
+ unsigned long start, end;
int nchunks;
struct rpcrdma_write_array *ary =
(struct rpcrdma_write_array *)va;
@@ -143,9 +148,12 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend)
return NULL;
}
nchunks = ntohl(ary->wc_nchunks);
- if (((unsigned long)&ary->wc_array[0] +
- (sizeof(struct rpcrdma_write_chunk) * nchunks)) >
- (unsigned long)vaend) {
+
+ start = (unsigned long)&ary->wc_array[0];
+ end = (unsigned long)vaend;
+ if (nchunks < 0 ||
+ nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
+ (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
ary, nchunks, vaend);
return NULL;
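The same guard, distilled into a self-contained predicate (a generic sketch, not a helper added by this patch): the division test guarantees that start + n * elem_size cannot wrap before it is compared against end, which the old single comparison could not. The in-tree version also rejects negative counts because wc_nchunks lands in a signed int.

	static bool array_fits(unsigned long start, unsigned long end,
			       size_t elem_size, u32 n)
	{
		if (n > (SIZE_MAX - start) / elem_size)
			return false;	/* the multiply-add would wrap */
		return start + (unsigned long)n * elem_size <= end;
	}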
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 0ce75524ed2..8f92a61ee2d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
/* Set up the XDR head */
rqstp->rq_arg.head[0].iov_base = page_address(page);
- rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
+ rqstp->rq_arg.head[0].iov_len =
+ min_t(size_t, byte_count, ctxt->sge[0].length);
rqstp->rq_arg.len = byte_count;
rqstp->rq_arg.buflen = byte_count;
@@ -85,11 +87,12 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
page = ctxt->pages[sge_no];
put_page(rqstp->rq_pages[sge_no]);
rqstp->rq_pages[sge_no] = page;
- bc -= min(bc, ctxt->sge[sge_no].length);
+ bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
sge_no++;
}
rqstp->rq_respages = &rqstp->rq_pages[sge_no];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
/* We should never run out of SGE because the limit is defined to
* support the max allowed RPC data length
@@ -112,289 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0].iov_len = 0;
}
-/* Encode a read-chunk-list as an array of IB SGE
- *
- * Assumptions:
- * - chunk[0]->position points to pages[0] at an offset of 0
- * - pages[] is not physically or virtually contiguous and consists of
- * PAGE_SIZE elements.
- *
- * Output:
- * - sge array pointing into pages[] array.
- * - chunk_sge array specifying sge index and count for each
- * chunk in the read list
- *
- */
-static int map_read_chunks(struct svcxprt_rdma *xprt,
- struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *head,
- struct rpcrdma_msg *rmsgp,
- struct svc_rdma_req_map *rpl_map,
- struct svc_rdma_req_map *chl_map,
- int ch_count,
- int byte_count)
+static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
{
- int sge_no;
- int sge_bytes;
- int page_off;
- int page_no;
- int ch_bytes;
- int ch_no;
- struct rpcrdma_read_chunk *ch;
+ if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
+ RDMA_TRANSPORT_IWARP)
+ return 1;
+ else
+ return min_t(int, sge_count, xprt->sc_max_sge);
+}
- sge_no = 0;
- page_no = 0;
- page_off = 0;
- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
- ch_no = 0;
- ch_bytes = ntohl(ch->rc_target.rs_length);
- head->arg.head[0] = rqstp->rq_arg.head[0];
- head->arg.tail[0] = rqstp->rq_arg.tail[0];
- head->arg.pages = &head->pages[head->count];
- head->hdr_count = head->count; /* save count of hdr pages */
- head->arg.page_base = 0;
- head->arg.page_len = ch_bytes;
- head->arg.len = rqstp->rq_arg.len + ch_bytes;
- head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
- head->count++;
- chl_map->ch[0].start = 0;
- while (byte_count) {
- rpl_map->sge[sge_no].iov_base =
- page_address(rqstp->rq_arg.pages[page_no]) + page_off;
- sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
- rpl_map->sge[sge_no].iov_len = sge_bytes;
- /*
- * Don't bump head->count here because the same page
- * may be used by multiple SGE.
- */
- head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
+typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ int *page_no,
+ u32 *page_offset,
+ u32 rs_handle,
+ u32 rs_length,
+ u64 rs_offset,
+ int last);
+
+/* Issue an RDMA_READ using the local lkey to map the data sink */
+static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ int *page_no,
+ u32 *page_offset,
+ u32 rs_handle,
+ u32 rs_length,
+ u64 rs_offset,
+ int last)
+{
+ struct ib_send_wr read_wr;
+ int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+ struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
+ int ret, read, pno;
+ u32 pg_off = *page_offset;
+ u32 pg_no = *page_no;
- byte_count -= sge_bytes;
- ch_bytes -= sge_bytes;
- sge_no++;
- /*
- * If all bytes for this chunk have been mapped to an
- * SGE, move to the next SGE
- */
- if (ch_bytes == 0) {
- chl_map->ch[ch_no].count =
- sge_no - chl_map->ch[ch_no].start;
- ch_no++;
- ch++;
- chl_map->ch[ch_no].start = sge_no;
- ch_bytes = ntohl(ch->rc_target.rs_length);
- /* If bytes remaining account for next chunk */
- if (byte_count) {
- head->arg.page_len += ch_bytes;
- head->arg.len += ch_bytes;
- head->arg.buflen += ch_bytes;
- }
+ ctxt->direction = DMA_FROM_DEVICE;
+ ctxt->read_hdr = head;
+ pages_needed =
+ min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
+ read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+
+ for (pno = 0; pno < pages_needed; pno++) {
+ int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
+
+ head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
+ head->arg.page_len += len;
+ head->arg.len += len;
+ if (!pg_off)
+ head->count++;
+ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+ ctxt->sge[pno].addr =
+ ib_dma_map_page(xprt->sc_cm_id->device,
+ head->arg.pages[pg_no], pg_off,
+ PAGE_SIZE - pg_off,
+ DMA_FROM_DEVICE);
+ ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
+ ctxt->sge[pno].addr);
+ if (ret)
+ goto err;
+ atomic_inc(&xprt->sc_dma_used);
+
+ /* The lkey here is either a local dma lkey or a dma_mr lkey */
+ ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[pno].length = len;
+ ctxt->count++;
+
+ /* adjust offset and wrap to next page if needed */
+ pg_off += len;
+ if (pg_off == PAGE_SIZE) {
+ pg_off = 0;
+ pg_no++;
}
- /*
- * If this SGE consumed all of the page, move to the
- * next page
- */
- if ((sge_bytes + page_off) == PAGE_SIZE) {
- page_no++;
- page_off = 0;
- /*
- * If there are still bytes left to map, bump
- * the page count
- */
- if (byte_count)
- head->count++;
- } else
- page_off += sge_bytes;
+ rs_length -= len;
}
- BUG_ON(byte_count != 0);
- return sge_no;
+
+ if (last && rs_length == 0)
+ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ else
+ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+
+ memset(&read_wr, 0, sizeof(read_wr));
+ read_wr.wr_id = (unsigned long)ctxt;
+ read_wr.opcode = IB_WR_RDMA_READ;
+ ctxt->wr_op = read_wr.opcode;
+ read_wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.wr.rdma.rkey = rs_handle;
+ read_wr.wr.rdma.remote_addr = rs_offset;
+ read_wr.sg_list = ctxt->sge;
+ read_wr.num_sge = pages_needed;
+
+ ret = svc_rdma_send(xprt, &read_wr);
+ if (ret) {
+ pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ goto err;
+ }
+
+ /* return current location in page array */
+ *page_no = pg_no;
+ *page_offset = pg_off;
+ ret = read;
+ atomic_inc(&rdma_stat_read);
+ return ret;
+ err:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
+ return ret;
}
-/* Map a read-chunk-list to an XDR and fast register the page-list.
- *
- * Assumptions:
- * - chunk[0] position points to pages[0] at an offset of 0
- * - pages[] will be made physically contiguous by creating a one-off memory
- * region using the fastreg verb.
- * - byte_count is # of bytes in read-chunk-list
- * - ch_count is # of chunks in read-chunk-list
- *
- * Output:
- * - sge array pointing into pages[] array.
- * - chunk_sge array specifying sge index and count for each
- * chunk in the read list
- */
-static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
+/* Issue an RDMA_READ using an FRMR to map the data sink */
+static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head,
- struct rpcrdma_msg *rmsgp,
- struct svc_rdma_req_map *rpl_map,
- struct svc_rdma_req_map *chl_map,
- int ch_count,
- int byte_count)
+ int *page_no,
+ u32 *page_offset,
+ u32 rs_handle,
+ u32 rs_length,
+ u64 rs_offset,
+ int last)
{
- int page_no;
- int ch_no;
- u32 offset;
- struct rpcrdma_read_chunk *ch;
- struct svc_rdma_fastreg_mr *frmr;
- int ret = 0;
+ struct ib_send_wr read_wr;
+ struct ib_send_wr inv_wr;
+ struct ib_send_wr fastreg_wr;
+ u8 key;
+ int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+ struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
+ struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
+ int ret, read, pno;
+ u32 pg_off = *page_offset;
+ u32 pg_no = *page_no;
- frmr = svc_rdma_get_frmr(xprt);
if (IS_ERR(frmr))
return -ENOMEM;
- head->frmr = frmr;
- head->arg.head[0] = rqstp->rq_arg.head[0];
- head->arg.tail[0] = rqstp->rq_arg.tail[0];
- head->arg.pages = &head->pages[head->count];
- head->hdr_count = head->count; /* save count of hdr pages */
- head->arg.page_base = 0;
- head->arg.page_len = byte_count;
- head->arg.len = rqstp->rq_arg.len + byte_count;
- head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
+ ctxt->direction = DMA_FROM_DEVICE;
+ ctxt->frmr = frmr;
+ pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
+ read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
- /* Fast register the page list */
- frmr->kva = page_address(rqstp->rq_arg.pages[0]);
+ frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
frmr->direction = DMA_FROM_DEVICE;
frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
- frmr->map_len = byte_count;
- frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
- for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
- frmr->page_list->page_list[page_no] =
+ frmr->map_len = pages_needed << PAGE_SHIFT;
+ frmr->page_list_len = pages_needed;
+
+ for (pno = 0; pno < pages_needed; pno++) {
+ int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
+
+ head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
+ head->arg.page_len += len;
+ head->arg.len += len;
+ if (!pg_off)
+ head->count++;
+ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+ frmr->page_list->page_list[pno] =
ib_dma_map_page(xprt->sc_cm_id->device,
- rqstp->rq_arg.pages[page_no], 0,
+ head->arg.pages[pg_no], 0,
PAGE_SIZE, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- frmr->page_list->page_list[page_no]))
- goto fatal_err;
+ ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[pno]);
+ if (ret)
+ goto err;
atomic_inc(&xprt->sc_dma_used);
- head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
- }
- head->count += page_no;
- /* rq_respages points one past arg pages */
- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
-
- /* Create the reply and chunk maps */
- offset = 0;
- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
- for (ch_no = 0; ch_no < ch_count; ch_no++) {
- int len = ntohl(ch->rc_target.rs_length);
- rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
- rpl_map->sge[ch_no].iov_len = len;
- chl_map->ch[ch_no].count = 1;
- chl_map->ch[ch_no].start = ch_no;
- offset += len;
- ch++;
+ /* adjust offset and wrap to next page if needed */
+ pg_off += len;
+ if (pg_off == PAGE_SIZE) {
+ pg_off = 0;
+ pg_no++;
+ }
+ rs_length -= len;
}
- ret = svc_rdma_fastreg(xprt, frmr);
- if (ret)
- goto fatal_err;
-
- return ch_no;
-
- fatal_err:
- printk("svcrdma: error fast registering xdr for xprt %p", xprt);
- svc_rdma_put_frmr(xprt, frmr);
- return -EIO;
-}
-
-static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
- struct svc_rdma_op_ctxt *ctxt,
- struct svc_rdma_fastreg_mr *frmr,
- struct kvec *vec,
- u64 *sgl_offset,
- int count)
-{
- int i;
- unsigned long off;
+ if (last && rs_length == 0)
+ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ else
+ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
- ctxt->count = count;
- ctxt->direction = DMA_FROM_DEVICE;
- for (i = 0; i < count; i++) {
- ctxt->sge[i].length = 0; /* in case map fails */
- if (!frmr) {
- BUG_ON(!virt_to_page(vec[i].iov_base));
- off = (unsigned long)vec[i].iov_base & ~PAGE_MASK;
- ctxt->sge[i].addr =
- ib_dma_map_page(xprt->sc_cm_id->device,
- virt_to_page(vec[i].iov_base),
- off,
- vec[i].iov_len,
- DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- ctxt->sge[i].addr))
- return -EINVAL;
- ctxt->sge[i].lkey = xprt->sc_dma_lkey;
- atomic_inc(&xprt->sc_dma_used);
- } else {
- ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
- ctxt->sge[i].lkey = frmr->mr->lkey;
- }
- ctxt->sge[i].length = vec[i].iov_len;
- *sgl_offset = *sgl_offset + vec[i].iov_len;
+ /* Bump the key */
+ key = (u8)(frmr->mr->lkey & 0x000000FF);
+ ib_update_fast_reg_key(frmr->mr, ++key);
+
+ ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
+ ctxt->sge[0].lkey = frmr->mr->lkey;
+ ctxt->sge[0].length = read;
+ ctxt->count = 1;
+ ctxt->read_hdr = head;
+
+ /* Prepare FASTREG WR */
+ memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.send_flags = IB_SEND_SIGNALED;
+ fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+ fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+ fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.length = frmr->map_len;
+ fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+ fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+ fastreg_wr.next = &read_wr;
+
+ /* Prepare RDMA_READ */
+ memset(&read_wr, 0, sizeof(read_wr));
+ read_wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.wr.rdma.rkey = rs_handle;
+ read_wr.wr.rdma.remote_addr = rs_offset;
+ read_wr.sg_list = ctxt->sge;
+ read_wr.num_sge = 1;
+ if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
+ read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ read_wr.wr_id = (unsigned long)ctxt;
+ read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
+ } else {
+ read_wr.opcode = IB_WR_RDMA_READ;
+ read_wr.next = &inv_wr;
+ /* Prepare invalidate */
+ memset(&inv_wr, 0, sizeof(inv_wr));
+ inv_wr.wr_id = (unsigned long)ctxt;
+ inv_wr.opcode = IB_WR_LOCAL_INV;
+ inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
+ inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
+ }
+ ctxt->wr_op = read_wr.opcode;
+
+ /* Post the chain */
+ ret = svc_rdma_send(xprt, &fastreg_wr);
+ if (ret) {
+ pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ goto err;
}
- return 0;
-}
-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
-{
- if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
- RDMA_TRANSPORT_IWARP) &&
- sge_count > 1)
- return 1;
- else
- return min_t(int, sge_count, xprt->sc_max_sge);
+ /* return current location in page array */
+ *page_no = pg_no;
+ *page_offset = pg_off;
+ ret = read;
+ atomic_inc(&rdma_stat_read);
+ return ret;
+ err:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
+ svc_rdma_put_frmr(xprt, frmr);
+ return ret;
}
-/*
- * Use RDMA_READ to read data from the advertised client buffer into the
- * XDR stream starting at rq_arg.head[0].iov_base.
- * Each chunk in the array
- * contains the following fields:
- * discrim - '1', This isn't used for data placement
- * position - The xdr stream offset (the same for every chunk)
- * handle - RMR for client memory region
- * length - data transfer length
- * offset - 64 bit tagged offset in remote memory region
- *
- * On our side, we need to read into a pagelist. The first page immediately
- * follows the RPC header.
- *
- * This function returns:
- * 0 - No error and no read-list found.
- *
- * 1 - Successful read-list processing. The data is not yet in
- * the pagelist and therefore the RPC request must be deferred. The
- * I/O completion will enqueue the transport again and
- * svc_rdma_recvfrom will complete the request.
- *
- * <0 - Error processing/posting read-list.
- *
- * NOTE: The ctxt must not be touched after the last WR has been posted
- * because the I/O completion processing may occur on another
- * processor and free / modify the context. Ne touche pas!
- */
-static int rdma_read_xdr(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rmsgp,
- struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *hdr_ctxt)
+static int rdma_read_chunks(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rmsgp,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head)
{
- struct ib_send_wr read_wr;
- struct ib_send_wr inv_wr;
- int err = 0;
- int ch_no;
- int ch_count;
- int byte_count;
- int sge_count;
- u64 sgl_offset;
+ int page_no, ch_count, ret;
struct rpcrdma_read_chunk *ch;
- struct svc_rdma_op_ctxt *ctxt = NULL;
- struct svc_rdma_req_map *rpl_map;
- struct svc_rdma_req_map *chl_map;
+ u32 page_offset, byte_count;
+ u64 rs_offset;
+ rdma_reader_fn reader;
/* If no read list is present, return 0 */
ch = svc_rdma_get_read_chunk(rmsgp);
@@ -405,129 +384,55 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
if (ch_count > RPCSVC_MAXPAGES)
return -EINVAL;
- /* Allocate temporary reply and chunk maps */
- rpl_map = svc_rdma_get_req_map();
- chl_map = svc_rdma_get_req_map();
+ /* The request is completed when the RDMA_READs complete. The
+ * head context keeps all the pages that comprise the
+ * request.
+ */
+ head->arg.head[0] = rqstp->rq_arg.head[0];
+ head->arg.tail[0] = rqstp->rq_arg.tail[0];
+ head->arg.pages = &head->pages[head->count];
+ head->hdr_count = head->count;
+ head->arg.page_base = 0;
+ head->arg.page_len = 0;
+ head->arg.len = rqstp->rq_arg.len;
+ head->arg.buflen = rqstp->rq_arg.buflen;
- if (!xprt->sc_frmr_pg_list_len)
- sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
- rpl_map, chl_map, ch_count,
- byte_count);
+ /* Use FRMR if supported */
+ if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)
+ reader = rdma_read_chunk_frmr;
else
- sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
- rpl_map, chl_map, ch_count,
- byte_count);
- if (sge_count < 0) {
- err = -EIO;
- goto out;
- }
-
- sgl_offset = 0;
- ch_no = 0;
+ reader = rdma_read_chunk_lcl;
+ page_no = 0; page_offset = 0;
for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
- ch->rc_discrim != 0; ch++, ch_no++) {
- u64 rs_offset;
-next_sge:
- ctxt = svc_rdma_get_context(xprt);
- ctxt->direction = DMA_FROM_DEVICE;
- ctxt->frmr = hdr_ctxt->frmr;
- ctxt->read_hdr = NULL;
- clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
- clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+ ch->rc_discrim != 0; ch++) {
- /* Prepare READ WR */
- memset(&read_wr, 0, sizeof read_wr);
- read_wr.wr_id = (unsigned long)ctxt;
- read_wr.opcode = IB_WR_RDMA_READ;
- ctxt->wr_op = read_wr.opcode;
- read_wr.send_flags = IB_SEND_SIGNALED;
- read_wr.wr.rdma.rkey = ntohl(ch->rc_target.rs_handle);
xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
&rs_offset);
- read_wr.wr.rdma.remote_addr = rs_offset + sgl_offset;
- read_wr.sg_list = ctxt->sge;
- read_wr.num_sge =
- rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
- err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
- &rpl_map->sge[chl_map->ch[ch_no].start],
- &sgl_offset,
- read_wr.num_sge);
- if (err) {
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 0);
- goto out;
- }
- if (((ch+1)->rc_discrim == 0) &&
- (read_wr.num_sge == chl_map->ch[ch_no].count)) {
- /*
- * Mark the last RDMA_READ with a bit to
- * indicate all RPC data has been fetched from
- * the client and the RPC needs to be enqueued.
- */
- set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
- if (hdr_ctxt->frmr) {
- set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
- /*
- * Invalidate the local MR used to map the data
- * sink.
- */
- if (xprt->sc_dev_caps &
- SVCRDMA_DEVCAP_READ_W_INV) {
- read_wr.opcode =
- IB_WR_RDMA_READ_WITH_INV;
- ctxt->wr_op = read_wr.opcode;
- read_wr.ex.invalidate_rkey =
- ctxt->frmr->mr->lkey;
- } else {
- /* Prepare INVALIDATE WR */
- memset(&inv_wr, 0, sizeof inv_wr);
- inv_wr.opcode = IB_WR_LOCAL_INV;
- inv_wr.send_flags = IB_SEND_SIGNALED;
- inv_wr.ex.invalidate_rkey =
- hdr_ctxt->frmr->mr->lkey;
- read_wr.next = &inv_wr;
- }
- }
- ctxt->read_hdr = hdr_ctxt;
+ byte_count = ntohl(ch->rc_target.rs_length);
+
+ while (byte_count > 0) {
+ ret = reader(xprt, rqstp, head,
+ &page_no, &page_offset,
+ ntohl(ch->rc_target.rs_handle),
+ byte_count, rs_offset,
+ ((ch+1)->rc_discrim == 0) /* last */
+ );
+ if (ret < 0)
+ goto err;
+ byte_count -= ret;
+ rs_offset += ret;
+ head->arg.buflen += ret;
}
- /* Post the read */
- err = svc_rdma_send(xprt, &read_wr);
- if (err) {
- printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
- err);
- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 0);
- goto out;
- }
- atomic_inc(&rdma_stat_read);
-
- if (read_wr.num_sge < chl_map->ch[ch_no].count) {
- chl_map->ch[ch_no].count -= read_wr.num_sge;
- chl_map->ch[ch_no].start += read_wr.num_sge;
- goto next_sge;
- }
- sgl_offset = 0;
- err = 1;
}
-
- out:
- svc_rdma_put_req_map(rpl_map);
- svc_rdma_put_req_map(chl_map);
-
+ ret = 1;
+ err:
/* Detach arg pages. svc_recv will replenish them */
- for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
- rqstp->rq_pages[ch_no] = NULL;
-
- /*
- * Detach res pages. If svc_release sees any it will attempt to
- * put them.
- */
- while (rqstp->rq_next_page != rqstp->rq_respages)
- *(--rqstp->rq_next_page) = NULL;
+ for (page_no = 0;
+ &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++)
+ rqstp->rq_pages[page_no] = NULL;
- return err;
+ return ret;
}
static int rdma_read_complete(struct svc_rqst *rqstp,
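
The rewritten rdma_read_chunks() above walks the client's read list and hands each chunk to a reader callback (rdma_read_chunk_frmr or rdma_read_chunk_lcl, chosen once per transport from the FAST_REG capability bit), looping until the chunk's rs_length is consumed. A minimal user-space sketch of that dispatch-and-drain pattern, with hypothetical stand-in types for the svcrdma structures:

/* Hypothetical stand-ins for the svcrdma structures; illustration only. */
struct chunk {
        unsigned int rs_length;         /* bytes the client advertised */
        unsigned long long rs_offset;   /* remote offset to read from */
};

typedef int (*reader_fn)(const struct chunk *ch,
                         unsigned int byte_count,
                         unsigned long long rs_offset);

/* Pretend each call posts one RDMA READ of at most 4 KB. */
static int read_one_wr(const struct chunk *ch,
                       unsigned int byte_count,
                       unsigned long long rs_offset)
{
        (void)ch; (void)rs_offset;
        return byte_count < 4096 ? (int)byte_count : 4096;
}

static int drain_read_list(const struct chunk *chunks, int nchunks,
                           reader_fn reader)
{
        for (int i = 0; i < nchunks; i++) {
                unsigned int byte_count = chunks[i].rs_length;
                unsigned long long rs_offset = chunks[i].rs_offset;

                while (byte_count > 0) {
                        int ret = reader(&chunks[i], byte_count, rs_offset);
                        if (ret < 0)
                                return ret;     /* caller closes the transport */
                        byte_count -= ret;
                        rs_offset += ret;
                }
        }
        return 1;       /* all READ WRs posted; completions arrive later */
}

drain_read_list(chunks, n, read_one_wr) returns 1 once every chunk is fully posted, mirroring the "ret = 1" success path above; a negative reader return aborts and lets the caller close the transport.
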
@@ -550,7 +455,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
/* rq_respages starts after the last arg page */
rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
- rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
/* Rebuild rq_arg head and tail. */
rqstp->rq_arg.head[0] = head->arg.head[0];
@@ -599,13 +504,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svc_rdma_op_ctxt,
dto_q);
list_del_init(&ctxt->dto_q);
- }
- if (ctxt) {
spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
return rdma_read_complete(rqstp, ctxt);
- }
-
- if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
+ } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
struct svc_rdma_op_ctxt,
dto_q);
@@ -625,7 +526,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
goto close_out;
- BUG_ON(ret);
goto out;
}
dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
@@ -648,12 +548,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
}
/* Read read-list data. */
- ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
+ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
if (ret > 0) {
/* read-list posted, defer until data received from client. */
goto defer;
- }
- if (ret < 0) {
+ } else if (ret < 0) {
/* Post of read-list failed, free context. */
svc_rdma_put_context(ctxt, 1);
return 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index c1d124dc772..49fd21a5c21 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -49,152 +50,6 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-/* Encode an XDR as an array of IB SGE
- *
- * Assumptions:
- * - head[0] is physically contiguous.
- * - tail[0] is physically contiguous.
- * - pages[] is not physically or virtually contiguous and consists of
- * PAGE_SIZE elements.
- *
- * Output:
- * SGE[0] reserved for RPCRDMA header
- * SGE[1] data from xdr->head[]
- * SGE[2..sge_count-2] data from xdr->pages[]
- * SGE[sge_count-1] data from xdr->tail.
- *
- * The max SGE we need is the length of the XDR / pagesize + one for
- * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
- * reserves a page for both the request and the reply header, and this
- * array is only concerned with the reply, we are assured that we have
- * one extra page for the RPCRDMA header.
- */
-static int fast_reg_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec)
-{
- int sge_no;
- u32 sge_bytes;
- u32 page_bytes;
- u32 page_off;
- int page_no = 0;
- u8 *frva;
- struct svc_rdma_fastreg_mr *frmr;
-
- frmr = svc_rdma_get_frmr(xprt);
- if (IS_ERR(frmr))
- return -ENOMEM;
- vec->frmr = frmr;
-
- /* Skip the RPCRDMA header */
- sge_no = 1;
-
- /* Map the head. */
- frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
- vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
- vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
- vec->count = 2;
- sge_no++;
-
- /* Map the XDR head */
- frmr->kva = frva;
- frmr->direction = DMA_TO_DEVICE;
- frmr->access_flags = 0;
- frmr->map_len = PAGE_SIZE;
- frmr->page_list_len = 1;
- page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
- frmr->page_list->page_list[page_no] =
- ib_dma_map_page(xprt->sc_cm_id->device,
- virt_to_page(xdr->head[0].iov_base),
- page_off,
- PAGE_SIZE - page_off,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- frmr->page_list->page_list[page_no]))
- goto fatal_err;
- atomic_inc(&xprt->sc_dma_used);
-
- /* Map the XDR page list */
- page_off = xdr->page_base;
- page_bytes = xdr->page_len + page_off;
- if (!page_bytes)
- goto encode_tail;
-
- /* Map the pages */
- vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
- vec->sge[sge_no].iov_len = page_bytes;
- sge_no++;
- while (page_bytes) {
- struct page *page;
-
- page = xdr->pages[page_no++];
- sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
- page_bytes -= sge_bytes;
-
- frmr->page_list->page_list[page_no] =
- ib_dma_map_page(xprt->sc_cm_id->device,
- page, page_off,
- sge_bytes, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- frmr->page_list->page_list[page_no]))
- goto fatal_err;
-
- atomic_inc(&xprt->sc_dma_used);
- page_off = 0; /* reset for next time through loop */
- frmr->map_len += PAGE_SIZE;
- frmr->page_list_len++;
- }
- vec->count++;
-
- encode_tail:
- /* Map tail */
- if (0 == xdr->tail[0].iov_len)
- goto done;
-
- vec->count++;
- vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
-
- if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
- ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
- /*
- * If head and tail use the same page, we don't need
- * to map it again.
- */
- vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
- } else {
- void *va;
-
- /* Map another page for the tail */
- page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
- va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
- vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
-
- frmr->page_list->page_list[page_no] =
- ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va),
- page_off,
- PAGE_SIZE,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- frmr->page_list->page_list[page_no]))
- goto fatal_err;
- atomic_inc(&xprt->sc_dma_used);
- frmr->map_len += PAGE_SIZE;
- frmr->page_list_len++;
- }
-
- done:
- if (svc_rdma_fastreg(xprt, frmr))
- goto fatal_err;
-
- return 0;
-
- fatal_err:
- printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
- vec->frmr = NULL;
- svc_rdma_put_frmr(xprt, frmr);
- return -EIO;
-}
-
static int map_xdr(struct svcxprt_rdma *xprt,
struct xdr_buf *xdr,
struct svc_rdma_req_map *vec)
@@ -208,9 +63,6 @@ static int map_xdr(struct svcxprt_rdma *xprt,
BUG_ON(xdr->len !=
(xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
- if (xprt->sc_frmr_pg_list_len)
- return fast_reg_xdr(xprt, xdr, vec);
-
/* Skip the first sge, this is for the RPCRDMA header */
sge_no = 1;
@@ -265,6 +117,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
xdr_off -= xdr->head[0].iov_len;
if (xdr_off < xdr->page_len) {
/* This offset is in the page list */
+ xdr_off += xdr->page_base;
page = xdr->pages[xdr_off >> PAGE_SHIFT];
xdr_off &= ~PAGE_MASK;
} else {
@@ -281,8 +134,6 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
}
/* Assumptions:
- * - We are using FRMR
- * - or -
* - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
*/
static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
@@ -326,23 +177,16 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge_bytes = min_t(size_t,
bc, vec->sge[xdr_sge_no].iov_len-sge_off);
sge[sge_no].length = sge_bytes;
- if (!vec->frmr) {
- sge[sge_no].addr =
- dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
- sge_bytes, DMA_TO_DEVICE);
- xdr_off += sge_bytes;
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- sge[sge_no].addr))
- goto err;
- atomic_inc(&xprt->sc_dma_used);
- sge[sge_no].lkey = xprt->sc_dma_lkey;
- } else {
- sge[sge_no].addr = (unsigned long)
- vec->sge[xdr_sge_no].iov_base + sge_off;
- sge[sge_no].lkey = vec->frmr->mr->lkey;
- }
+ sge[sge_no].addr =
+ dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
+ sge_bytes, DMA_TO_DEVICE);
+ xdr_off += sge_bytes;
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ sge[sge_no].addr))
+ goto err;
+ atomic_inc(&xprt->sc_dma_used);
+ sge[sge_no].lkey = xprt->sc_dma_lkey;
ctxt->count++;
- ctxt->frmr = vec->frmr;
sge_off = 0;
sge_no++;
xdr_sge_no++;
@@ -368,7 +212,6 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
return 0;
err:
svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_frmr(xprt, vec->frmr);
svc_rdma_put_context(ctxt, 0);
/* Fatal error, close transport */
return -EIO;
@@ -396,10 +239,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[1];
- if (vec->frmr)
- max_write = vec->frmr->map_len;
- else
- max_write = xprt->sc_max_sge * PAGE_SIZE;
+ max_write = xprt->sc_max_sge * PAGE_SIZE;
/* Write chunks start at the pagelist */
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
@@ -471,10 +311,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[2];
- if (vec->frmr)
- max_write = vec->frmr->map_len;
- else
- max_write = xprt->sc_max_sge * PAGE_SIZE;
+ max_write = xprt->sc_max_sge * PAGE_SIZE;
/* xdr offset starts at RPC message */
nchunks = ntohl(arg_ary->wc_nchunks);
@@ -544,7 +381,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
int byte_count)
{
struct ib_send_wr send_wr;
- struct ib_send_wr inv_wr;
int sge_no;
int sge_bytes;
int page_no;
@@ -558,7 +394,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
"svcrdma: could not post a receive buffer, err=%d."
"Closing transport %p.\n", ret, rdma);
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
- svc_rdma_put_frmr(rdma, vec->frmr);
svc_rdma_put_context(ctxt, 0);
return -ENOTCONN;
}
@@ -566,11 +401,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
/* Prepare the context */
ctxt->pages[0] = page;
ctxt->count = 1;
- ctxt->frmr = vec->frmr;
- if (vec->frmr)
- set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
- else
- clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
/* Prepare the SGE for the RPCRDMA Header */
ctxt->sge[0].lkey = rdma->sc_dma_lkey;
@@ -589,21 +419,15 @@ static int send_reply(struct svcxprt_rdma *rdma,
int xdr_off = 0;
sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
byte_count -= sge_bytes;
- if (!vec->frmr) {
- ctxt->sge[sge_no].addr =
- dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
- sge_bytes, DMA_TO_DEVICE);
- xdr_off += sge_bytes;
- if (ib_dma_mapping_error(rdma->sc_cm_id->device,
- ctxt->sge[sge_no].addr))
- goto err;
- atomic_inc(&rdma->sc_dma_used);
- ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
- } else {
- ctxt->sge[sge_no].addr = (unsigned long)
- vec->sge[sge_no].iov_base;
- ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
- }
+ ctxt->sge[sge_no].addr =
+ dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
+ sge_bytes, DMA_TO_DEVICE);
+ xdr_off += sge_bytes;
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+ ctxt->sge[sge_no].addr))
+ goto err;
+ atomic_inc(&rdma->sc_dma_used);
+ ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
ctxt->sge[sge_no].length = sge_bytes;
}
BUG_ON(byte_count != 0);
@@ -625,6 +449,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
if (page_no+1 >= sge_no)
ctxt->sge[page_no+1].length = 0;
}
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
BUG_ON(sge_no > rdma->sc_max_sge);
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
@@ -633,15 +459,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
send_wr.num_sge = sge_no;
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
- if (vec->frmr) {
- /* Prepare INVALIDATE WR */
- memset(&inv_wr, 0, sizeof inv_wr);
- inv_wr.opcode = IB_WR_LOCAL_INV;
- inv_wr.send_flags = IB_SEND_SIGNALED;
- inv_wr.ex.invalidate_rkey =
- vec->frmr->mr->lkey;
- send_wr.next = &inv_wr;
- }
ret = svc_rdma_send(rdma, &send_wr);
if (ret)
@@ -651,7 +468,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
err:
svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_frmr(rdma, vec->frmr);
svc_rdma_put_context(ctxt, 1);
return -EIO;
}
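
With fast_reg_xdr() gone, send_write() and send_reply() above DMA-map every SGE through dma_map_xdr() and use the device's local DMA lkey unconditionally. A sketch of that map-check-account loop, with map_xdr_range() as a hypothetical stand-in for the real ib_dma_map_page()-based mapper:

#include <stdint.h>
#include <stddef.h>

struct sge {
        uint64_t addr;
        uint32_t length;
        uint32_t lkey;
};

/* Hypothetical mapper standing in for dma_map_xdr(); 0 means failure. */
extern uint64_t map_xdr_range(size_t xdr_off, size_t len);

static int map_send_sges(struct sge *sge, const size_t *seg_len, int nsge,
                         uint32_t dma_lkey, unsigned int *dma_used)
{
        size_t xdr_off = 0;

        for (int i = 0; i < nsge; i++) {
                sge[i].length = (uint32_t)seg_len[i];
                sge[i].addr = map_xdr_range(xdr_off, seg_len[i]);
                if (!sge[i].addr)
                        return -1;      /* caller unmaps and closes transport */
                xdr_off += seg_len[i];
                (*dma_used)++;          /* mirrors atomic_inc(&sc_dma_used) */
                sge[i].lkey = dma_lkey; /* one local lkey for every mapping */
        }
        return 0;
}
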
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 62e4f9bcc38..e7323fbbd34 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -65,6 +66,7 @@ static void dto_tasklet_func(unsigned long data);
static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
+static int svc_rdma_secure_port(struct svc_rqst *);
static void rq_cq_reap(struct svcxprt_rdma *xprt);
static void sq_cq_reap(struct svcxprt_rdma *xprt);
@@ -82,6 +84,7 @@ static struct svc_xprt_ops svc_rdma_ops = {
.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
.xpo_has_wspace = svc_rdma_has_wspace,
.xpo_accept = svc_rdma_accept,
+ .xpo_secure_port = svc_rdma_secure_port,
};
struct svc_xprt_class svc_rdma_class = {
@@ -160,7 +163,6 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void)
schedule_timeout_uninterruptible(msecs_to_jiffies(500));
}
map->count = 0;
- map->frmr = NULL;
return map;
}
@@ -336,22 +338,21 @@ static void process_context(struct svcxprt_rdma *xprt,
switch (ctxt->wr_op) {
case IB_WR_SEND:
- if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
- svc_rdma_put_frmr(xprt, ctxt->frmr);
+ BUG_ON(ctxt->frmr);
svc_rdma_put_context(ctxt, 1);
break;
case IB_WR_RDMA_WRITE:
+ BUG_ON(ctxt->frmr);
svc_rdma_put_context(ctxt, 0);
break;
case IB_WR_RDMA_READ:
case IB_WR_RDMA_READ_WITH_INV:
+ svc_rdma_put_frmr(xprt, ctxt->frmr);
if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
BUG_ON(!read_hdr);
- if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
- svc_rdma_put_frmr(xprt, ctxt->frmr);
spin_lock_bh(&xprt->sc_rq_dto_lock);
set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
list_add_tail(&read_hdr->dto_q,
@@ -363,6 +364,7 @@ static void process_context(struct svcxprt_rdma *xprt,
break;
default:
+ BUG_ON(1);
printk(KERN_ERR "svcrdma: unexpected completion type, "
"opcode=%d\n",
ctxt->wr_op);
@@ -378,29 +380,42 @@ static void process_context(struct svcxprt_rdma *xprt,
static void sq_cq_reap(struct svcxprt_rdma *xprt)
{
struct svc_rdma_op_ctxt *ctxt = NULL;
- struct ib_wc wc;
+ struct ib_wc wc_a[6];
+ struct ib_wc *wc;
struct ib_cq *cq = xprt->sc_sq_cq;
int ret;
+ memset(wc_a, 0, sizeof(wc_a));
+
if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
return;
ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
atomic_inc(&rdma_stat_sq_poll);
- while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
- if (wc.status != IB_WC_SUCCESS)
- /* Close the transport */
- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) {
+ int i;
- /* Decrement used SQ WR count */
- atomic_dec(&xprt->sc_sq_count);
- wake_up(&xprt->sc_send_wait);
+ for (i = 0; i < ret; i++) {
+ wc = &wc_a[i];
+ if (wc->status != IB_WC_SUCCESS) {
+ dprintk("svcrdma: sq wc err status %d\n",
+ wc->status);
- ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
- if (ctxt)
- process_context(xprt, ctxt);
+ /* Close the transport */
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ }
- svc_xprt_put(&xprt->sc_xprt);
+ /* Decrement used SQ WR count */
+ atomic_dec(&xprt->sc_sq_count);
+ wake_up(&xprt->sc_send_wait);
+
+ ctxt = (struct svc_rdma_op_ctxt *)
+ (unsigned long)wc->wr_id;
+ if (ctxt)
+ process_context(xprt, ctxt);
+
+ svc_xprt_put(&xprt->sc_xprt);
+ }
}
if (ctxt)
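
sq_cq_reap() above now pulls up to six completions per ib_poll_cq() call instead of one, amortizing the polling overhead on a busy send queue. The batch-poll shape in isolation, against hypothetical poll_cq() and process_context() stubs:

#include <string.h>

#define WC_BATCH 6

struct wc {
        int status;             /* 0 == IB_WC_SUCCESS */
        unsigned long wr_id;    /* stashed svc_rdma_op_ctxt pointer */
};

/* Hypothetical stand-ins for ib_poll_cq() and context processing. */
extern int poll_cq(struct wc *wcs, int n);
extern void process_context(unsigned long wr_id);

static int reap_send_cq(void)
{
        struct wc wc_a[WC_BATCH];
        int ret, closing = 0;

        memset(wc_a, 0, sizeof(wc_a));
        while ((ret = poll_cq(wc_a, WC_BATCH)) > 0) {
                for (int i = 0; i < ret; i++) {
                        if (wc_a[i].status != 0)
                                closing = 1;    /* close transport after reap */
                        if (wc_a[i].wr_id)
                                process_context(wc_a[i].wr_id);
                }
        }
        return closing;
}
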
@@ -477,8 +492,7 @@ struct page *svc_rdma_get_page(void)
while ((page = alloc_page(GFP_KERNEL)) == NULL) {
/* If we can't get memory, wait a bit and try again */
- printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 "
- "jiffies.\n");
+ printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
}
return page;
@@ -994,7 +1008,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
need_dma_mr = 0;
break;
case RDMA_TRANSPORT_IB:
- if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else if (!(devattr.device_cap_flags &
+ IB_DEVICE_LOCAL_DMA_LKEY)) {
need_dma_mr = 1;
dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
} else
@@ -1191,14 +1209,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
container_of(xprt, struct svcxprt_rdma, sc_xprt);
/*
- * If there are fewer SQ WR available than required to send a
- * simple response, return false.
- */
- if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3))
- return 0;
-
- /*
- * ...or there are already waiters on the SQ,
+ * If there are already waiters on the SQ,
* return false.
*/
if (waitqueue_active(&rdma->sc_send_wait))
@@ -1208,6 +1219,11 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
return 1;
}
+static int svc_rdma_secure_port(struct svc_rqst *rqstp)
+{
+ return 1;
+}
+
/*
* Attempt to register the kvec representing the RPC memory with the
* device.
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index c9aa7a35f3b..66f91f0d071 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -51,6 +51,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
+#include <linux/sunrpc/addr.h>
#include "xprt_rdma.h"
@@ -85,7 +86,7 @@ static unsigned int max_memreg = RPCRDMA_LAST - 1;
static struct ctl_table_header *sunrpc_table_header;
-static ctl_table xr_tunables_table[] = {
+static struct ctl_table xr_tunables_table[] = {
{
.procname = "rdma_slot_table_entries",
.data = &xprt_rdma_slot_table_entries,
@@ -137,7 +138,7 @@ static ctl_table xr_tunables_table[] = {
{ },
};
-static ctl_table sunrpc_table[] = {
+static struct ctl_table sunrpc_table[] = {
{
.procname = "sunrpc",
.mode = 0555,
@@ -148,6 +149,11 @@ static ctl_table sunrpc_table[] = {
#endif
+#define RPCRDMA_BIND_TO (60U * HZ)
+#define RPCRDMA_INIT_REEST_TO (5U * HZ)
+#define RPCRDMA_MAX_REEST_TO (30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
+
static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
static void
@@ -228,7 +234,6 @@ static void
xprt_rdma_destroy(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- int rc;
dprintk("RPC: %s: called\n", __func__);
@@ -237,10 +242,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
xprt_clear_connected(xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
- rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
- if (rc)
- dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
- __func__, rc);
+ rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
@@ -288,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args)
/* 60 second timeout, no retries */
xprt->timeout = &xprt_rdma_default_timeout;
- xprt->bind_timeout = (60U * HZ);
- xprt->reestablish_timeout = (5U * HZ);
- xprt->idle_timeout = (5U * 60 * HZ);
+ xprt->bind_timeout = RPCRDMA_BIND_TO;
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+ xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
xprt->resvport = 0; /* privileged port not needed */
xprt->tsh_size = 0; /* RPC-RDMA handles framing */
@@ -390,7 +392,7 @@ out4:
xprt_rdma_free_addresses(xprt);
rc = -EINVAL;
out3:
- (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+ rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
out2:
rpcrdma_ia_close(&new_xprt->rx_ia);
out1:
@@ -426,9 +428,8 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
}
static void
-xprt_rdma_connect(struct rpc_task *task)
+xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
- struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
if (r_xprt->rx_ep.rep_connected != 0) {
@@ -436,10 +437,10 @@ xprt_rdma_connect(struct rpc_task *task)
schedule_delayed_work(&r_xprt->rdma_connect,
xprt->reestablish_timeout);
xprt->reestablish_timeout <<= 1;
- if (xprt->reestablish_timeout > (30 * HZ))
- xprt->reestablish_timeout = (30 * HZ);
- else if (xprt->reestablish_timeout < (5 * HZ))
- xprt->reestablish_timeout = (5 * HZ);
+ if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
+ xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
+ else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
} else {
schedule_delayed_work(&r_xprt->rdma_connect, 0);
if (!RPC_IS_ASYNC(task))
@@ -447,23 +448,6 @@ xprt_rdma_connect(struct rpc_task *task)
}
}
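
The reconnect path above doubles reestablish_timeout on every failed attempt and clamps it into [RPCRDMA_INIT_REEST_TO, RPCRDMA_MAX_REEST_TO]. The same exponential backoff with clamp, sketched with illustrative second-valued constants (the kernel's are jiffies):

/* Illustrative seconds; the kernel's RPCRDMA_*_TO values are HZ-based. */
#define INIT_REEST_TO   5
#define MAX_REEST_TO    30

static unsigned int next_reestablish_timeout(unsigned int cur)
{
        unsigned int to = cur << 1;     /* double after each failed attempt */

        if (to > MAX_REEST_TO)
                to = MAX_REEST_TO;
        else if (to < INIT_REEST_TO)
                to = INIT_REEST_TO;     /* also repairs a zeroed value */
        return to;
}

Successive reconnect delays therefore run 5, 10, 20, 30, 30, ... seconds.
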
-static int
-xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
-{
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
-
- /* == RPC_CWNDSCALE @ init, but *after* setup */
- if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
- r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
- dprintk("RPC: %s: cwndscale %lu\n", __func__,
- r_xprt->rx_buf.rb_cwndscale);
- BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
- }
- xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
- return xprt_reserve_xprt_cong(xprt, task);
-}
-
/*
* The RDMA allocate/free functions need the task structure as a place
* to hide the struct rpcrdma_req, which is necessary for the actual send/recv
@@ -475,11 +459,12 @@ xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
static void *
xprt_rdma_allocate(struct rpc_task *task, size_t size)
{
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
struct rpcrdma_req *req, *nreq;
req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
- BUG_ON(NULL == req);
+ if (req == NULL)
+ return NULL;
if (size > req->rl_size) {
dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
@@ -503,18 +488,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
* If the allocation or registration fails, the RPC framework
* will (doggedly) retry.
*/
- if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
- RPCRDMA_BOUNCEBUFFERS) {
- /* forced to "pure inline" */
- dprintk("RPC: %s: too much data (%zd) for inline "
- "(r/w max %d/%d)\n", __func__, size,
- rpcx_to_rdmad(xprt).inline_rsize,
- rpcx_to_rdmad(xprt).inline_wsize);
- size = req->rl_size;
- rpc_exit(task, -EIO); /* fail the operation */
- rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
- goto out;
- }
if (task->tk_flags & RPC_TASK_SWAPPER)
nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
else
@@ -543,7 +516,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
req = nreq;
}
dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
-out:
req->rl_connect_cookie = 0; /* our reserved value */
return req->rl_xdr_buf;
@@ -579,9 +551,7 @@ xprt_rdma_free(void *buffer)
__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
/*
- * Finish the deregistration. When using mw bind, this was
- * begun in rpcrdma_reply_handler(). In all other modes, we
- * do it here, in thread context. The process is considered
+ * Finish the deregistration. The process is considered
* complete when the rr_func vector becomes NULL - this
* was put in place during rpcrdma_reply_handler() - the wait
* call below will not block if the dereg is "done". If
@@ -590,12 +560,7 @@ xprt_rdma_free(void *buffer)
for (i = 0; req->rl_nchunks;) {
--req->rl_nchunks;
i += rpcrdma_deregister_external(
- &req->rl_segments[i], r_xprt, NULL);
- }
-
- if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
- rep->rr_func = NULL; /* abandon the callback */
- req->rl_reply = NULL;
+ &req->rl_segments[i], r_xprt);
}
if (req->rl_iov.length == 0) { /* see allocate above */
@@ -627,16 +592,15 @@ static int
xprt_rdma_send_request(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
- struct rpc_xprt *xprt = task->tk_xprt;
+ struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ int rc;
- /* marshal the send itself */
- if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
- r_xprt->rx_stats.failed_marshal_count++;
- dprintk("RPC: %s: rpcrdma_marshal_req failed\n",
- __func__);
- return -EIO;
+ if (req->rl_niovs == 0) {
+ rc = rpcrdma_marshal_req(rqst);
+ if (rc < 0)
+ goto failed_marshal;
}
if (req->rl_reply == NULL) /* e.g. reconnection */
@@ -660,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task)
rqst->rq_bytes_sent = 0;
return 0;
+failed_marshal:
+ r_xprt->rx_stats.failed_marshal_count++;
+ dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
+ __func__, rc);
+ if (rc == -EIO)
+ return -EIO;
drop_connection:
xprt_disconnect_done(xprt);
return -ENOTCONN; /* implies disconnect */
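
The failed_marshal label above separates permanent marshaling failures, where -EIO is returned and the RPC fails, from transient ones, which fall through to drop_connection and return -ENOTCONN so the request is retried on a fresh connection. The dispatch, as a sketch:

#include <errno.h>

/* Sketch of the error split in xprt_rdma_send_request(). */
static int marshal_error_disposition(int rc)
{
        if (rc == -EIO)
                return -EIO;            /* permanent: fail this RPC */
        return -ENOTCONN;               /* transient: disconnect and retry */
}
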
@@ -705,7 +675,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
*/
static struct rpc_xprt_ops xprt_rdma_procs = {
- .reserve_xprt = xprt_rdma_reserve_xprt,
+ .reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
.alloc_slot = xprt_alloc_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
@@ -733,7 +703,7 @@ static void __exit xprt_rdma_cleanup(void)
{
int rc;
- dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+ dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
#ifdef RPC_DEBUG
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
@@ -755,14 +725,14 @@ static int __init xprt_rdma_init(void)
if (rc)
return rc;
- dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
+ dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
- dprintk(KERN_INFO "Defaults:\n");
- dprintk(KERN_INFO "\tSlots %d\n"
+ dprintk("Defaults:\n");
+ dprintk("\tSlots %d\n"
"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
xprt_rdma_slot_table_entries,
xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
- dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
+ dprintk("\tPadding %d\n\tMemreg %d\n",
xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
#ifdef RPC_DEBUG
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 745973b729a..13dbd1c389f 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -48,8 +48,8 @@
*/
#include <linux/interrupt.h>
-#include <linux/pci.h> /* for Tavor hack below */
#include <linux/slab.h>
+#include <asm/bitops.h>
#include "xprt_rdma.h"
@@ -142,98 +142,139 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
}
}
-static inline
-void rpcrdma_event_process(struct ib_wc *wc)
+static void
+rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
- struct rpcrdma_mw *frmr;
- struct rpcrdma_rep *rep =
- (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
+ struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
- dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
- __func__, rep, wc->status, wc->opcode, wc->byte_len);
+ dprintk("RPC: %s: frmr %p status %X opcode %d\n",
+ __func__, frmr, wc->status, wc->opcode);
- if (!rep) /* send or bind completion that we don't care about */
+ if (wc->wr_id == 0ULL)
return;
-
- if (IB_WC_SUCCESS != wc->status) {
- dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
- __func__, wc->opcode, wc->status);
- rep->rr_len = ~0U;
- if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
- rpcrdma_schedule_tasklet(rep);
+ if (wc->status != IB_WC_SUCCESS)
return;
- }
- switch (wc->opcode) {
- case IB_WC_FAST_REG_MR:
- frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+ if (wc->opcode == IB_WC_FAST_REG_MR)
frmr->r.frmr.state = FRMR_IS_VALID;
- break;
- case IB_WC_LOCAL_INV:
- frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+ else if (wc->opcode == IB_WC_LOCAL_INV)
frmr->r.frmr.state = FRMR_IS_INVALID;
- break;
- case IB_WC_RECV:
- rep->rr_len = wc->byte_len;
- ib_dma_sync_single_for_cpu(
- rdmab_to_ia(rep->rr_buffer)->ri_id->device,
- rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
- /* Keep (only) the most recent credits, after check validity */
- if (rep->rr_len >= 16) {
- struct rpcrdma_msg *p =
- (struct rpcrdma_msg *) rep->rr_base;
- unsigned int credits = ntohl(p->rm_credit);
- if (credits == 0) {
- dprintk("RPC: %s: server"
- " dropped credits to 0!\n", __func__);
- /* don't deadlock */
- credits = 1;
- } else if (credits > rep->rr_buffer->rb_max_requests) {
- dprintk("RPC: %s: server"
- " over-crediting: %d (%d)\n",
- __func__, credits,
- rep->rr_buffer->rb_max_requests);
- credits = rep->rr_buffer->rb_max_requests;
- }
- atomic_set(&rep->rr_buffer->rb_credits, credits);
- }
- /* fall through */
- case IB_WC_BIND_MW:
- rpcrdma_schedule_tasklet(rep);
- break;
- default:
- dprintk("RPC: %s: unexpected WC event %X\n",
- __func__, wc->opcode);
- break;
- }
}
-static inline int
-rpcrdma_cq_poll(struct ib_cq *cq)
+static int
+rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
- struct ib_wc wc;
- int rc;
+ struct ib_wc *wcs;
+ int budget, count, rc;
- for (;;) {
- rc = ib_poll_cq(cq, 1, &wc);
- if (rc < 0) {
- dprintk("RPC: %s: ib_poll_cq failed %i\n",
- __func__, rc);
+ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+ do {
+ wcs = ep->rep_send_wcs;
+
+ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+ if (rc <= 0)
return rc;
- }
- if (rc == 0)
- break;
- rpcrdma_event_process(&wc);
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_sendcq_process_wc(wcs++);
+ } while (rc == RPCRDMA_POLLSIZE && --budget);
+ return 0;
+}
+
+/*
+ * Handle send, fast_reg_mr, and local_inv completions.
+ *
+ * Send events are typically suppressed and thus do not result
+ * in an upcall. Occasionally one is signaled, however. This
+ * prevents the provider's completion queue from wrapping and
+ * losing a completion.
+ */
+static void
+rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+{
+ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+ int rc;
+
+ rc = rpcrdma_sendcq_poll(cq, ep);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
+ return;
}
+ rc = ib_req_notify_cq(cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc == 0)
+ return;
+ if (rc < 0) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ return;
+ }
+
+ rpcrdma_sendcq_poll(cq, ep);
+}
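
rpcrdma_sendcq_upcall() above closes the classic poll/re-arm race: after draining, it re-arms the CQ with IB_CQ_REPORT_MISSED_EVENTS, and a positive return (completions arrived between the last poll and the re-arm) triggers exactly one more drain. The control flow against hypothetical stubs:

/* Hypothetical stand-ins: drain_cq() polls until the CQ is empty and
 * returns <0 on a poll error; rearm_cq() mirrors ib_req_notify_cq() with
 * IB_CQ_REPORT_MISSED_EVENTS, returning >0 if completions slipped in. */
extern int drain_cq(void);
extern int rearm_cq(void);

static void cq_upcall(void)
{
        int rc;

        if (drain_cq() < 0)
                return;                 /* poll failed; nothing else to do */

        rc = rearm_cq();
        if (rc <= 0)
                return;                 /* armed cleanly, or notify error */

        drain_cq();                     /* missed events: drain once more */
}
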
+
+static void
+rpcrdma_recvcq_process_wc(struct ib_wc *wc)
+{
+ struct rpcrdma_rep *rep =
+ (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
+
+ dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
+ __func__, rep, wc->status, wc->opcode, wc->byte_len);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ rep->rr_len = ~0U;
+ goto out_schedule;
+ }
+ if (wc->opcode != IB_WC_RECV)
+ return;
+
+ rep->rr_len = wc->byte_len;
+ ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+ rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+
+ if (rep->rr_len >= 16) {
+ struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
+ unsigned int credits = ntohl(p->rm_credit);
+
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > rep->rr_buffer->rb_max_requests)
+ credits = rep->rr_buffer->rb_max_requests;
+ atomic_set(&rep->rr_buffer->rb_credits, credits);
+ }
+
+out_schedule:
+ rpcrdma_schedule_tasklet(rep);
+}
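
The credit logic kept in rpcrdma_recvcq_process_wc() clamps the server-granted credit count into [1, rb_max_requests]: zero credits would deadlock the transport, and anything above the local slot table is capped. As a standalone helper (sketch):

static unsigned int clamp_credits(unsigned int credits,
                                  unsigned int max_requests)
{
        if (credits == 0)
                return 1;               /* never let the window close */
        if (credits > max_requests)
                return max_requests;    /* cap at the local slot table */
        return credits;
}
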
+
+static int
+rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+{
+ struct ib_wc *wcs;
+ int budget, count, rc;
+
+ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+ do {
+ wcs = ep->rep_recv_wcs;
+
+ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+ if (rc <= 0)
+ return rc;
+
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_recvcq_process_wc(wcs++);
+ } while (rc == RPCRDMA_POLLSIZE && --budget);
return 0;
}
/*
- * rpcrdma_cq_event_upcall
+ * Handle receive completions.
*
- * This upcall handles recv, send, bind and unbind events.
* It is reentrant but processes single events in order to maintain
* ordering of receives to keep server credits.
*
@@ -242,26 +283,31 @@ rpcrdma_cq_poll(struct ib_cq *cq)
* connection shutdown. That is, the structures required for
* the completion of the reply handler must remain intact until
* all memory has been reclaimed.
- *
- * Note that send events are suppressed and do not result in an upcall.
*/
static void
-rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
+rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
+ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
int rc;
- rc = rpcrdma_cq_poll(cq);
- if (rc)
+ rc = rpcrdma_recvcq_poll(cq, ep);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
return;
+ }
- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (rc) {
- dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
+ rc = ib_req_notify_cq(cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc == 0)
+ return;
+ if (rc < 0) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
return;
}
- rpcrdma_cq_poll(cq);
+ rpcrdma_recvcq_poll(cq, ep);
}
#ifdef RPC_DEBUG
@@ -493,54 +539,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
}
- switch (memreg) {
- case RPCRDMA_MEMWINDOWS:
- case RPCRDMA_MEMWINDOWS_ASYNC:
- if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
- dprintk("RPC: %s: MEMWINDOWS registration "
- "specified but not supported by adapter, "
- "using slower RPCRDMA_REGISTER\n",
- __func__);
- memreg = RPCRDMA_REGISTER;
- }
- break;
- case RPCRDMA_MTHCAFMR:
- if (!ia->ri_id->device->alloc_fmr) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
- dprintk("RPC: %s: MTHCAFMR registration "
- "specified but not supported by adapter, "
- "using riskier RPCRDMA_ALLPHYSICAL\n",
- __func__);
- memreg = RPCRDMA_ALLPHYSICAL;
-#else
- dprintk("RPC: %s: MTHCAFMR registration "
- "specified but not supported by adapter, "
- "using slower RPCRDMA_REGISTER\n",
- __func__);
- memreg = RPCRDMA_REGISTER;
-#endif
- }
- break;
- case RPCRDMA_FRMR:
+ if (memreg == RPCRDMA_FRMR) {
/* Requires both frmr reg and local dma lkey */
if ((devattr.device_cap_flags &
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
(IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
dprintk("RPC: %s: FRMR registration "
- "specified but not supported by adapter, "
- "using riskier RPCRDMA_ALLPHYSICAL\n",
- __func__);
+ "not supported by HCA\n", __func__);
+ memreg = RPCRDMA_MTHCAFMR;
+ } else {
+ /* Mind the ia limit on FRMR page list depth */
+ ia->ri_max_frmr_depth = min_t(unsigned int,
+ RPCRDMA_MAX_DATA_SEGS,
+ devattr.max_fast_reg_page_list_len);
+ }
+ }
+ if (memreg == RPCRDMA_MTHCAFMR) {
+ if (!ia->ri_id->device->alloc_fmr) {
+ dprintk("RPC: %s: MTHCAFMR registration "
+ "not supported by HCA\n", __func__);
+#if RPCRDMA_PERSISTENT_REGISTRATION
memreg = RPCRDMA_ALLPHYSICAL;
#else
- dprintk("RPC: %s: FRMR registration "
- "specified but not supported by adapter, "
- "using slower RPCRDMA_REGISTER\n",
- __func__);
- memreg = RPCRDMA_REGISTER;
+ rc = -ENOMEM;
+ goto out2;
#endif
}
- break;
}
/*
@@ -552,8 +576,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
* adapter.
*/
switch (memreg) {
- case RPCRDMA_BOUNCEBUFFERS:
- case RPCRDMA_REGISTER:
case RPCRDMA_FRMR:
break;
#if RPCRDMA_PERSISTENT_REGISTRATION
@@ -563,30 +585,26 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
IB_ACCESS_REMOTE_READ;
goto register_setup;
#endif
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- mem_priv = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_MW_BIND;
- goto register_setup;
case RPCRDMA_MTHCAFMR:
if (ia->ri_have_dma_lkey)
break;
mem_priv = IB_ACCESS_LOCAL_WRITE;
+#if RPCRDMA_PERSISTENT_REGISTRATION
register_setup:
+#endif
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
if (IS_ERR(ia->ri_bind_mem)) {
printk(KERN_ALERT "%s: ib_get_dma_mr for "
- "phys register failed with %lX\n\t"
- "Will continue with degraded performance\n",
+ "phys register failed with %lX\n",
__func__, PTR_ERR(ia->ri_bind_mem));
- memreg = RPCRDMA_REGISTER;
- ia->ri_bind_mem = NULL;
+ rc = -ENOMEM;
+ goto out2;
}
break;
default:
- printk(KERN_ERR "%s: invalid memory registration mode %d\n",
- __func__, memreg);
- rc = -EINVAL;
+ printk(KERN_ERR "RPC: Unsupported memory "
+ "registration mode: %d\n", memreg);
+ rc = -ENOMEM;
goto out2;
}
dprintk("RPC: %s: memory registration strategy is %d\n",
@@ -640,6 +658,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata)
{
struct ib_device_attr devattr;
+ struct ib_cq *sendcq, *recvcq;
int rc, err;
rc = ib_query_device(ia->ri_id->device, &devattr);
@@ -659,32 +678,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
+ case RPCRDMA_FRMR: {
+ int depth = 7;
+
/* Add room for frmr register and invalidate WRs.
* 1. FRMR reg WR for head
* 2. FRMR invalidate WR for head
- * 3. FRMR reg WR for pagelist
- * 4. FRMR invalidate WR for pagelist
+ * 3. N FRMR reg WRs for pagelist
+ * 4. N FRMR invalidate WRs for pagelist
* 5. FRMR reg WR for tail
* 6. FRMR invalidate WR for tail
* 7. The RDMA_SEND WR
*/
- ep->rep_attr.cap.max_send_wr *= 7;
+
+ /* Calculate N if the device max FRMR depth is smaller than
+ * RPCRDMA_MAX_DATA_SEGS.
+ */
+ if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ int delta = RPCRDMA_MAX_DATA_SEGS -
+ ia->ri_max_frmr_depth;
+
+ do {
+ depth += 2; /* FRMR reg + invalidate */
+ delta -= ia->ri_max_frmr_depth;
+ } while (delta > 0);
+
+ }
+ ep->rep_attr.cap.max_send_wr *= depth;
if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
- cdata->max_requests = devattr.max_qp_wr / 7;
+ cdata->max_requests = devattr.max_qp_wr / depth;
if (!cdata->max_requests)
return -EINVAL;
- ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
+ ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+ depth;
}
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- /* Add room for mw_binds+unbinds - overkill! */
- ep->rep_attr.cap.max_send_wr++;
- ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
- if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
- return -EINVAL;
- break;
+ }
default:
break;
}
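
The new max_send_wr accounting above sizes each request at a depth of 7 (head reg/invalidate, one pagelist reg/invalidate pair, tail reg/invalidate, plus the SEND), adding 2 for every extra FRMR the pagelist needs when the device's max_fast_reg_page_list_len falls short of RPCRDMA_MAX_DATA_SEGS. A worked check of the arithmetic, assuming the illustrative values 64 data segments and a device depth of 16:

#include <stdio.h>

/* Illustrative value; the real one is RPCRDMA_MAX_DATA_SEGS. */
#define MAX_DATA_SEGS 64

static int frmr_send_wr_depth(int max_frmr_depth)
{
        int depth = 7;

        if (max_frmr_depth < MAX_DATA_SEGS) {
                int delta = MAX_DATA_SEGS - max_frmr_depth;

                do {
                        depth += 2;     /* one more FRMR reg + invalidate */
                        delta -= max_frmr_depth;
                } while (delta > 0);
        }
        return depth;
}

int main(void)
{
        /* 64 segs at 16 per FRMR -> 4 FRMRs for the pagelist:
         * 4 * 2 + head 2 + tail 2 + SEND 1 = 13 WRs per request. */
        printf("%d\n", frmr_send_wr_depth(16));         /* prints 13 */
        return 0;
}
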
@@ -705,46 +734,51 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_recv_sge);
/* set trigger for requesting send completion */
- ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
- break;
- default:
- break;
- }
+ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
if (ep->rep_cqinit <= 2)
ep->rep_cqinit = 0;
INIT_CQCOUNT(ep);
ep->rep_ia = ia;
init_waitqueue_head(&ep->rep_connect_wait);
+ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
- /*
- * Create a single cq for receive dto and mw_bind (only ever
- * care about unbind, really). Send completions are suppressed.
- * Use single threaded tasklet upcalls to maintain ordering.
- */
- ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
- rpcrdma_cq_async_error_upcall, NULL,
- ep->rep_attr.cap.max_recv_wr +
+ sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+ rpcrdma_cq_async_error_upcall, ep,
ep->rep_attr.cap.max_send_wr + 1, 0);
- if (IS_ERR(ep->rep_cq)) {
- rc = PTR_ERR(ep->rep_cq);
- dprintk("RPC: %s: ib_create_cq failed: %i\n",
+ if (IS_ERR(sendcq)) {
+ rc = PTR_ERR(sendcq);
+ dprintk("RPC: %s: failed to create send CQ: %i\n",
__func__, rc);
goto out1;
}
- rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
+ rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
+ if (rc) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
+ rpcrdma_cq_async_error_upcall, ep,
+ ep->rep_attr.cap.max_recv_wr + 1, 0);
+ if (IS_ERR(recvcq)) {
+ rc = PTR_ERR(recvcq);
+ dprintk("RPC: %s: failed to create recv CQ: %i\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
if (rc) {
dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
__func__, rc);
+ ib_destroy_cq(recvcq);
goto out2;
}
- ep->rep_attr.send_cq = ep->rep_cq;
- ep->rep_attr.recv_cq = ep->rep_cq;
+ ep->rep_attr.send_cq = sendcq;
+ ep->rep_attr.recv_cq = recvcq;
/* Initialize cma parameters */
@@ -754,9 +788,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
- if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
- ep->rep_remote_cma.responder_resources = 0;
- else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
ep->rep_remote_cma.responder_resources = 32;
else
ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
@@ -768,7 +800,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
return 0;
out2:
- err = ib_destroy_cq(ep->rep_cq);
+ err = ib_destroy_cq(sendcq);
if (err)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, err);
@@ -782,11 +814,8 @@ out1:
* Disconnect and destroy endpoint. After this, the only
* valid operations on the ep are to free it (if dynamically
* allocated) or re-create it.
- *
- * The caller's error handling must be sure to not leak the endpoint
- * if this function fails.
*/
-int
+void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
@@ -794,6 +823,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: entering, connected is %d\n",
__func__, ep->rep_connected);
+ cancel_delayed_work_sync(&ep->rep_connect_worker);
+
if (ia->ri_id->qp) {
rc = rpcrdma_ep_disconnect(ep, ia);
if (rc)
@@ -809,13 +840,17 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ep->rep_pad_mr = NULL;
}
- rpcrdma_clean_cq(ep->rep_cq);
- rc = ib_destroy_cq(ep->rep_cq);
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rc = ib_destroy_cq(ep->rep_attr.recv_cq);
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
- return rc;
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rc = ib_destroy_cq(ep->rep_attr.send_cq);
+ if (rc)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, rc);
}
/*
@@ -831,17 +866,20 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
if (ep->rep_connected != 0) {
struct rpcrdma_xprt *xprt;
retry:
+ dprintk("RPC: %s: reconnecting...\n", __func__);
rc = rpcrdma_ep_disconnect(ep, ia);
if (rc && rc != -ENOTCONN)
dprintk("RPC: %s: rpcrdma_ep_disconnect"
" status %i\n", __func__, rc);
- rpcrdma_clean_cq(ep->rep_cq);
+
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
(struct sockaddr *)&xprt->rx_data.addr);
if (IS_ERR(id)) {
- rc = PTR_ERR(id);
+ rc = -EHOSTUNREACH;
goto out;
}
/* TEMP TEMP TEMP - fail if new device:
@@ -855,35 +893,32 @@ retry:
printk("RPC: %s: can't reconnect on "
"different device!\n", __func__);
rdma_destroy_id(id);
- rc = -ENETDOWN;
+ rc = -ENETUNREACH;
goto out;
}
/* END TEMP */
+ rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+ if (rc) {
+ dprintk("RPC: %s: rdma_create_qp failed %i\n",
+ __func__, rc);
+ rdma_destroy_id(id);
+ rc = -ENETUNREACH;
+ goto out;
+ }
rdma_destroy_qp(ia->ri_id);
rdma_destroy_id(ia->ri_id);
ia->ri_id = id;
+ } else {
+ dprintk("RPC: %s: connecting...\n", __func__);
+ rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ if (rc) {
+ dprintk("RPC: %s: rdma_create_qp failed %i\n",
+ __func__, rc);
+ /* do not update ep->rep_connected */
+ return -ENETUNREACH;
+ }
}
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
- if (rc) {
- dprintk("RPC: %s: rdma_create_qp failed %i\n",
- __func__, rc);
- goto out;
- }
-
-/* XXX Tavor device performs badly with 2K MTU! */
-if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
- struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
- if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
- (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
- pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
- struct ib_qp_attr attr = {
- .path_mtu = IB_MTU_1024
- };
- rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
- }
-}
-
ep->rep_connected = 0;
rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -944,7 +979,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
- rpcrdma_clean_cq(ep->rep_cq);
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -967,7 +1003,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
char *p;
- size_t len;
+ size_t len, rlen, wlen;
int i, rc;
struct rpcrdma_mw *r;
@@ -997,11 +1033,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
sizeof(struct rpcrdma_mw);
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
- sizeof(struct rpcrdma_mw);
- break;
default:
break;
}
@@ -1032,32 +1063,29 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
}
p += cdata->padding;
- /*
- * Allocate the fmr's, or mw's for mw_bind chunk registration.
- * We "cycle" the mw's in order to minimize rkey reuse,
- * and also reduce unbind-to-bind collision.
- */
INIT_LIST_HEAD(&buf->rb_mws);
r = (struct rpcrdma_mw *)p;
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
- RPCRDMA_MAX_SEGS);
+ ia->ri_max_frmr_depth);
if (IS_ERR(r->r.frmr.fr_mr)) {
rc = PTR_ERR(r->r.frmr.fr_mr);
dprintk("RPC: %s: ib_alloc_fast_reg_mr"
" failed %i\n", __func__, rc);
goto out;
}
- r->r.frmr.fr_pgl =
- ib_alloc_fast_reg_page_list(ia->ri_id->device,
- RPCRDMA_MAX_SEGS);
+ r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+ ia->ri_id->device,
+ ia->ri_max_frmr_depth);
if (IS_ERR(r->r.frmr.fr_pgl)) {
rc = PTR_ERR(r->r.frmr.fr_pgl);
dprintk("RPC: %s: "
"ib_alloc_fast_reg_page_list "
"failed %i\n", __func__, rc);
+
+ ib_dereg_mr(r->r.frmr.fr_mr);
goto out;
}
list_add(&r->mw_list, &buf->rb_mws);
@@ -1082,21 +1110,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
++r;
}
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- /* Allocate one extra request's worth, for full cycling */
- for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
- r->r.mw = ib_alloc_mw(ia->ri_pd);
- if (IS_ERR(r->r.mw)) {
- rc = PTR_ERR(r->r.mw);
- dprintk("RPC: %s: ib_alloc_mw"
- " failed %i\n", __func__, rc);
- goto out;
- }
- list_add(&r->mw_list, &buf->rb_mws);
- ++r;
- }
- break;
default:
break;
}
@@ -1105,16 +1118,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* Allocate/init the request/reply buffers. Doing this
* using kmalloc for now -- one for each buf.
*/
+ wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
+ rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
+ dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
+ __func__, wlen, rlen);
+
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
struct rpcrdma_rep *rep;
- len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
- /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
- /* Typical ~2400b, so rounding up saves work later */
- if (len < 4096)
- len = 4096;
- req = kmalloc(len, GFP_KERNEL);
+ req = kmalloc(wlen, GFP_KERNEL);
if (req == NULL) {
dprintk("RPC: %s: request buffer %d alloc"
" failed\n", __func__, i);
@@ -1126,16 +1139,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
buf->rb_send_bufs[i]->rl_buffer = buf;
rc = rpcrdma_register_internal(ia, req->rl_base,
- len - offsetof(struct rpcrdma_req, rl_base),
+ wlen - offsetof(struct rpcrdma_req, rl_base),
&buf->rb_send_bufs[i]->rl_handle,
&buf->rb_send_bufs[i]->rl_iov);
if (rc)
goto out;
- buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
+ buf->rb_send_bufs[i]->rl_size = wlen -
+ sizeof(struct rpcrdma_req);
- len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
- rep = kmalloc(len, GFP_KERNEL);
+ rep = kmalloc(rlen, GFP_KERNEL);
if (rep == NULL) {
dprintk("RPC: %s: reply buffer %d alloc failed\n",
__func__, i);
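
The wlen/rlen computation above rounds each buffer up to the next power of two with 1 << fls(size), replacing the old hard-coded round-up to 4096 and hoisting the math out of the allocation loop. The same rounding in user space, with fls() as a stand-in built on __builtin_clz():

#include <stdio.h>

/* fls(x): 1-based index of the highest set bit; 0 when x == 0. */
static int fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
        /* e.g. a 1024-byte inline write size plus ~100 bytes of struct */
        unsigned int need = 1024 + 100;
        unsigned int wlen = 1u << fls(need);    /* 1124 -> 2048 */

        /* An exact power of two doubles: 1 << fls(1024) == 2048. */
        printf("need=%u wlen=%u\n", need, wlen);
        return 0;
}
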
@@ -1145,10 +1158,9 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
memset(rep, 0, sizeof(struct rpcrdma_rep));
buf->rb_recv_bufs[i] = rep;
buf->rb_recv_bufs[i]->rr_buffer = buf;
- init_waitqueue_head(&rep->rr_unbind);
rc = rpcrdma_register_internal(ia, rep->rr_base,
- len - offsetof(struct rpcrdma_rep, rr_base),
+ rlen - offsetof(struct rpcrdma_rep, rr_base),
&buf->rb_recv_bufs[i]->rr_handle,
&buf->rb_recv_bufs[i]->rr_iov);
if (rc)
@@ -1179,7 +1191,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
/* clean up in reverse order from create
* 1. recv mr memory (mr free, then kfree)
- * 1a. bind mw memory
* 2. send mr memory (mr free, then kfree)
* 3. padding (if any) [moved to rpcrdma_ep_destroy]
* 4. arrays
@@ -1194,41 +1205,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
kfree(buf->rb_recv_bufs[i]);
}
if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
- while (!list_empty(&buf->rb_mws)) {
- r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- rc = ib_dereg_mr(r->r.frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dereg_mr"
- " failed %i\n",
- __func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
- break;
- case RPCRDMA_MTHCAFMR:
- rc = ib_dealloc_fmr(r->r.fmr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_fmr"
- " failed %i\n",
- __func__, rc);
- break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = ib_dealloc_mw(r->r.mw);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_mw"
- " failed %i\n",
- __func__, rc);
- break;
- default:
- break;
- }
- }
rpcrdma_deregister_internal(ia,
buf->rb_send_bufs[i]->rl_handle,
&buf->rb_send_bufs[i]->rl_iov);
@@ -1236,6 +1212,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
}
}
+ while (!list_empty(&buf->rb_mws)) {
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dereg_mr"
+ " failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+ break;
+ case RPCRDMA_MTHCAFMR:
+ rc = ib_dealloc_fmr(r->r.fmr);
+ if (rc)
+ dprintk("RPC: %s:"
+ " ib_dealloc_fmr"
+ " failed %i\n",
+ __func__, rc);
+ break;
+ default:
+ break;
+ }
+ }
+
kfree(buf->rb_pool);
}
@@ -1299,21 +1302,17 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
int i;
unsigned long flags;
- BUG_ON(req->rl_nchunks != 0);
spin_lock_irqsave(&buffers->rb_lock, flags);
buffers->rb_send_bufs[--buffers->rb_send_index] = req;
req->rl_niovs = 0;
if (req->rl_reply) {
buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
- init_waitqueue_head(&req->rl_reply->rr_unbind);
req->rl_reply->rr_func = NULL;
req->rl_reply = NULL;
}
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
case RPCRDMA_MTHCAFMR:
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
/*
* Cycle mw's back in reverse order, and "spin" them.
* This delays and scrambles reuse as much as possible.
@@ -1358,8 +1357,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
/*
* Put reply buffers back into pool when not attached to
- * request. This happens in error conditions, and when
- * aborting unbinds. Pre-decrement counter/array index.
+ * request. This happens in error conditions.
*/
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
@@ -1498,8 +1496,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
seg1->mr_offset -= pageoff; /* start of page */
seg1->mr_len += pageoff;
len = -pageoff;
- if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
- *nsegs = RPCRDMA_MAX_DATA_SEGS;
+ if (*nsegs > ia->ri_max_frmr_depth)
+ *nsegs = ia->ri_max_frmr_depth;
for (page_no = i = 0; i < *nsegs;) {
rpcrdma_map_one(ia, seg, writing);
pa = seg->mr_dma;
@@ -1536,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
} else
post_wr = &frmr_wr;
- /* Bump the key */
- key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
- ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
-
/* Prepare FRMR WR */
memset(&frmr_wr, 0, sizeof frmr_wr);
frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
@@ -1550,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
frmr_wr.wr.fast_reg.page_list_len = page_no;
frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
- BUG_ON(frmr_wr.wr.fast_reg.length < len);
+ if (frmr_wr.wr.fast_reg.length < len) {
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(ia, seg++);
+ return -EIO;
+ }
+
+ /* Bump the key */
+ key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
+ ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+
frmr_wr.wr.fast_reg.access_flags = (writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ);
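
Also note the reordering above: the length sanity check now runs before the key bump, so a failed registration unmaps its segments and returns -EIO without consuming an rkey generation. The bump itself cycles only the low byte of the MR's rkey; a sketch, treating that key layout as an assumption:

#include <stdint.h>

/* Assumption: the low byte of the rkey is the generation ("key") that
 * ib_update_fast_reg_key() replaces; bumping it invalidates stale
 * remote references to the previous registration. */
static uint32_t bump_fast_reg_key(uint32_t rkey)
{
        uint8_t key = (uint8_t)(rkey & 0xFF);

        return (rkey & ~0xFFu) | (uint8_t)(key + 1);
}
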
@@ -1661,135 +1664,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
return rc;
}
-static int
-rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
- int *nsegs, int writing, struct rpcrdma_ia *ia,
- struct rpcrdma_xprt *r_xprt)
-{
- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
- IB_ACCESS_REMOTE_READ);
- struct ib_mw_bind param;
- int rc;
-
- *nsegs = 1;
- rpcrdma_map_one(ia, seg, writing);
- param.mr = ia->ri_bind_mem;
- param.wr_id = 0ULL; /* no send cookie */
- param.addr = seg->mr_dma;
- param.length = seg->mr_len;
- param.send_flags = 0;
- param.mw_access_flags = mem_priv;
-
- DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
- if (rc) {
- dprintk("RPC: %s: failed ib_bind_mw "
- "%u@0x%llx status %i\n",
- __func__, seg->mr_len,
- (unsigned long long)seg->mr_dma, rc);
- rpcrdma_unmap_one(ia, seg);
- } else {
- seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
- seg->mr_base = param.addr;
- seg->mr_nsegs = 1;
- }
- return rc;
-}
-
-static int
-rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_ia *ia,
- struct rpcrdma_xprt *r_xprt, void **r)
-{
- struct ib_mw_bind param;
- LIST_HEAD(l);
- int rc;
-
- BUG_ON(seg->mr_nsegs != 1);
- param.mr = ia->ri_bind_mem;
- param.addr = 0ULL; /* unbind */
- param.length = 0;
- param.mw_access_flags = 0;
- if (*r) {
- param.wr_id = (u64) (unsigned long) *r;
- param.send_flags = IB_SEND_SIGNALED;
- INIT_CQCOUNT(&r_xprt->rx_ep);
- } else {
- param.wr_id = 0ULL;
- param.send_flags = 0;
- DECR_CQCOUNT(&r_xprt->rx_ep);
- }
- rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
- rpcrdma_unmap_one(ia, seg);
- if (rc)
- dprintk("RPC: %s: failed ib_(un)bind_mw,"
- " status %i\n", __func__, rc);
- else
- *r = NULL; /* will upcall on completion */
- return rc;
-}
-
-static int
-rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
- int *nsegs, int writing, struct rpcrdma_ia *ia)
-{
- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
- IB_ACCESS_REMOTE_READ);
- struct rpcrdma_mr_seg *seg1 = seg;
- struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
- int len, i, rc = 0;
-
- if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
- *nsegs = RPCRDMA_MAX_DATA_SEGS;
- for (len = 0, i = 0; i < *nsegs;) {
- rpcrdma_map_one(ia, seg, writing);
- ipb[i].addr = seg->mr_dma;
- ipb[i].size = seg->mr_len;
- len += seg->mr_len;
- ++seg;
- ++i;
- /* Check for holes */
- if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
- offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
- break;
- }
- seg1->mr_base = seg1->mr_dma;
- seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
- ipb, i, mem_priv, &seg1->mr_base);
- if (IS_ERR(seg1->mr_chunk.rl_mr)) {
- rc = PTR_ERR(seg1->mr_chunk.rl_mr);
- dprintk("RPC: %s: failed ib_reg_phys_mr "
- "%u@0x%llx (%d)... status %i\n",
- __func__, len,
- (unsigned long long)seg1->mr_dma, i, rc);
- while (i--)
- rpcrdma_unmap_one(ia, --seg);
- } else {
- seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- }
- *nsegs = i;
- return rc;
-}
-
-static int
-rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_ia *ia)
-{
- struct rpcrdma_mr_seg *seg1 = seg;
- int rc;
-
- rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
- seg1->mr_chunk.rl_mr = NULL;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- if (rc)
- dprintk("RPC: %s: failed ib_dereg_mr,"
- " status %i\n", __func__, rc);
- return rc;
-}
-
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
@@ -1819,16 +1693,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
break;
- /* Registration using memory windows */
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
- break;
-
- /* Default registration each time */
default:
- rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
- break;
+ return -1;
}
if (rc)
return -1;
@@ -1838,7 +1704,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
- struct rpcrdma_xprt *r_xprt, void *r)
+ struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int nsegs = seg->mr_nsegs, rc;
@@ -1847,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
#if RPCRDMA_PERSISTENT_REGISTRATION
case RPCRDMA_ALLPHYSICAL:
- BUG_ON(nsegs != 1);
rpcrdma_unmap_one(ia, seg);
- rc = 0;
break;
#endif
@@ -1861,21 +1725,9 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
rc = rpcrdma_deregister_fmr_external(seg, ia);
break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- case RPCRDMA_MEMWINDOWS:
- rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
- break;
-
default:
- rc = rpcrdma_deregister_default_external(seg, ia);
break;
}
- if (r) {
- struct rpcrdma_rep *rep = r;
- void (*func)(struct rpcrdma_rep *) = rep->rr_func;
- rep->rr_func = NULL;
- func(rep); /* dereg done, callback now */
- }
return nsegs;
}
@@ -1950,7 +1802,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
ib_dma_sync_single_for_cpu(ia->ri_id->device,
rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
- DECR_CQCOUNT(ep);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
if (rc)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 9a66c95b583..89e7cd47970 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -43,6 +43,7 @@
#include <linux/wait.h> /* wait_queue_head_t, etc */
#include <linux/spinlock.h> /* spinlock_t, etc */
#include <linux/atomic.h> /* atomic_t, etc */
+#include <linux/workqueue.h> /* struct work_struct */
#include <rdma/rdma_cm.h> /* RDMA connection api */
#include <rdma/ib_verbs.h> /* RDMA verbs api */
@@ -66,18 +67,21 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
enum rpcrdma_memreg ri_memreg_strategy;
+ unsigned int ri_max_frmr_depth;
};
/*
* RDMA Endpoint -- one per transport instance
*/
+#define RPCRDMA_WC_BUDGET (128)
+#define RPCRDMA_POLLSIZE (16)
+
struct rpcrdma_ep {
atomic_t rep_cqcount;
int rep_cqinit;
int rep_connected;
struct rpcrdma_ia *rep_ia;
- struct ib_cq *rep_cq;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
struct ib_sge rep_pad; /* holds zeroed pad */
@@ -86,6 +90,9 @@ struct rpcrdma_ep {
struct rpc_xprt *rep_xprt; /* for rep_func */
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
+ struct delayed_work rep_connect_worker;
+ struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
+ struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
};
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
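/*
 * Sketch for illustration, not part of the patch: how the new
 * rep_send_wcs[]/rep_recv_wcs[] arrays are meant to be drained, in
 * RPCRDMA_POLLSIZE batches up to RPCRDMA_WC_BUDGET completions per
 * upcall. ib_poll_cq() is the real verbs call; the per-completion
 * handler name is assumed.
 */
static int rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct ib_wc *wcs;
	int budget, count, rc;

	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_send_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			return rc;	/* 0: drained; <0: device error */

		count = rc;
		while (count-- > 0)
			rpcrdma_sendcq_process_wc(wcs++); /* assumed handler */
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	return 0;
}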
@@ -124,7 +131,6 @@ struct rpcrdma_rep {
struct rpc_xprt *rr_xprt; /* needed for request/reply matching */
void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
struct list_head rr_list; /* tasklet list */
- wait_queue_head_t rr_unbind; /* optional unbind wait */
struct ib_sge rr_iov; /* for posting */
struct ib_mr *rr_handle; /* handle for mem in rr_iov */
char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
@@ -159,7 +165,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
struct ib_mr *rl_mr; /* if registered directly */
struct rpcrdma_mw { /* if registered from region */
union {
- struct ib_mw *mw;
struct ib_fmr *fmr;
struct {
struct ib_fast_reg_page_list *fr_pgl;
@@ -207,7 +212,6 @@ struct rpcrdma_req {
struct rpcrdma_buffer {
spinlock_t rb_lock; /* protects indexes */
atomic_t rb_credits; /* most recent server credits */
- unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
int rb_max_requests;/* client max requests */
struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
int rb_send_index;
@@ -235,13 +239,13 @@ struct rpcrdma_create_data_internal {
};
#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
- (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)
+ (rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
- (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)
+ (rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
#define RPCRDMA_INLINE_PAD_VALUE(rq)\
- rpcx_to_rdmad(rq->rq_task->tk_xprt).padding
+ rpcx_to_rdmad(rq->rq_xprt).padding
/*
* Statistics for RPCRDMA
@@ -300,7 +304,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *);
*/
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
struct rpcrdma_create_data_internal *);
-int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
@@ -330,11 +334,12 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *,
int rpcrdma_register_external(struct rpcrdma_mr_seg *,
int, int, struct rpcrdma_xprt *);
int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
- struct rpcrdma_xprt *, void *);
+ struct rpcrdma_xprt *);
/*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
*/
+void rpcrdma_connect_worker(struct work_struct *);
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 68b0a81c31d..be8bbd5d65e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -33,6 +33,7 @@
#include <linux/udp.h>
#include <linux/tcp.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/xprtsock.h>
@@ -46,6 +47,8 @@
#include <net/udp.h>
#include <net/tcp.h>
+#include <trace/events/sunrpc.h>
+
#include "sunrpc.h"
static void xs_close(struct rpc_xprt *xprt);
@@ -86,7 +89,7 @@ static struct ctl_table_header *sunrpc_table_header;
* FIXME: changing the UDP slot table size should also resize the UDP
* socket buffers for existing UDP transports
*/
-static ctl_table xs_tunables_table[] = {
+static struct ctl_table xs_tunables_table[] = {
{
.procname = "udp_slot_table_entries",
.data = &xprt_udp_slot_table_entries,
@@ -142,7 +145,7 @@ static ctl_table xs_tunables_table[] = {
{ },
};
-static ctl_table sunrpc_table[] = {
+static struct ctl_table sunrpc_table[] = {
{
.procname = "sunrpc",
.mode = 0555,
@@ -251,9 +254,10 @@ struct sock_xprt {
/*
* Saved socket callback addresses
*/
- void (*old_data_ready)(struct sock *, int);
+ void (*old_data_ready)(struct sock *);
void (*old_state_change)(struct sock *);
void (*old_write_space)(struct sock *);
+ void (*old_error_report)(struct sock *);
};
/*
@@ -271,6 +275,11 @@ struct sock_xprt {
*/
#define TCP_RPC_REPLY (1UL << 6)
+static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
+{
+ return (struct rpc_xprt *) sk->sk_user_data;
+}
+
static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt)
{
return (struct sockaddr *) &xprt->addr;
@@ -390,8 +399,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
return kernel_sendmsg(sock, &msg, NULL, 0, 0);
}
-static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
+static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
{
+ ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags);
struct page **ppage;
unsigned int remainder;
int err, sent = 0;
@@ -400,6 +411,9 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
base += xdr->page_base;
ppage = xdr->pages + (base >> PAGE_SHIFT);
base &= ~PAGE_MASK;
+ do_sendpage = sock->ops->sendpage;
+ if (!zerocopy)
+ do_sendpage = sock_no_sendpage;
for(;;) {
unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
int flags = XS_SENDMSG_FLAGS;
@@ -407,7 +421,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
remainder -= len;
if (remainder != 0 || more)
flags |= MSG_MORE;
- err = sock->ops->sendpage(sock, *ppage, base, len, flags);
+ err = do_sendpage(sock, *ppage, base, len, flags);
if (remainder == 0 || err != len)
break;
sent += err;
@@ -428,9 +442,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
* @addrlen: UDP only -- length of destination address
* @xdr: buffer containing this request
* @base: starting position in the buffer
+ * @zerocopy: true if it is safe to use sendpage()
*
*/
-static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
+static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
{
unsigned int remainder = xdr->len - base;
int err, sent = 0;
@@ -458,7 +473,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
if (base < xdr->page_len) {
unsigned int len = xdr->page_len - base;
remainder -= len;
- err = xs_send_pagedata(sock, xdr, base, remainder != 0);
+ err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
if (remainder == 0 || err != len)
goto out;
sent += err;
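/*
 * Sketch for illustration, not part of the patch: the fallback selected
 * above. sock_no_sendpage() is the real net/core helper; it has the same
 * signature as ->sendpage() but is built on sendmsg(), so it copies the
 * data instead of taking references on the caller's pages.
 */
static ssize_t xs_do_sendpage(struct socket *sock, bool zerocopy,
			      struct page *page, int offset,
			      size_t size, int flags)
{
	if (zerocopy)
		return sock->ops->sendpage(sock, page, offset, size, flags);
	return sock_no_sendpage(sock, page, offset, size, flags);
}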
@@ -495,6 +510,7 @@ static int xs_nospace(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct sock *sk = transport->inet;
int ret = -EAGAIN;
dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
@@ -512,7 +528,7 @@ static int xs_nospace(struct rpc_task *task)
* window size
*/
set_bit(SOCK_NOSPACE, &transport->sock->flags);
- transport->inet->sk_write_pending++;
+ sk->sk_write_pending++;
/* ...and wait for more buffer space */
xprt_wait_for_buffer_space(task, xs_nospace_callback);
}
@@ -522,6 +538,9 @@ static int xs_nospace(struct rpc_task *task)
}
spin_unlock_bh(&xprt->transport_lock);
+
+ /* Race breaker in case memory is freed before the above code is called */
+ sk->sk_write_space(sk);
return ret;
}
@@ -561,7 +580,7 @@ static int xs_local_send_request(struct rpc_task *task)
req->rq_svec->iov_base, req->rq_svec->iov_len);
status = xs_sendpages(transport->sock, NULL, 0,
- xdr, req->rq_bytes_sent);
+ xdr, req->rq_bytes_sent, true);
dprintk("RPC: %s(%u) = %d\n",
__func__, xdr->len - req->rq_bytes_sent, status);
if (likely(status >= 0)) {
@@ -617,7 +636,7 @@ static int xs_udp_send_request(struct rpc_task *task)
status = xs_sendpages(transport->sock,
xs_addr(xprt),
xprt->addrlen, xdr,
- req->rq_bytes_sent);
+ req->rq_bytes_sent, true);
dprintk("RPC: xs_udp_send_request(%u) = %d\n",
xdr->len - req->rq_bytes_sent, status);
@@ -664,8 +683,10 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct socket *sock = transport->sock;
- if (sock != NULL)
+ if (sock != NULL) {
kernel_sock_shutdown(sock, SHUT_WR);
+ trace_rpc_socket_shutdown(xprt, sock);
+ }
}
/**
@@ -688,6 +709,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
+ bool zerocopy = true;
int status;
xs_encode_stream_record_marker(&req->rq_snd_buf);
@@ -695,13 +717,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
xs_pktdump("packet data:",
req->rq_svec->iov_base,
req->rq_svec->iov_len);
+ /* Don't use zero copy if this is a resend. If the RPC call
+ * completes while the socket holds a reference to the pages,
+ * then we may end up resending corrupted data.
+ */
+ if (task->tk_flags & RPC_TASK_SENT)
+ zerocopy = false;
/* Continue transmitting the packet/record. We must be careful
* to cope with writespace callbacks arriving _after_ we have
* called sendmsg(). */
while (1) {
status = xs_sendpages(transport->sock,
- NULL, 0, xdr, req->rq_bytes_sent);
+ NULL, 0, xdr, req->rq_bytes_sent,
+ zerocopy);
dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
xdr->len - req->rq_bytes_sent, status);
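/*
 * Sketch for illustration, not part of the patch: the resend rule added
 * above, as a predicate. RPC_TASK_SENT is the real flag set once a
 * request has gone out; the helper name is hypothetical.
 */
static bool xs_tcp_may_zerocopy(const struct rpc_task *task)
{
	/* A retransmission can overlap an earlier send whose pages the
	 * socket still references, so use the copying path instead. */
	return !(task->tk_flags & RPC_TASK_SENT);
}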
@@ -770,7 +799,7 @@ static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
goto out_release;
if (req->rq_bytes_sent == req->rq_snd_buf.len)
goto out_release;
- set_bit(XPRT_CLOSE_WAIT, &task->tk_xprt->state);
+ set_bit(XPRT_CLOSE_WAIT, &xprt->state);
out_release:
xprt_release_xprt(xprt, task);
}
@@ -780,6 +809,7 @@ static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk)
transport->old_data_ready = sk->sk_data_ready;
transport->old_state_change = sk->sk_state_change;
transport->old_write_space = sk->sk_write_space;
+ transport->old_error_report = sk->sk_error_report;
}
static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk)
@@ -787,6 +817,34 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
sk->sk_data_ready = transport->old_data_ready;
sk->sk_state_change = transport->old_state_change;
sk->sk_write_space = transport->old_write_space;
+ sk->sk_error_report = transport->old_error_report;
+}
+
+/**
+ * xs_error_report - callback to handle TCP socket state errors
+ * @sk: socket
+ *
+ * Note: we don't call sock_error() since there may be an rpc_task
+ * using the socket, and so we don't want to clear sk->sk_err.
+ */
+static void xs_error_report(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+ int err;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (!(xprt = xprt_from_sock(sk)))
+ goto out;
+
+ err = -sk->sk_err;
+ if (err == 0)
+ goto out;
+ dprintk("RPC: xs_error_report client %p, error=%d...\n",
+ xprt, -err);
+ trace_rpc_socket_error(xprt, sk->sk_socket, err);
+ xprt_wake_pending_tasks(xprt, err);
+ out:
+ read_unlock_bh(&sk->sk_callback_lock);
}
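/*
 * Sketch for illustration, not part of the patch, contrasting with
 * sock_error(): that helper exchanges sk->sk_err with zero, whereas
 * xs_error_report() above deliberately leaves the error set for any
 * rpc_task still using the socket. Helper name is hypothetical.
 */
static int xs_sock_error_peek(struct sock *sk)
{
	return -sk->sk_err;	/* read, never clear */
}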
static void xs_reset_transport(struct sock_xprt *transport)
@@ -808,8 +866,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
xs_restore_old_callbacks(transport, sk);
write_unlock_bh(&sk->sk_callback_lock);
- sk->sk_no_check = 0;
-
+ trace_rpc_socket_close(&transport->xprt, sock);
sock_release(sock);
}
@@ -829,14 +886,16 @@ static void xs_close(struct rpc_xprt *xprt)
dprintk("RPC: xs_close xprt %p\n", xprt);
+ cancel_delayed_work_sync(&transport->connect_worker);
+
xs_reset_transport(transport);
xprt->reestablish_timeout = 0;
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
clear_bit(XPRT_CLOSING, &xprt->state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
xprt_disconnect_done(xprt);
}
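/*
 * Sketch for illustration, not part of the patch: clear_bit() is atomic
 * but imposes no ordering of its own, so the renamed barriers (formerly
 * smp_mb__before_clear_bit()/smp_mb__after_clear_bit()) fence the flag
 * clears as a group. Helper name is hypothetical.
 */
static void xs_clear_close_state(struct rpc_xprt *xprt)
{
	smp_mb__before_atomic();	/* prior stores visible before clears */
	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
	clear_bit(XPRT_CLOSING, &xprt->state);
	smp_mb__after_atomic();		/* clears visible before later accesses */
}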
@@ -848,6 +907,12 @@ static void xs_tcp_close(struct rpc_xprt *xprt)
xs_tcp_shutdown(xprt);
}
+static void xs_xprt_free(struct rpc_xprt *xprt)
+{
+ xs_free_peer_addresses(xprt);
+ xprt_free(xprt);
+}
+
/**
* xs_destroy - prepare to shutdown a transport
* @xprt: doomed transport
@@ -855,23 +920,13 @@ static void xs_tcp_close(struct rpc_xprt *xprt)
*/
static void xs_destroy(struct rpc_xprt *xprt)
{
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
-
dprintk("RPC: xs_destroy xprt %p\n", xprt);
- cancel_delayed_work_sync(&transport->connect_worker);
-
xs_close(xprt);
- xs_free_peer_addresses(xprt);
- xprt_free(xprt);
+ xs_xprt_free(xprt);
module_put(THIS_MODULE);
}
-static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
-{
- return (struct rpc_xprt *) sk->sk_user_data;
-}
-
static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
{
struct xdr_skb_reader desc = {
@@ -894,7 +949,7 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
*
* Currently this assumes we can read the whole reply in a single gulp.
*/
-static void xs_local_data_ready(struct sock *sk, int len)
+static void xs_local_data_ready(struct sock *sk)
{
struct rpc_task *task;
struct rpc_xprt *xprt;
@@ -957,7 +1012,7 @@ static void xs_local_data_ready(struct sock *sk, int len)
* @len: how much data to read
*
*/
-static void xs_udp_data_ready(struct sock *sk, int len)
+static void xs_udp_data_ready(struct sock *sk)
{
struct rpc_task *task;
struct rpc_xprt *xprt;
@@ -1005,7 +1060,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS);
- xprt_adjust_cwnd(task, copied);
+ xprt_adjust_cwnd(xprt, task, copied);
xprt_complete_rqst(task, copied);
out_unlock:
@@ -1254,41 +1309,29 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
* If we're unable to obtain the rpc_rqst we schedule the closing of the
* connection and return -1.
*/
-static inline int xs_tcp_read_callback(struct rpc_xprt *xprt,
+static int xs_tcp_read_callback(struct rpc_xprt *xprt,
struct xdr_skb_reader *desc)
{
struct sock_xprt *transport =
container_of(xprt, struct sock_xprt, xprt);
struct rpc_rqst *req;
- req = xprt_alloc_bc_request(xprt);
+ /* Look up and lock the request corresponding to the given XID */
+ spin_lock(&xprt->transport_lock);
+ req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
if (req == NULL) {
+ spin_unlock(&xprt->transport_lock);
printk(KERN_WARNING "Callback slot table overflowed\n");
xprt_force_disconnect(xprt);
return -1;
}
- req->rq_xid = transport->tcp_xid;
dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid));
xs_tcp_read_common(xprt, desc, req);
- if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
- struct svc_serv *bc_serv = xprt->bc_serv;
-
- /*
- * Add callback request to callback list. The callback
- * service sleeps on the sv_cb_waitq waiting for new
- * requests. Wake it up after enqueuing the
- * request.
- */
- dprintk("RPC: add callback request to list\n");
- spin_lock(&bc_serv->sv_cb_lock);
- list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
- spin_unlock(&bc_serv->sv_cb_lock);
- wake_up(&bc_serv->sv_cb_waitq);
- }
-
- req->rq_private_buf.len = transport->tcp_copied;
+ if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
+ xprt_complete_bc_request(req, transport->tcp_copied);
+ spin_unlock(&xprt->transport_lock);
return 0;
}
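/*
 * Sketch for illustration, not part of the patch: the lookup the new
 * code depends on. xprt_lookup_bc_request() scans the preallocated
 * backchannel slots for a matching XID under transport_lock; the body
 * below is a simplified rendering, not the exact implementation.
 */
static struct rpc_rqst *bc_lookup_sketch(struct rpc_xprt *xprt, __be32 xid)
{
	struct rpc_rqst *req;

	list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) {
		if (req->rq_xid == xid)
			return req;
	}
	return NULL;
}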
@@ -1392,7 +1435,7 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
* @bytes: how much data to read
*
*/
-static void xs_tcp_data_ready(struct sock *sk, int bytes)
+static void xs_tcp_data_ready(struct sock *sk)
{
struct rpc_xprt *xprt;
read_descriptor_t rd_desc;
@@ -1452,12 +1495,12 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt)
static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
{
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
clear_bit(XPRT_CLOSING, &xprt->state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
}
static void xs_sock_mark_closed(struct rpc_xprt *xprt)
@@ -1486,6 +1529,7 @@ static void xs_tcp_state_change(struct sock *sk)
sock_flag(sk, SOCK_ZAPPED),
sk->sk_shutdown);
+ trace_rpc_socket_state_change(xprt, sk->sk_socket);
switch (sk->sk_state) {
case TCP_ESTABLISHED:
spin_lock(&xprt->transport_lock);
@@ -1499,6 +1543,7 @@ static void xs_tcp_state_change(struct sock *sk)
transport->tcp_copied = 0;
transport->tcp_flags =
TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
+ xprt->connect_cookie++;
xprt_wake_pending_tasks(xprt, -EAGAIN);
}
@@ -1509,10 +1554,10 @@ static void xs_tcp_state_change(struct sock *sk)
xprt->connect_cookie++;
xprt->reestablish_timeout = 0;
set_bit(XPRT_CLOSING, &xprt->state);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(XPRT_CONNECTED, &xprt->state);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
break;
case TCP_CLOSE_WAIT:
@@ -1531,9 +1576,9 @@ static void xs_tcp_state_change(struct sock *sk)
case TCP_LAST_ACK:
set_bit(XPRT_CLOSING, &xprt->state);
xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(XPRT_CONNECTED, &xprt->state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
break;
case TCP_CLOSE:
xs_tcp_cancel_linger_timeout(xprt);
@@ -1596,7 +1641,7 @@ static void xs_tcp_write_space(struct sock *sk)
read_lock_bh(&sk->sk_callback_lock);
/* from net/core/stream.c:sk_stream_write_space */
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ if (sk_stream_is_writeable(sk))
xs_write_space(sk);
read_unlock_bh(&sk->sk_callback_lock);
@@ -1646,15 +1691,15 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t
*
* Adjust the congestion window after a retransmit timeout has occurred.
*/
-static void xs_udp_timer(struct rpc_task *task)
+static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
{
- xprt_adjust_cwnd(task, -ETIMEDOUT);
+ xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
}
static unsigned short xs_get_random_port(void)
{
unsigned short range = xprt_max_resvport - xprt_min_resvport;
- unsigned short rand = (unsigned short) net_random() % range;
+ unsigned short rand = (unsigned short) prandom_u32() % range;
return rand + xprt_min_resvport;
}
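/*
 * Sketch for illustration, not part of the patch: prandom_u32() replaces
 * the removed net_random() wrapper; the modulo keeps the pick inside
 * [xprt_min_resvport, xprt_max_resvport). Helper and parameters are a
 * hypothetical restatement of the function above with explicit bounds.
 */
static unsigned short pick_resvport(unsigned short min, unsigned short max)
{
	unsigned short range = max - min;

	return min + (unsigned short)(prandom_u32() % range);
}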
@@ -1731,7 +1776,9 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
*/
static void xs_local_rpcbind(struct rpc_task *task)
{
- xprt_set_bound(task->tk_xprt);
+ rcu_read_lock();
+ xprt_set_bound(rcu_dereference(task->tk_client->cl_xprt));
+ rcu_read_unlock();
}
static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port)
@@ -1802,6 +1849,10 @@ static inline void xs_reclassify_socket(int family, struct socket *sock)
}
#endif
+static void xs_dummy_setup_socket(struct work_struct *work)
+{
+}
+
static struct socket *xs_create_sock(struct rpc_xprt *xprt,
struct sock_xprt *transport, int family, int type, int protocol)
{
@@ -1843,6 +1894,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_local_data_ready;
sk->sk_write_space = xs_udp_write_space;
+ sk->sk_error_report = xs_error_report;
sk->sk_allocation = GFP_ATOMIC;
xprt_clear_connected(xprt);
@@ -1865,13 +1917,9 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
* @xprt: RPC transport to connect
* @transport: socket transport to connect
* @create_sock: function to create a socket of the correct type
- *
- * Invoked by a work queue tasklet.
*/
-static void xs_local_setup_socket(struct work_struct *work)
+static int xs_local_setup_socket(struct sock_xprt *transport)
{
- struct sock_xprt *transport =
- container_of(work, struct sock_xprt, connect_worker.work);
struct rpc_xprt *xprt = &transport->xprt;
struct socket *sock;
int status = -EIO;
@@ -1892,6 +1940,7 @@ static void xs_local_setup_socket(struct work_struct *work)
xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
status = xs_local_finish_connecting(xprt, sock);
+ trace_rpc_socket_connect(xprt, sock, status);
switch (status) {
case 0:
dprintk("RPC: xprt %p connected to %s\n",
@@ -1916,6 +1965,30 @@ out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
current->flags &= ~PF_FSTRANS;
+ return status;
+}
+
+static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ int ret;
+
+ if (RPC_IS_ASYNC(task)) {
+ /*
+ * We want the AF_LOCAL connect to be resolved in the
+ * filesystem namespace of the process making the rpc
+ * call. Thus we connect synchronously.
+ *
+ * If we want to support asynchronous AF_LOCAL calls,
+ * we'll need to figure out how to pass a namespace to
+ * connect.
+ */
+ rpc_exit(task, -ENOTCONN);
+ return;
+ }
+ ret = xs_local_setup_socket(transport);
+ if (ret && !RPC_IS_SOFTCONN(task))
+ msleep_interruptible(15000);
}
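/*
 * Sketch for illustration, not part of the patch: why xs_local_connect()
 * above refuses async tasks. kernel_connect() on an AF_LOCAL socket
 * resolves sun_path in the calling process's filesystem namespace, which
 * a workqueue worker would not share. Helper name is hypothetical.
 */
static int af_local_connect_sync(struct socket *sock, const char *path)
{
	struct sockaddr_un sun = { .sun_family = AF_LOCAL };

	strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
	return kernel_connect(sock, (struct sockaddr *)&sun, sizeof(sun), 0);
}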
#ifdef CONFIG_SUNRPC_SWAP
@@ -1971,7 +2044,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_udp_data_ready;
sk->sk_write_space = xs_udp_write_space;
- sk->sk_no_check = UDP_CSUM_NORCV;
sk->sk_allocation = GFP_ATOMIC;
xprt_set_connected(xprt);
@@ -2011,6 +2083,7 @@ static void xs_udp_setup_socket(struct work_struct *work)
xprt->address_strings[RPC_DISPLAY_PORT]);
xs_udp_finish_connecting(xprt, sock);
+ trace_rpc_socket_connect(xprt, sock, 0);
status = 0;
out:
xprt_clear_connecting(xprt);
@@ -2036,6 +2109,8 @@ static void xs_abort_connection(struct sock_xprt *transport)
memset(&any, 0, sizeof(any));
any.sa_family = AF_UNSPEC;
result = kernel_connect(transport->sock, &any, sizeof(any), 0);
+ trace_rpc_socket_reset_connection(&transport->xprt,
+ transport->sock, result);
if (!result)
xs_sock_reset_connection_flags(&transport->xprt);
dprintk("RPC: AF_UNSPEC connect return code %d\n", result);
@@ -2074,6 +2149,19 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
if (!transport->inet) {
struct sock *sk = sock->sk;
+ unsigned int keepidle = xprt->timeout->to_initval / HZ;
+ unsigned int keepcnt = xprt->timeout->to_retries + 1;
+ unsigned int opt_on = 1;
+
+ /* TCP Keepalive options */
+ kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&opt_on, sizeof(opt_on));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+ (char *)&keepidle, sizeof(keepidle));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+ (char *)&keepidle, sizeof(keepidle));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+ (char *)&keepcnt, sizeof(keepcnt));
write_lock_bh(&sk->sk_callback_lock);
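/*
 * Sketch for illustration, not part of the patch: the keepalive tuning
 * above as a standalone helper. kernel_setsockopt() and the TCP_KEEP*
 * options are real; note that the patch reuses the idle time as the
 * probe interval (TCP_KEEPINTVL is fed keepidle, not a separate value).
 */
static void xs_tcp_set_keepalive(struct socket *sock,
				 const struct rpc_timeout *to)
{
	unsigned int keepidle = to->to_initval / HZ;
	unsigned int keepcnt = to->to_retries + 1;
	unsigned int on = 1;

	kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
			  (char *)&on, sizeof(on));
	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
			  (char *)&keepidle, sizeof(keepidle));
	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
			  (char *)&keepidle, sizeof(keepidle));
	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
			  (char *)&keepcnt, sizeof(keepcnt));
}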
@@ -2083,6 +2171,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_data_ready = xs_tcp_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
+ sk->sk_error_report = xs_error_report;
sk->sk_allocation = GFP_ATOMIC;
/* socket options */
@@ -2113,7 +2202,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
case 0:
case -EINPROGRESS:
/* SYN_SENT! */
- xprt->connect_cookie++;
if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
}
@@ -2166,6 +2254,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
xprt->address_strings[RPC_DISPLAY_PORT]);
status = xs_tcp_finish_connecting(xprt, sock);
+ trace_rpc_socket_connect(xprt, sock, status);
dprintk("RPC: %p connect status %d connected %d sock state %d\n",
xprt, -status, xprt_connected(xprt),
sock->sk->sk_state);
@@ -2179,10 +2268,6 @@ static void xs_tcp_setup_socket(struct work_struct *work)
*/
xs_tcp_force_close(xprt);
break;
- case -ECONNREFUSED:
- case -ECONNRESET:
- case -ENETUNREACH:
- /* retry with existing socket, after a delay */
case 0:
case -EINPROGRESS:
case -EALREADY:
@@ -2193,6 +2278,10 @@ static void xs_tcp_setup_socket(struct work_struct *work)
/* Happens, for instance, if the user specified a link
* local IPv6 address without a scope-id.
*/
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ENETUNREACH:
+ /* retry with existing socket, after a delay */
goto out;
}
out_eagain:
@@ -2205,6 +2294,7 @@ out:
/**
* xs_connect - connect a socket to a remote endpoint
+ * @xprt: pointer to transport structure
* @task: address of RPC task that manages state of connect request
*
* TCP: If the remote end dropped the connection, delay reconnecting.
@@ -2216,9 +2306,8 @@ out:
* If a UDP socket connect fails, the delay behavior here prevents
* retry floods (hard mounts).
*/
-static void xs_connect(struct rpc_task *task)
+static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
- struct rpc_xprt *xprt = task->tk_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
@@ -2445,6 +2534,10 @@ static void bc_close(struct rpc_xprt *xprt)
static void bc_destroy(struct rpc_xprt *xprt)
{
+ dprintk("RPC: bc_destroy xprt %p\n", xprt);
+
+ xs_xprt_free(xprt);
+ module_put(THIS_MODULE);
}
static struct rpc_xprt_ops xs_local_ops = {
@@ -2453,7 +2546,7 @@ static struct rpc_xprt_ops xs_local_ops = {
.alloc_slot = xprt_alloc_slot,
.rpcbind = xs_local_rpcbind,
.set_port = xs_local_set_port,
- .connect = xs_connect,
+ .connect = xs_local_connect,
.buf_alloc = rpc_malloc,
.buf_free = rpc_free,
.send_request = xs_local_send_request,
@@ -2506,7 +2599,6 @@ static struct rpc_xprt_ops bc_tcp_ops = {
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xprt_release_xprt,
.alloc_slot = xprt_alloc_slot,
- .rpcbind = xs_local_rpcbind,
.buf_alloc = bc_malloc,
.buf_free = bc_free,
.send_request = bc_send_request,
@@ -2617,6 +2709,9 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
xprt->ops = &xs_local_ops;
xprt->timeout = &xs_local_default_timeout;
+ INIT_DELAYED_WORK(&transport->connect_worker,
+ xs_dummy_setup_socket);
+
switch (sun->sun_family) {
case AF_LOCAL:
if (sun->sun_path[0] != '/') {
@@ -2626,9 +2721,10 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
goto out_err;
}
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_local_setup_socket);
xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL);
+ ret = ERR_PTR(xs_local_setup_socket(transport));
+ if (ret)
+ goto out_err;
break;
default:
ret = ERR_PTR(-EAFNOSUPPORT);
@@ -2642,7 +2738,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
return xprt;
ret = ERR_PTR(-EINVAL);
out_err:
- xprt_free(xprt);
+ xs_xprt_free(xprt);
return ret;
}
@@ -2720,7 +2816,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
return xprt;
ret = ERR_PTR(-EINVAL);
out_err:
- xprt_free(xprt);
+ xs_xprt_free(xprt);
return ret;
}
@@ -2741,9 +2837,13 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
struct rpc_xprt *xprt;
struct sock_xprt *transport;
struct rpc_xprt *ret;
+ unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries;
+
+ if (args->flags & XPRT_CREATE_INFINITE_SLOTS)
+ max_slot_table_size = RPC_MAX_SLOT_TABLE_LIMIT;
xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries,
- xprt_max_tcp_slot_table_entries);
+ max_slot_table_size);
if (IS_ERR(xprt))
return xprt;
transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2791,12 +2891,11 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->address_strings[RPC_DISPLAY_ADDR],
xprt->address_strings[RPC_DISPLAY_PROTO]);
-
if (try_module_get(THIS_MODULE))
return xprt;
ret = ERR_PTR(-EINVAL);
out_err:
- xprt_free(xprt);
+ xs_xprt_free(xprt);
return ret;
}
@@ -2813,15 +2912,6 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
struct svc_sock *bc_sock;
struct rpc_xprt *ret;
- if (args->bc_xprt->xpt_bc_xprt) {
- /*
- * This server connection already has a backchannel
- * export; we can't create a new one, as we wouldn't be
- * able to match replies based on xid any more. So,
- * reuse the already-existing one:
- */
- return args->bc_xprt->xpt_bc_xprt;
- }
xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries,
xprt_tcp_slot_table_entries);
if (IS_ERR(xprt))
@@ -2862,10 +2952,9 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
/*
* Once we've associated a backchannel xprt with a connection,
- * we want to keep it around as long as long as the connection
- * lasts, in case we need to start using it for a backchannel
- * again; this reference won't be dropped until bc_xprt is
- * destroyed.
+ * we want to keep it around as long as the connection lasts,
+ * in case we need to start using it for a backchannel again;
+ * this reference won't be dropped until bc_xprt is destroyed.
*/
xprt_get(xprt);
args->bc_xprt->xpt_bc_xprt = xprt;
@@ -2880,13 +2969,14 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
*/
xprt_set_connected(xprt);
-
if (try_module_get(THIS_MODULE))
return xprt;
+
+ args->bc_xprt->xpt_bc_xprt = NULL;
xprt_put(xprt);
ret = ERR_PTR(-EINVAL);
out_err:
- xprt_free(xprt);
+ xs_xprt_free(xprt);
return ret;
}