Diffstat (limited to 'net/ceph')
 net/ceph/Kconfig        |   19
 net/ceph/Makefile       |   24
 net/ceph/armor.c        |    4
 net/ceph/auth.c         |  125
 net/ceph/auth_none.c    |   21
 net/ceph/auth_none.h    |    2
 net/ceph/auth_x.c       |   47
 net/ceph/auth_x.h       |    9
 net/ceph/buffer.c       |   22
 net/ceph/ceph_common.c  |  247
 net/ceph/ceph_fs.c      |   17
 net/ceph/ceph_hash.c    |    9
 net/ceph/ceph_strings.c |   39
 net/ceph/crush/crush.c  |   46
 net/ceph/crush/mapper.c |  492
 net/ceph/crypto.c       |   75
 net/ceph/crypto.h       |   47
 net/ceph/debugfs.c      |   99
 net/ceph/messenger.c    | 2715
 net/ceph/mon_client.c   |  373
 net/ceph/msgpool.c      |   45
 net/ceph/osd_client.c   | 2423
 net/ceph/osdmap.c       | 1391
 net/ceph/pagelist.c     |   19
 net/ceph/pagevec.c      |   83
 net/ceph/snapshot.c     |   78
 26 files changed, 5965 insertions(+), 2506 deletions(-)
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index ad424049b0c..e50cc69ae8c 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -1,9 +1,10 @@  config CEPH_LIB -        tristate "Ceph core library (EXPERIMENTAL)" -	depends on INET && EXPERIMENTAL +	tristate "Ceph core library" +	depends on INET  	select LIBCRC32C  	select CRYPTO_AES  	select CRYPTO +	select KEYS  	default n  	help  	  Choose Y or M here to include cephlib, which provides the @@ -26,3 +27,17 @@ config CEPH_LIB_PRETTYDEBUG  	  If unsure, say N. +config CEPH_LIB_USE_DNS_RESOLVER +	bool "Use in-kernel support for DNS lookup" +	depends on CEPH_LIB +	select DNS_RESOLVER +	default n +	help +	  If you say Y here, hostnames (e.g. monitor addresses) will +	  be resolved using the CONFIG_DNS_RESOLVER facility. + +	  For information on how to use CONFIG_DNS_RESOLVER consult +	  Documentation/networking/dns_resolver.txt + +	  If unsure, say N. + diff --git a/net/ceph/Makefile b/net/ceph/Makefile index 153bdec4083..958d9856912 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -1,9 +1,6 @@  #  # Makefile for CEPH filesystem.  # - -ifneq ($(KERNELRELEASE),) -  obj-$(CONFIG_CEPH_LIB) += libceph.o  libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ @@ -14,24 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \  	crypto.o armor.o \  	auth_x.o \  	ceph_fs.o ceph_strings.o ceph_hash.o \ -	pagevec.o - -else -#Otherwise we were called directly from the command -# line; invoke the kernel build system. - -KERNELDIR ?= /lib/modules/$(shell uname -r)/build -PWD := $(shell pwd) - -default: all - -all: -	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules - -modules_install: -	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install - -clean: -	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean +	pagevec.o snapshot.o -endif diff --git a/net/ceph/armor.c b/net/ceph/armor.c index eb2a666b0be..1fc1ee11dfa 100644 --- a/net/ceph/armor.c +++ b/net/ceph/armor.c @@ -78,8 +78,10 @@ int ceph_unarmor(char *dst, const char *src, const char *end)  	while (src < end) {  		int a, b, c, d; -		if (src < end && src[0] == '\n') +		if (src[0] == '\n') {  			src++; +			continue; +		}  		if (src + 4 > end)  			return -EINVAL;  		a = decode_bits(src[0]); diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 549c1f43e1d..6b923bcaa2a 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -35,25 +35,26 @@ static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)  /*   * setup, teardown.   
*/ -struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret) +struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key)  {  	struct ceph_auth_client *ac;  	int ret; -	dout("auth_init name '%s' secret '%s'\n", name, secret); +	dout("auth_init name '%s'\n", name);  	ret = -ENOMEM;  	ac = kzalloc(sizeof(*ac), GFP_NOFS);  	if (!ac)  		goto out; +	mutex_init(&ac->mutex);  	ac->negotiating = true;  	if (name)  		ac->name = name;  	else  		ac->name = CEPH_AUTH_NAME_DEFAULT; -	dout("auth_init name %s secret %s\n", ac->name, secret); -	ac->secret = secret; +	dout("auth_init name %s\n", ac->name); +	ac->key = key;  	return ac;  out: @@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac)   */  void ceph_auth_reset(struct ceph_auth_client *ac)  { +	mutex_lock(&ac->mutex);  	dout("auth_reset %p\n", ac);  	if (ac->ops && !ac->negotiating)  		ac->ops->reset(ac);  	ac->negotiating = true; +	mutex_unlock(&ac->mutex);  }  int ceph_entity_name_encode(const char *name, void **p, void *end) @@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)  	int i, num;  	int ret; +	mutex_lock(&ac->mutex);  	dout("auth_build_hello\n");  	monhdr->have_version = 0;  	monhdr->session_mon = cpu_to_le16(-1); @@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)  	ret = ceph_entity_name_encode(ac->name, &p, end);  	if (ret < 0) -		return ret; +		goto out;  	ceph_decode_need(&p, end, sizeof(u64), bad);  	ceph_encode_64(&p, ac->global_id);  	ceph_encode_32(&lenp, p - lenp - sizeof(u32)); -	return p - buf; +	ret = p - buf; +out: +	mutex_unlock(&ac->mutex); +	return ret;  bad: -	return -ERANGE; +	ret = -ERANGE; +	goto out;  }  static int ceph_build_auth_request(struct ceph_auth_client *ac, @@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac,  	if (ret < 0) {  		pr_err("error %d building auth method %s request\n", ret,  		       ac->ops->name); -		return ret; +		goto out;  	}  	dout(" built request %d bytes\n", ret);  	ceph_encode_32(&p, ret); -	return p + ret - msg_buf; +	ret = p + ret - msg_buf; +out: +	return ret;  }  /* @@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,  	int result_msg_len;  	int ret = -EINVAL; +	mutex_lock(&ac->mutex);  	dout("handle_auth_reply %p %p\n", p, end);  	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);  	protocol = ceph_decode_32(&p); @@ -227,33 +238,103 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,  	ret = ac->ops->handle_reply(ac, result, payload, payload_end);  	if (ret == -EAGAIN) { -		return ceph_build_auth_request(ac, reply_buf, reply_len); +		ret = ceph_build_auth_request(ac, reply_buf, reply_len);  	} else if (ret) {  		pr_err("auth method '%s' error %d\n", ac->ops->name, ret); -		return ret;  	} -	return 0; -bad: -	pr_err("failed to decode auth msg\n");  out: +	mutex_unlock(&ac->mutex);  	return ret; + +bad: +	pr_err("failed to decode auth msg\n"); +	ret = -EINVAL; +	goto out;  }  int ceph_build_auth(struct ceph_auth_client *ac,  		    void *msg_buf, size_t msg_len)  { +	int ret = 0; + +	mutex_lock(&ac->mutex);  	if (!ac->protocol) -		return ceph_auth_build_hello(ac, msg_buf, msg_len); -	BUG_ON(!ac->ops); -	if (ac->ops->should_authenticate(ac)) -		return ceph_build_auth_request(ac, msg_buf, msg_len); -	return 0; +		ret = ceph_auth_build_hello(ac, msg_buf, msg_len); +	else if (ac->ops->should_authenticate(ac)) +		ret = ceph_build_auth_request(ac, 
msg_buf, msg_len); +	mutex_unlock(&ac->mutex); +	return ret;  }  int ceph_auth_is_authenticated(struct ceph_auth_client *ac)  { -	if (!ac->ops) -		return 0; -	return ac->ops->is_authenticated(ac); +	int ret = 0; + +	mutex_lock(&ac->mutex); +	if (ac->ops) +		ret = ac->ops->is_authenticated(ac); +	mutex_unlock(&ac->mutex); +	return ret; +} +EXPORT_SYMBOL(ceph_auth_is_authenticated); + +int ceph_auth_create_authorizer(struct ceph_auth_client *ac, +				int peer_type, +				struct ceph_auth_handshake *auth) +{ +	int ret = 0; + +	mutex_lock(&ac->mutex); +	if (ac->ops && ac->ops->create_authorizer) +		ret = ac->ops->create_authorizer(ac, peer_type, auth); +	mutex_unlock(&ac->mutex); +	return ret; +} +EXPORT_SYMBOL(ceph_auth_create_authorizer); + +void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, +				  struct ceph_authorizer *a) +{ +	mutex_lock(&ac->mutex); +	if (ac->ops && ac->ops->destroy_authorizer) +		ac->ops->destroy_authorizer(ac, a); +	mutex_unlock(&ac->mutex); +} +EXPORT_SYMBOL(ceph_auth_destroy_authorizer); + +int ceph_auth_update_authorizer(struct ceph_auth_client *ac, +				int peer_type, +				struct ceph_auth_handshake *a) +{ +	int ret = 0; + +	mutex_lock(&ac->mutex); +	if (ac->ops && ac->ops->update_authorizer) +		ret = ac->ops->update_authorizer(ac, peer_type, a); +	mutex_unlock(&ac->mutex); +	return ret; +} +EXPORT_SYMBOL(ceph_auth_update_authorizer); + +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, +				      struct ceph_authorizer *a, size_t len) +{ +	int ret = 0; + +	mutex_lock(&ac->mutex); +	if (ac->ops && ac->ops->verify_authorizer_reply) +		ret = ac->ops->verify_authorizer_reply(ac, a, len); +	mutex_unlock(&ac->mutex); +	return ret; +} +EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply); + +void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) +{ +	mutex_lock(&ac->mutex); +	if (ac->ops && ac->ops->invalidate_authorizer) +		ac->ops->invalidate_authorizer(ac, peer_type); +	mutex_unlock(&ac->mutex);  } +EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 214c2bb43d6..8c93fa8d81b 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -39,6 +39,11 @@ static int should_authenticate(struct ceph_auth_client *ac)  	return xi->starting;  } +static int build_request(struct ceph_auth_client *ac, void *buf, void *end) +{ +	return 0; +} +  /*   * the generic auth code decode the global_id, and we carry no actual   * authenticate state, so nothing happens here. 
@@ -59,9 +64,7 @@ static int handle_reply(struct ceph_auth_client *ac, int result,   */  static int ceph_auth_none_create_authorizer(  	struct ceph_auth_client *ac, int peer_type, -	struct ceph_authorizer **a, -	void **buf, size_t *len, -	void **reply_buf, size_t *reply_len) +	struct ceph_auth_handshake *auth)  {  	struct ceph_auth_none_info *ai = ac->private;  	struct ceph_none_authorizer *au = &ai->au; @@ -82,11 +85,12 @@ static int ceph_auth_none_create_authorizer(  		dout("built authorizer len %d\n", au->buf_len);  	} -	*a = (struct ceph_authorizer *)au; -	*buf = au->buf; -	*len = au->buf_len; -	*reply_buf = au->reply_buf; -	*reply_len = sizeof(au->reply_buf); +	auth->authorizer = (struct ceph_authorizer *) au; +	auth->authorizer_buf = au->buf; +	auth->authorizer_buf_len = au->buf_len; +	auth->authorizer_reply_buf = au->reply_buf; +	auth->authorizer_reply_buf_len = sizeof (au->reply_buf); +  	return 0;  bad2: @@ -107,6 +111,7 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = {  	.destroy = destroy,  	.is_authenticated = is_authenticated,  	.should_authenticate = should_authenticate, +	.build_request = build_request,  	.handle_reply = handle_reply,  	.create_authorizer = ceph_auth_none_create_authorizer,  	.destroy_authorizer = ceph_auth_none_destroy_authorizer, diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h index ed7d088b1bc..059a3ce4b53 100644 --- a/net/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -23,7 +23,7 @@ struct ceph_auth_none_info {  	struct ceph_none_authorizer au;   /* we only need one; it's static */  }; -extern int ceph_auth_none_init(struct ceph_auth_client *ac); +int ceph_auth_none_init(struct ceph_auth_client *ac);  #endif diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 7fd5dfcf6e1..96238ba95f2 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -298,6 +298,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,  			return -ENOMEM;  	}  	au->service = th->service; +	au->secret_id = th->secret_id;  	msg_a = au->buf->vec.iov_base;  	msg_a->struct_v = 1; @@ -526,9 +527,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,  static int ceph_x_create_authorizer(  	struct ceph_auth_client *ac, int peer_type, -	struct ceph_authorizer **a, -	void **buf, size_t *len, -	void **reply_buf, size_t *reply_len) +	struct ceph_auth_handshake *auth)  {  	struct ceph_x_authorizer *au;  	struct ceph_x_ticket_handler *th; @@ -548,11 +547,32 @@ static int ceph_x_create_authorizer(  		return ret;  	} -	*a = (struct ceph_authorizer *)au; -	*buf = au->buf->vec.iov_base; -	*len = au->buf->vec.iov_len; -	*reply_buf = au->reply_buf; -	*reply_len = sizeof(au->reply_buf); +	auth->authorizer = (struct ceph_authorizer *) au; +	auth->authorizer_buf = au->buf->vec.iov_base; +	auth->authorizer_buf_len = au->buf->vec.iov_len; +	auth->authorizer_reply_buf = au->reply_buf; +	auth->authorizer_reply_buf_len = sizeof (au->reply_buf); + +	return 0; +} + +static int ceph_x_update_authorizer( +	struct ceph_auth_client *ac, int peer_type, +	struct ceph_auth_handshake *auth) +{ +	struct ceph_x_authorizer *au; +	struct ceph_x_ticket_handler *th; + +	th = get_ticket_handler(ac, peer_type); +	if (IS_ERR(th)) +		return PTR_ERR(th); + +	au = (struct ceph_x_authorizer *)auth->authorizer; +	if (au->secret_id < th->secret_id) { +		dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", +		     au->service, au->secret_id, th->secret_id); +		return ceph_x_build_authorizer(ac, th, au); +	}  	return 0;  } @@ -631,7 +651,7 @@ static void 
ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,  	th = get_ticket_handler(ac, peer_type);  	if (!IS_ERR(th)) -		remove_ticket_handler(ac, th); +		memset(&th->validity, 0, sizeof(th->validity));  } @@ -642,6 +662,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = {  	.build_request = ceph_x_build_request,  	.handle_reply = ceph_x_handle_reply,  	.create_authorizer = ceph_x_create_authorizer, +	.update_authorizer = ceph_x_update_authorizer,  	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,  	.destroy_authorizer = ceph_x_destroy_authorizer,  	.invalidate_authorizer = ceph_x_invalidate_authorizer, @@ -662,14 +683,16 @@ int ceph_x_init(struct ceph_auth_client *ac)  		goto out;  	ret = -EINVAL; -	if (!ac->secret) { +	if (!ac->key) {  		pr_err("no secret set (for auth_x protocol)\n");  		goto out_nomem;  	} -	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret); -	if (ret) +	ret = ceph_crypto_key_clone(&xi->secret, ac->key); +	if (ret < 0) { +		pr_err("cannot clone key: %d\n", ret);  		goto out_nomem; +	}  	xi->starting = true;  	xi->ticket_handlers = RB_ROOT; diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index e02da7a5c5a..65ee72082d9 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -13,7 +13,7 @@   */  struct ceph_x_ticket_handler {  	struct rb_node node; -	unsigned service; +	unsigned int service;  	struct ceph_crypto_key session_key;  	struct ceph_timespec validity; @@ -27,8 +27,9 @@ struct ceph_x_ticket_handler {  struct ceph_x_authorizer {  	struct ceph_buffer *buf; -	unsigned service; +	unsigned int service;  	u64 nonce; +	u64 secret_id;  	char reply_buf[128];  /* big enough for encrypted blob */  }; @@ -38,13 +39,13 @@ struct ceph_x_info {  	bool starting;  	u64 server_challenge; -	unsigned have_keys; +	unsigned int have_keys;  	struct rb_root ticket_handlers;  	struct ceph_x_authorizer auth_authorizer;  }; -extern int ceph_x_init(struct ceph_auth_client *ac); +int ceph_x_init(struct ceph_auth_client *ac);  #endif diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 53d8abfa25d..621b5f65407 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -6,6 +6,7 @@  #include <linux/ceph/buffer.h>  #include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */  struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)  { @@ -15,16 +16,10 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)  	if (!b)  		return NULL; -	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); -	if (b->vec.iov_base) { -		b->is_vmalloc = false; -	} else { -		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); -		if (!b->vec.iov_base) { -			kfree(b); -			return NULL; -		} -		b->is_vmalloc = true; +	b->vec.iov_base = ceph_kvmalloc(len, gfp); +	if (!b->vec.iov_base) { +		kfree(b); +		return NULL;  	}  	kref_init(&b->kref); @@ -40,12 +35,7 @@ void ceph_buffer_release(struct kref *kref)  	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);  	dout("buffer_release %p\n", b); -	if (b->vec.iov_base) { -		if (b->is_vmalloc) -			vfree(b->vec.iov_base); -		else -			kfree(b->vec.iov_base); -	} +	ceph_kvfree(b->vec.iov_base);  	kfree(b);  }  EXPORT_SYMBOL(ceph_buffer_release); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f3e4a13fea0..1675021d8c1 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -5,6 +5,8 @@  #include <linux/fs.h>  #include <linux/inet.h>  #include <linux/in6.h> +#include <linux/key.h> +#include <keys/ceph-type.h>  #include <linux/module.h>  #include <linux/mount.h>  
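/*
 * [Editorial aside -- not part of the patch. The buffer.c hunk above
 * replaces the open-coded kmalloc/vmalloc dance with calls to
 * ceph_kvmalloc()/ceph_kvfree(), which this series adds to
 * ceph_common.c (see the hunk that follows).  A minimal self-contained
 * sketch of that allocation strategy, using only the allocator APIs
 * that appear in the patch itself:]
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *kvmalloc_sketch(size_t size, gfp_t flags)
{
	/* try physically contiguous memory first for "cheap" sizes */
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		void *ptr = kmalloc(size, flags | __GFP_NOWARN);
		if (ptr)
			return ptr;
	}
	/* fall back to virtually contiguous memory for large requests */
	return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
}

static void kvfree_sketch(const void *ptr)
{
	if (is_vmalloc_addr(ptr))	/* which allocator satisfied it? */
		vfree(ptr);
	else
		kfree(ptr);
}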
#include <linux/parser.h> @@ -13,15 +15,36 @@  #include <linux/slab.h>  #include <linux/statfs.h>  #include <linux/string.h> +#include <linux/vmalloc.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h> +#include <linux/ceph/ceph_features.h>  #include <linux/ceph/libceph.h>  #include <linux/ceph/debugfs.h>  #include <linux/ceph/decode.h>  #include <linux/ceph/mon_client.h>  #include <linux/ceph/auth.h> +#include "crypto.h" +/* + * Module compatibility interface.  For now it doesn't do anything, + * but its existence signals a certain level of functionality. + * + * The data buffer is used to pass information both to and from + * libceph.  The return value indicates whether libceph determines + * it is compatible with the caller (from another kernel module), + * given the provided data. + * + * The data pointer can be null. + */ +bool libceph_compatible(void *data) +{ +	return true; +} +EXPORT_SYMBOL(libceph_compatible);  /*   * find filename portion of a path (/foo/bar/baz -> baz) @@ -49,6 +72,8 @@ const char *ceph_msg_type_name(int type)  	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";  	case CEPH_MSG_STATFS: return "statfs";  	case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; +	case CEPH_MSG_MON_GET_VERSION: return "mon_get_version"; +	case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply";  	case CEPH_MSG_MDS_MAP: return "mds_map";  	case CEPH_MSG_CLIENT_SESSION: return "client_session";  	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; @@ -62,6 +87,7 @@ const char *ceph_msg_type_name(int type)  	case CEPH_MSG_OSD_MAP: return "osd_map";  	case CEPH_MSG_OSD_OP: return "osd_op";  	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; +	case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";  	default: return "unknown";  	}  } @@ -79,10 +105,7 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)  			return -1;  		}  	} else { -		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);  		memcpy(&client->fsid, fsid, sizeof(*fsid)); -		ceph_debugfs_client_init(client); -		client->have_fsid = true;  	}  	return 0;  } @@ -116,9 +139,29 @@ int ceph_compare_options(struct ceph_options *new_opt,  	if (ret)  		return ret; -	ret = strcmp_null(opt1->secret, opt2->secret); -	if (ret) -		return ret; +	if (opt1->key && !opt2->key) +		return -1; +	if (!opt1->key && opt2->key) +		return 1; +	if (opt1->key && opt2->key) { +		if (opt1->key->type != opt2->key->type) +			return -1; +		if (opt1->key->created.tv_sec != opt2->key->created.tv_sec) +			return -1; +		if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec) +			return -1; +		if (opt1->key->len != opt2->key->len) +			return -1; +		if (opt1->key->key && !opt2->key->key) +			return -1; +		if (!opt1->key->key && opt2->key->key) +			return 1; +		if (opt1->key->key && opt2->key->key) { +			ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len); +			if (ret) +				return ret; +		} +	}  	/* any matching mon ip implies a match */  	for (i = 0; i < opt1->num_mon; i++) { @@ -130,6 +173,25 @@ int ceph_compare_options(struct ceph_options *new_opt,  }  EXPORT_SYMBOL(ceph_compare_options); +void *ceph_kvmalloc(size_t size, gfp_t flags) +{ +	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { +		void *ptr = kmalloc(size, flags | __GFP_NOWARN); +		if (ptr) +			return ptr; +	} + +	return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); +} + +void ceph_kvfree(const void *ptr) +{ +	if (is_vmalloc_addr(ptr)) +		vfree(ptr); +	else +		kfree(ptr); +} +  static int parse_fsid(const char 
*str, struct ceph_fsid *fsid)  { @@ -175,10 +237,13 @@ enum {  	Opt_fsid,  	Opt_name,  	Opt_secret, +	Opt_key,  	Opt_ip,  	Opt_last_string,  	/* string args above */ +	Opt_share,  	Opt_noshare, +	Opt_crc,  	Opt_nocrc,  }; @@ -191,9 +256,12 @@ static match_table_t opt_tokens = {  	{Opt_fsid, "fsid=%s"},  	{Opt_name, "name=%s"},  	{Opt_secret, "secret=%s"}, +	{Opt_key, "key=%s"},  	{Opt_ip, "ip=%s"},  	/* string args above */ +	{Opt_share, "share"},  	{Opt_noshare, "noshare"}, +	{Opt_crc, "crc"},  	{Opt_nocrc, "nocrc"},  	{-1, NULL}  }; @@ -202,24 +270,74 @@ void ceph_destroy_options(struct ceph_options *opt)  {  	dout("destroy_options %p\n", opt);  	kfree(opt->name); -	kfree(opt->secret); +	if (opt->key) { +		ceph_crypto_key_destroy(opt->key); +		kfree(opt->key); +	} +	kfree(opt->mon_addr);  	kfree(opt);  }  EXPORT_SYMBOL(ceph_destroy_options); -int ceph_parse_options(struct ceph_options **popt, char *options, -		       const char *dev_name, const char *dev_name_end, -		       int (*parse_extra_token)(char *c, void *private), -		       void *private) +/* get secret from key store */ +static int get_secret(struct ceph_crypto_key *dst, const char *name) { +	struct key *ukey; +	int key_err; +	int err = 0; +	struct ceph_crypto_key *ckey; + +	ukey = request_key(&key_type_ceph, name, NULL); +	if (!ukey || IS_ERR(ukey)) { +		/* request_key errors don't map nicely to mount(2) +		   errors; don't even try, but still printk */ +		key_err = PTR_ERR(ukey); +		switch (key_err) { +		case -ENOKEY: +			pr_warning("ceph: Mount failed due to key not found: %s\n", name); +			break; +		case -EKEYEXPIRED: +			pr_warning("ceph: Mount failed due to expired key: %s\n", name); +			break; +		case -EKEYREVOKED: +			pr_warning("ceph: Mount failed due to revoked key: %s\n", name); +			break; +		default: +			pr_warning("ceph: Mount failed due to unknown key error" +			       " %d: %s\n", key_err, name); +		} +		err = -EPERM; +		goto out; +	} + +	ckey = ukey->payload.data; +	err = ceph_crypto_key_clone(dst, ckey); +	if (err) +		goto out_key; +	/* pass through, err is 0 */ + +out_key: +	key_put(ukey); +out: +	return err; +} + +struct ceph_options * +ceph_parse_options(char *options, const char *dev_name, +			const char *dev_name_end, +			int (*parse_extra_token)(char *c, void *private), +			void *private)  {  	struct ceph_options *opt;  	const char *c;  	int err = -ENOMEM;  	substring_t argstr[MAX_OPT_ARGS]; +	if (current->nsproxy->net_ns != &init_net) +		return ERR_PTR(-EINVAL); +  	opt = kzalloc(sizeof(*opt), GFP_KERNEL);  	if (!opt) -		return err; +		return ERR_PTR(-ENOMEM);  	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),  				GFP_KERNEL);  	if (!opt->mon_addr) @@ -230,7 +348,6 @@ int ceph_parse_options(struct ceph_options **popt, char *options,  	/* start with defaults */  	opt->flags = CEPH_OPT_DEFAULT; -	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;  	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;  	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */  	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */ @@ -294,14 +411,29 @@ int ceph_parse_options(struct ceph_options **popt, char *options,  					      GFP_KERNEL);  			break;  		case Opt_secret: -			opt->secret = kstrndup(argstr[0].from, -						argstr[0].to-argstr[0].from, -						GFP_KERNEL); +		        opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); +			if (!opt->key) { +				err = -ENOMEM; +				goto out; +			} +			err = ceph_crypto_key_unarmor(opt->key, argstr[0].from); +			if (err < 0) +				goto out; +			break; +		
case Opt_key: +		        opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); +			if (!opt->key) { +				err = -ENOMEM; +				goto out; +			} +			err = get_secret(opt->key, argstr[0].from); +			if (err < 0) +				goto out;  			break;  			/* misc */  		case Opt_osdtimeout: -			opt->osd_timeout = intval; +			pr_warning("ignoring deprecated osdtimeout option\n");  			break;  		case Opt_osdkeepalivetimeout:  			opt->osd_keepalive_timeout = intval; @@ -313,10 +445,16 @@ int ceph_parse_options(struct ceph_options **popt, char *options,  			opt->mount_timeout = intval;  			break; +		case Opt_share: +			opt->flags &= ~CEPH_OPT_NOSHARE; +			break;  		case Opt_noshare:  			opt->flags |= CEPH_OPT_NOSHARE;  			break; +		case Opt_crc: +			opt->flags &= ~CEPH_OPT_NOCRC; +			break;  		case Opt_nocrc:  			opt->flags |= CEPH_OPT_NOCRC;  			break; @@ -327,12 +465,11 @@ int ceph_parse_options(struct ceph_options **popt, char *options,  	}  	/* success */ -	*popt = opt; -	return 0; +	return opt;  out:  	ceph_destroy_options(opt); -	return err; +	return ERR_PTR(err);  }  EXPORT_SYMBOL(ceph_parse_options); @@ -345,9 +482,12 @@ EXPORT_SYMBOL(ceph_client_id);  /*   * create a fresh client instance   */ -struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, +				       u64 supported_features, +				       u64 required_features)  {  	struct ceph_client *client; +	struct ceph_entity_addr *myaddr = NULL;  	int err = -ENOMEM;  	client = kzalloc(sizeof(*client), GFP_KERNEL); @@ -362,10 +502,18 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)  	client->auth_err = 0;  	client->extra_mon_dispatch = NULL; -	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT; -	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT; - -	client->msgr = NULL; +	client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | +		supported_features; +	client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | +		required_features; + +	/* msgr */ +	if (ceph_test_opt(client, MYIP)) +		myaddr = &client->options->my_addr; +	ceph_messenger_init(&client->msgr, myaddr, +		client->supported_features, +		client->required_features, +		ceph_test_opt(client, NOCRC));  	/* subsystems */  	err = ceph_monc_init(&client->monc, client); @@ -389,23 +537,15 @@ void ceph_destroy_client(struct ceph_client *client)  {  	dout("destroy_client %p\n", client); +	atomic_set(&client->msgr.stopping, 1); +  	/* unmount */  	ceph_osdc_stop(&client->osdc); -	/* -	 * make sure mds and osd connections close out before destroying -	 * the auth module, which is needed to free those connections' -	 * ceph_authorizers. 
-	 */ -	ceph_msgr_flush(); -  	ceph_monc_stop(&client->monc);  	ceph_debugfs_client_cleanup(client); -	if (client->msgr) -		ceph_messenger_destroy(client->msgr); -  	ceph_destroy_options(client->options);  	kfree(client); @@ -427,24 +567,9 @@ static int have_mon_and_osd_map(struct ceph_client *client)   */  int __ceph_open_session(struct ceph_client *client, unsigned long started)  { -	struct ceph_entity_addr *myaddr = NULL;  	int err;  	unsigned long timeout = client->options->mount_timeout * HZ; -	/* initialize the messenger */ -	if (client->msgr == NULL) { -		if (ceph_test_opt(client, MYIP)) -			myaddr = &client->options->my_addr; -		client->msgr = ceph_messenger_create(myaddr, -					client->supported_features, -					client->required_features); -		if (IS_ERR(client->msgr)) { -			client->msgr = NULL; -			return PTR_ERR(client->msgr); -		} -		client->msgr->nocrc = ceph_test_opt(client, NOCRC); -	} -  	/* open session, and wait for mon and osd maps */  	err = ceph_monc_open_session(&client->monc);  	if (err < 0) @@ -495,17 +620,27 @@ static int __init init_ceph_lib(void)  	if (ret < 0)  		goto out; -	ret = ceph_msgr_init(); +	ret = ceph_crypto_init();  	if (ret < 0)  		goto out_debugfs; -	pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", -		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, -		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, -		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); +	ret = ceph_msgr_init(); +	if (ret < 0) +		goto out_crypto; + +	ret = ceph_osdc_setup(); +	if (ret < 0) +		goto out_msgr; + +	pr_info("loaded (mon/osd proto %d/%d)\n", +		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);  	return 0; +out_msgr: +	ceph_msgr_exit(); +out_crypto: +	ceph_crypto_shutdown();  out_debugfs:  	ceph_debugfs_cleanup();  out: @@ -515,7 +650,9 @@ out:  static void __exit exit_ceph_lib(void)  {  	dout("exit_ceph_lib\n"); +	ceph_osdc_cleanup();  	ceph_msgr_exit(); +	ceph_crypto_shutdown();  	ceph_debugfs_cleanup();  } diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c index a3a3a31d3c3..41466ccb972 100644 --- a/net/ceph/ceph_fs.c +++ b/net/ceph/ceph_fs.c @@ -36,16 +36,19 @@ int ceph_flags_to_mode(int flags)  	if ((flags & O_DIRECTORY) == O_DIRECTORY)  		return CEPH_FILE_MODE_PIN;  #endif -	if ((flags & O_APPEND) == O_APPEND) -		flags |= O_WRONLY; -	if ((flags & O_ACCMODE) == O_RDWR) -		mode = CEPH_FILE_MODE_RDWR; -	else if ((flags & O_ACCMODE) == O_WRONLY) +	switch (flags & O_ACCMODE) { +	case O_WRONLY:  		mode = CEPH_FILE_MODE_WR; -	else +		break; +	case O_RDONLY:  		mode = CEPH_FILE_MODE_RD; - +		break; +	case O_RDWR: +	case O_ACCMODE: /* this is what the VFS does */ +		mode = CEPH_FILE_MODE_RDWR; +		break; +	}  #ifdef O_LAZY  	if (flags & O_LAZY)  		mode |= CEPH_FILE_MODE_LAZY; diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 815ef882679..67bb1f11e61 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -1,5 +1,6 @@  #include <linux/ceph/types.h> +#include <linux/module.h>  /*   * Robert Jenkin's hash function. 
@@ -19,7 +20,7 @@  		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\  	} while (0) -unsigned ceph_str_hash_rjenkins(const char *str, unsigned length) +unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length)  {  	const unsigned char *k = (const unsigned char *)str;  	__u32 a, b, c;  /* the internal state */ @@ -80,7 +81,7 @@ unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)  /*   * linux dcache hash   */ -unsigned ceph_str_hash_linux(const char *str, unsigned length) +unsigned int ceph_str_hash_linux(const char *str, unsigned int length)  {  	unsigned long hash = 0;  	unsigned char c; @@ -93,7 +94,7 @@ unsigned ceph_str_hash_linux(const char *str, unsigned length)  } -unsigned ceph_str_hash(int type, const char *s, unsigned len) +unsigned int ceph_str_hash(int type, const char *s, unsigned int len)  {  	switch (type) {  	case CEPH_STR_HASH_LINUX: @@ -104,6 +105,7 @@ unsigned ceph_str_hash(int type, const char *s, unsigned len)  		return -1;  	}  } +EXPORT_SYMBOL(ceph_str_hash);  const char *ceph_str_hash_name(int type)  { @@ -116,3 +118,4 @@ const char *ceph_str_hash_name(int type)  		return "unknown";  	}  } +EXPORT_SYMBOL(ceph_str_hash_name); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 3fbda04de29..1348df96fe1 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -21,9 +21,15 @@ const char *ceph_osd_op_name(int op)  	switch (op) {  	case CEPH_OSD_OP_READ: return "read";  	case CEPH_OSD_OP_STAT: return "stat"; +	case CEPH_OSD_OP_MAPEXT: return "mapext"; +	case CEPH_OSD_OP_SPARSE_READ: return "sparse-read"; +	case CEPH_OSD_OP_NOTIFY: return "notify"; +	case CEPH_OSD_OP_NOTIFY_ACK: return "notify-ack"; +	case CEPH_OSD_OP_ASSERT_VER: return "assert-version";  	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; +	case CEPH_OSD_OP_CREATE: return "create";  	case CEPH_OSD_OP_WRITE: return "write";  	case CEPH_OSD_OP_DELETE: return "delete";  	case CEPH_OSD_OP_TRUNCATE: return "truncate"; @@ -39,6 +45,11 @@ const char *ceph_osd_op_name(int op)  	case CEPH_OSD_OP_TMAPUP: return "tmapup";  	case CEPH_OSD_OP_TMAPGET: return "tmapget";  	case CEPH_OSD_OP_TMAPPUT: return "tmapput"; +	case CEPH_OSD_OP_WATCH: return "watch"; + +	case CEPH_OSD_OP_CLONERANGE: return "clonerange"; +	case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version"; +	case CEPH_OSD_OP_SRC_CMPXATTR: return "src-cmpxattr";  	case CEPH_OSD_OP_GETXATTR: return "getxattr";  	case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; @@ -53,6 +64,10 @@ const char *ceph_osd_op_name(int op)  	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";  	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";  	case CEPH_OSD_OP_SCRUB: return "scrub"; +	case CEPH_OSD_OP_SCRUB_RESERVE: return "scrub-reserve"; +	case CEPH_OSD_OP_SCRUB_UNRESERVE: return "scrub-unreserve"; +	case CEPH_OSD_OP_SCRUB_STOP: return "scrub-stop"; +	case CEPH_OSD_OP_SCRUB_MAP: return "scrub-map";  	case CEPH_OSD_OP_WRLOCK: return "wrlock";  	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; @@ -64,10 +79,34 @@ const char *ceph_osd_op_name(int op)  	case CEPH_OSD_OP_CALL: return "call";  	case CEPH_OSD_OP_PGLS: return "pgls"; +	case CEPH_OSD_OP_PGLS_FILTER: return "pgls-filter"; +	case CEPH_OSD_OP_OMAPGETKEYS: return "omap-get-keys"; +	case CEPH_OSD_OP_OMAPGETVALS: return "omap-get-vals"; +	case CEPH_OSD_OP_OMAPGETHEADER: return "omap-get-header"; +	case CEPH_OSD_OP_OMAPGETVALSBYKEYS: return "omap-get-vals-by-keys"; +	case CEPH_OSD_OP_OMAPSETVALS: return "omap-set-vals"; +	case CEPH_OSD_OP_OMAPSETHEADER: return 
"omap-set-header"; +	case CEPH_OSD_OP_OMAPCLEAR: return "omap-clear"; +	case CEPH_OSD_OP_OMAPRMKEYS: return "omap-rm-keys";  	}  	return "???";  } +const char *ceph_osd_state_name(int s) +{ +	switch (s) { +	case CEPH_OSD_EXISTS: +		return "exists"; +	case CEPH_OSD_UP: +		return "up"; +	case CEPH_OSD_AUTOOUT: +		return "autoout"; +	case CEPH_OSD_NEW: +		return "new"; +	default: +		return "???"; +	} +}  const char *ceph_pool_op_name(int op)  { diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index d6ebb13a18a..16bc199d9a6 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -26,9 +26,9 @@ const char *crush_bucket_alg_name(int alg)   * @b: bucket pointer   * @p: item index in bucket   */ -int crush_get_bucket_item_weight(struct crush_bucket *b, int p) +int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)  { -	if (p >= b->size) +	if ((__u32)p >= b->size)  		return 0;  	switch (b->alg) { @@ -37,38 +37,13 @@ int crush_get_bucket_item_weight(struct crush_bucket *b, int p)  	case CRUSH_BUCKET_LIST:  		return ((struct crush_bucket_list *)b)->item_weights[p];  	case CRUSH_BUCKET_TREE: -		if (p & 1) -			return ((struct crush_bucket_tree *)b)->node_weights[p]; -		return 0; +		return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];  	case CRUSH_BUCKET_STRAW:  		return ((struct crush_bucket_straw *)b)->item_weights[p];  	}  	return 0;  } -/** - * crush_calc_parents - Calculate parent vectors for the given crush map. - * @map: crush_map pointer - */ -void crush_calc_parents(struct crush_map *map) -{ -	int i, b, c; - -	for (b = 0; b < map->max_buckets; b++) { -		if (map->buckets[b] == NULL) -			continue; -		for (i = 0; i < map->buckets[b]->size; i++) { -			c = map->buckets[b]->items[i]; -			BUG_ON(c >= map->max_devices || -			       c < -map->max_buckets); -			if (c >= 0) -				map->device_parents[c] = map->buckets[b]->id; -			else -				map->bucket_parents[-1-c] = map->buckets[b]->id; -		} -	} -} -  void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)  {  	kfree(b->h.perm); @@ -87,6 +62,8 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)  void crush_destroy_bucket_tree(struct crush_bucket_tree *b)  { +	kfree(b->h.perm); +	kfree(b->h.items);  	kfree(b->node_weights);  	kfree(b);  } @@ -124,10 +101,9 @@ void crush_destroy_bucket(struct crush_bucket *b)   */  void crush_destroy(struct crush_map *map)  { -	int b; -  	/* buckets */  	if (map->buckets) { +		__s32 b;  		for (b = 0; b < map->max_buckets; b++) {  			if (map->buckets[b] == NULL)  				continue; @@ -138,14 +114,16 @@ void crush_destroy(struct crush_map *map)  	/* rules */  	if (map->rules) { +		__u32 b;  		for (b = 0; b < map->max_rules; b++) -			kfree(map->rules[b]); +			crush_destroy_rule(map->rules[b]);  		kfree(map->rules);  	} -	kfree(map->bucket_parents); -	kfree(map->device_parents);  	kfree(map);  } - +void crush_destroy_rule(struct crush_rule *rule) +{ +	kfree(rule); +} diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 42599e31dca..a1ef53c0441 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -20,6 +20,7 @@  #include <linux/crush/crush.h>  #include <linux/crush/hash.h> +#include <linux/crush/mapper.h>  /*   * Implement the core CRUSH mapping algorithm. 
@@ -32,9 +33,9 @@   * @type: storage ruleset type (user defined)   * @size: output set size   */ -int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) +int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)  { -	int i; +	__u32 i;  	for (i = 0; i < map->max_rules; i++) {  		if (map->rules[i] && @@ -68,11 +69,11 @@ int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)  static int bucket_perm_choose(struct crush_bucket *bucket,  			      int x, int r)  { -	unsigned pr = r % bucket->size; -	unsigned i, s; +	unsigned int pr = r % bucket->size; +	unsigned int i, s;  	/* start a new permutation if @x has changed */ -	if (bucket->perm_x != x || bucket->perm_n == 0) { +	if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {  		dprintk("bucket %d new x=%d\n", bucket->id, x);  		bucket->perm_x = x; @@ -100,13 +101,13 @@ static int bucket_perm_choose(struct crush_bucket *bucket,  	for (i = 0; i < bucket->perm_n; i++)  		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);  	while (bucket->perm_n <= pr) { -		unsigned p = bucket->perm_n; +		unsigned int p = bucket->perm_n;  		/* no point in swapping the final entry */  		if (p < bucket->size - 1) {  			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %  				(bucket->size - p);  			if (i) { -				unsigned t = bucket->perm[p + i]; +				unsigned int t = bucket->perm[p + i];  				bucket->perm[p + i] = bucket->perm[p];  				bucket->perm[p] = t;  			} @@ -152,8 +153,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,  			return bucket->h.items[i];  	} -	BUG_ON(1); -	return 0; +	dprintk("bad list sums for bucket %d\n", bucket->h.id); +	return bucket->h.items[0];  } @@ -188,7 +189,7 @@ static int terminal(int x)  static int bucket_tree_choose(struct crush_bucket_tree *bucket,  			      int x, int r)  { -	int n, l; +	int n;  	__u32 w;  	__u64 t; @@ -196,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,  	n = bucket->num_nodes >> 1;  	while (!terminal(n)) { +		int l;  		/* pick point in [0, w) */  		w = bucket->node_weights[n];  		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, @@ -219,7 +221,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,  static int bucket_straw_choose(struct crush_bucket_straw *bucket,  			       int x, int r)  { -	int i; +	__u32 i;  	int high = 0;  	__u64 high_draw = 0;  	__u64 draw; @@ -239,6 +241,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,  static int crush_bucket_choose(struct crush_bucket *in, int x, int r)  {  	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); +	BUG_ON(in->size == 0);  	switch (in->alg) {  	case CRUSH_BUCKET_UNIFORM:  		return bucket_uniform_choose((struct crush_bucket_uniform *)in, @@ -253,7 +256,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)  		return bucket_straw_choose((struct crush_bucket_straw *)in,  					   x, r);  	default: -		BUG_ON(1); +		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);  		return in->items[0];  	}  } @@ -262,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)   * true if device is marked "out" (failed, fully offloaded)   * of the cluster   */ -static int is_out(struct crush_map *map, __u32 *weight, int item, int x) +static int is_out(const struct crush_map *map, +		  const __u32 *weight, int weight_max, +		  int item, int x)  { +	if (item >= weight_max) +		return 1;  	if (weight[item] >= 0x10000)  		return 0;  	if (weight[item] == 0) @@ -275,7 
+282,7 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x)  }  /** - * crush_choose - choose numrep distinct items of given type + * crush_choose_firstn - choose numrep distinct items of given type   * @map: the crush_map   * @bucket: the bucket we are choose an item from   * @x: crush input value @@ -283,20 +290,31 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x)   * @type: the type of item to choose   * @out: pointer to output vector   * @outpos: our position in that vector - * @firstn: true if choosing "first n" items, false if choosing "indep" - * @recurse_to_leaf: true if we want one device under each item of given type + * @tries: number of attempts to make + * @recurse_tries: number of attempts to have recursive chooseleaf make + * @local_retries: localized retries + * @local_fallback_retries: localized fallback retries + * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @vary_r: pass r to recursive calls   * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent   */ -static int crush_choose(struct crush_map *map, -			struct crush_bucket *bucket, -			__u32 *weight, -			int x, int numrep, int type, -			int *out, int outpos, -			int firstn, int recurse_to_leaf, -			int *out2) +static int crush_choose_firstn(const struct crush_map *map, +			       struct crush_bucket *bucket, +			       const __u32 *weight, int weight_max, +			       int x, int numrep, int type, +			       int *out, int outpos, +			       unsigned int tries, +			       unsigned int recurse_tries, +			       unsigned int local_retries, +			       unsigned int local_fallback_retries, +			       int recurse_to_leaf, +			       unsigned int vary_r, +			       int *out2, +			       int parent_r)  {  	int rep; -	int ftotal, flocal; +	unsigned int ftotal, flocal;  	int retry_descent, retry_bucket, skip_rep;  	struct crush_bucket *in = bucket;  	int r; @@ -304,10 +322,12 @@ static int crush_choose(struct crush_map *map,  	int item = 0;  	int itemtype;  	int collide, reject; -	const int orig_tries = 5; /* attempts before we fall back to search */ -	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", -		bucket->id, x, outpos, numrep); +	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", +		recurse_to_leaf ? 
"_LEAF" : "", +		bucket->id, x, outpos, numrep, +		tries, recurse_tries, local_retries, local_fallback_retries, +		parent_r);  	for (rep = outpos; rep < numrep; rep++) {  		/* keep trying until we get a non-out, non-colliding item */ @@ -322,39 +342,26 @@ static int crush_choose(struct crush_map *map,  			do {  				collide = 0;  				retry_bucket = 0; -				r = rep; -				if (in->alg == CRUSH_BUCKET_UNIFORM) { -					/* be careful */ -					if (firstn || numrep >= in->size) -						/* r' = r + f_total */ -						r += ftotal; -					else if (in->size % numrep == 0) -						/* r'=r+(n+1)*f_local */ -						r += (numrep+1) * -							(flocal+ftotal); -					else -						/* r' = r + n*f_local */ -						r += numrep * (flocal+ftotal); -				} else { -					if (firstn) -						/* r' = r + f_total */ -						r += ftotal; -					else -						/* r' = r + n*f_local */ -						r += numrep * (flocal+ftotal); -				} +				r = rep + parent_r; +				/* r' = r + f_total */ +				r += ftotal;  				/* bucket choose */  				if (in->size == 0) {  					reject = 1;  					goto reject;  				} -				if (flocal >= (in->size>>1) && -				    flocal > orig_tries) +				if (local_fallback_retries > 0 && +				    flocal >= (in->size>>1) && +				    flocal > local_fallback_retries)  					item = bucket_perm_choose(in, x, r);  				else  					item = crush_bucket_choose(in, x, r); -				BUG_ON(item >= map->max_devices); +				if (item >= map->max_devices) { +					dprintk("   bad item %d\n", item); +					skip_rep = 1; +					break; +				}  				/* desired type? */  				if (item < 0) @@ -365,8 +372,12 @@ static int crush_choose(struct crush_map *map,  				/* keep going? */  				if (itemtype != type) { -					BUG_ON(item >= 0 || -					       (-1-item) >= map->max_buckets); +					if (item >= 0 || +					    (-1-item) >= map->max_buckets) { +						dprintk("   bad item type %d\n", type); +						skip_rep = 1; +						break; +					}  					in = map->buckets[-1-item];  					retry_bucket = 1;  					continue; @@ -381,15 +392,25 @@ static int crush_choose(struct crush_map *map,  				}  				reject = 0; -				if (recurse_to_leaf) { +				if (!collide && recurse_to_leaf) {  					if (item < 0) { -						if (crush_choose(map, +						int sub_r; +						if (vary_r) +							sub_r = r >> (vary_r-1); +						else +							sub_r = 0; +						if (crush_choose_firstn(map,  							 map->buckets[-1-item], -							 weight, +							 weight, weight_max,  							 x, outpos+1, 0,  							 out2, outpos, -							 firstn, 0, -							 NULL) <= outpos) +							 recurse_tries, 0, +							 local_retries, +							 local_fallback_retries, +							 0, +							 vary_r, +							 NULL, +							 sub_r) <= outpos)  							/* didn't get leaf */  							reject = 1;  					} else { @@ -402,6 +423,7 @@ static int crush_choose(struct crush_map *map,  					/* out? 
*/  					if (itemtype == 0)  						reject = is_out(map, weight, +								weight_max,  								item, x);  					else  						reject = 0; @@ -412,20 +434,21 @@ reject:  					ftotal++;  					flocal++; -					if (collide && flocal < 3) +					if (collide && flocal <= local_retries)  						/* retry locally a few times */  						retry_bucket = 1; -					else if (flocal < in->size + orig_tries) +					else if (local_fallback_retries > 0 && +						 flocal <= in->size + local_fallback_retries)  						/* exhaustive bucket search */  						retry_bucket = 1; -					else if (ftotal < 20) +					else if (ftotal < tries)  						/* then retry descent */  						retry_descent = 1;  					else  						/* else give up */  						skip_rep = 1;  					dprintk("  reject %d  collide %d  " -						"ftotal %d  flocal %d\n", +						"ftotal %u  flocal %u\n",  						reject, collide, ftotal,  						flocal);  				} @@ -448,24 +471,179 @@ reject:  /** + * crush_choose_indep: alternative breadth-first positionally stable mapping + * + */ +static void crush_choose_indep(const struct crush_map *map, +			       struct crush_bucket *bucket, +			       const __u32 *weight, int weight_max, +			       int x, int left, int numrep, int type, +			       int *out, int outpos, +			       unsigned int tries, +			       unsigned int recurse_tries, +			       int recurse_to_leaf, +			       int *out2, +			       int parent_r) +{ +	struct crush_bucket *in = bucket; +	int endpos = outpos + left; +	int rep; +	unsigned int ftotal; +	int r; +	int i; +	int item = 0; +	int itemtype; +	int collide; + +	dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", +		bucket->id, x, outpos, numrep); + +	/* initially my result is undefined */ +	for (rep = outpos; rep < endpos; rep++) { +		out[rep] = CRUSH_ITEM_UNDEF; +		if (out2) +			out2[rep] = CRUSH_ITEM_UNDEF; +	} + +	for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { +		for (rep = outpos; rep < endpos; rep++) { +			if (out[rep] != CRUSH_ITEM_UNDEF) +				continue; + +			in = bucket;  /* initial bucket */ + +			/* choose through intervening buckets */ +			for (;;) { +				/* note: we base the choice on the position +				 * even in the nested call.  that means that +				 * if the first layer chooses the same bucket +				 * in a different position, we will tend to +				 * choose a different item in that bucket. +				 * this will involve more devices in data +				 * movement and tend to distribute the load. +				 */ +				r = rep + parent_r; + +				/* be careful */ +				if (in->alg == CRUSH_BUCKET_UNIFORM && +				    in->size % numrep == 0) +					/* r'=r+(n+1)*f_total */ +					r += (numrep+1) * ftotal; +				else +					/* r' = r + n*f_total */ +					r += numrep * ftotal; + +				/* bucket choose */ +				if (in->size == 0) { +					dprintk("   empty bucket\n"); +					break; +				} + +				item = crush_bucket_choose(in, x, r); +				if (item >= map->max_devices) { +					dprintk("   bad item %d\n", item); +					out[rep] = CRUSH_ITEM_NONE; +					if (out2) +						out2[rep] = CRUSH_ITEM_NONE; +					left--; +					break; +				} + +				/* desired type? */ +				if (item < 0) +					itemtype = map->buckets[-1-item]->type; +				else +					itemtype = 0; +				dprintk("  item %d type %d\n", item, itemtype); + +				/* keep going? 
*/ +				if (itemtype != type) { +					if (item >= 0 || +					    (-1-item) >= map->max_buckets) { +						dprintk("   bad item type %d\n", type); +						out[rep] = CRUSH_ITEM_NONE; +						if (out2) +							out2[rep] = +								CRUSH_ITEM_NONE; +						left--; +						break; +					} +					in = map->buckets[-1-item]; +					continue; +				} + +				/* collision? */ +				collide = 0; +				for (i = outpos; i < endpos; i++) { +					if (out[i] == item) { +						collide = 1; +						break; +					} +				} +				if (collide) +					break; + +				if (recurse_to_leaf) { +					if (item < 0) { +						crush_choose_indep(map, +						   map->buckets[-1-item], +						   weight, weight_max, +						   x, 1, numrep, 0, +						   out2, rep, +						   recurse_tries, 0, +						   0, NULL, r); +						if (out2[rep] == CRUSH_ITEM_NONE) { +							/* placed nothing; no leaf */ +							break; +						} +					} else { +						/* we already have a leaf! */ +						out2[rep] = item; +					} +				} + +				/* out? */ +				if (itemtype == 0 && +				    is_out(map, weight, weight_max, item, x)) +					break; + +				/* yay! */ +				out[rep] = item; +				left--; +				break; +			} +		} +	} +	for (rep = outpos; rep < endpos; rep++) { +		if (out[rep] == CRUSH_ITEM_UNDEF) { +			out[rep] = CRUSH_ITEM_NONE; +		} +		if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { +			out2[rep] = CRUSH_ITEM_NONE; +		} +	} +} + +/**   * crush_do_rule - calculate a mapping with the given input and rule   * @map: the crush_map   * @ruleno: the rule id   * @x: hash input   * @result: pointer to result vector   * @result_max: maximum result size - * @force: force initial replica choice; -1 for none + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector + * @scratch: scratch vector for private use; must be >= 3 * result_max   */ -int crush_do_rule(struct crush_map *map, +int crush_do_rule(const struct crush_map *map,  		  int ruleno, int x, int *result, int result_max, -		  int force, __u32 *weight) +		  const __u32 *weight, int weight_max, +		  int *scratch)  {  	int result_len; -	int force_context[CRUSH_MAX_DEPTH]; -	int force_pos = -1; -	int a[CRUSH_MAX_SET]; -	int b[CRUSH_MAX_SET]; -	int c[CRUSH_MAX_SET]; +	int *a = scratch; +	int *b = scratch + result_max; +	int *c = scratch + result_max*2;  	int recurse_to_leaf;  	int *w;  	int wsize = 0; @@ -473,68 +651,83 @@ int crush_do_rule(struct crush_map *map,  	int osize;  	int *tmp;  	struct crush_rule *rule; -	int step; +	__u32 step;  	int i, j;  	int numrep; -	int firstn; -	int rc = -1; +	/* +	 * the original choose_total_tries value was off by one (it +	 * counted "retries" and not "tries").  add one. +	 */ +	int choose_tries = map->choose_total_tries + 1; +	int choose_leaf_tries = 0; +	/* +	 * the local tries values were counted as "retries", though, +	 * and need no adjustment +	 */ +	int choose_local_retries = map->choose_local_tries; +	int choose_local_fallback_retries = map->choose_local_fallback_tries; -	BUG_ON(ruleno >= map->max_rules); +	int vary_r = map->chooseleaf_vary_r; + +	if ((__u32)ruleno >= map->max_rules) { +		dprintk(" bad ruleno %d\n", ruleno); +		return 0; +	}  	rule = map->rules[ruleno];  	result_len = 0;  	w = a;  	o = b; -	/* -	 * determine hierarchical context of force, if any.  note -	 * that this may or may not correspond to the specific types -	 * referenced by the crush rule. 
-	 */ -	if (force >= 0) { -		if (force >= map->max_devices || -		    map->device_parents[force] == 0) { -			/*dprintk("CRUSH: forcefed device dne\n");*/ -			rc = -1;  /* force fed device dne */ -			goto out; -		} -		if (!is_out(map, weight, force, x)) { -			while (1) { -				force_context[++force_pos] = force; -				if (force >= 0) -					force = map->device_parents[force]; -				else -					force = map->bucket_parents[-1-force]; -				if (force == 0) -					break; -			} -		} -	} -  	for (step = 0; step < rule->len; step++) { -		firstn = 0; -		switch (rule->steps[step].op) { +		int firstn = 0; +		struct crush_rule_step *curstep = &rule->steps[step]; + +		switch (curstep->op) {  		case CRUSH_RULE_TAKE: -			w[0] = rule->steps[step].arg1; -			if (force_pos >= 0) { -				BUG_ON(force_context[force_pos] != w[0]); -				force_pos--; -			} +			w[0] = curstep->arg1;  			wsize = 1;  			break; -		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: +		case CRUSH_RULE_SET_CHOOSE_TRIES: +			if (curstep->arg1 > 0) +				choose_tries = curstep->arg1; +			break; + +		case CRUSH_RULE_SET_CHOOSELEAF_TRIES: +			if (curstep->arg1 > 0) +				choose_leaf_tries = curstep->arg1; +			break; + +		case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: +			if (curstep->arg1 >= 0) +				choose_local_retries = curstep->arg1; +			break; + +		case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: +			if (curstep->arg1 >= 0) +				choose_local_fallback_retries = curstep->arg1; +			break; + +		case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: +			if (curstep->arg1 >= 0) +				vary_r = curstep->arg1; +			break; + +		case CRUSH_RULE_CHOOSELEAF_FIRSTN:  		case CRUSH_RULE_CHOOSE_FIRSTN:  			firstn = 1; -		case CRUSH_RULE_CHOOSE_LEAF_INDEP: +			/* fall through */ +		case CRUSH_RULE_CHOOSELEAF_INDEP:  		case CRUSH_RULE_CHOOSE_INDEP: -			BUG_ON(wsize == 0); +			if (wsize == 0) +				break;  			recurse_to_leaf = -				rule->steps[step].op == -				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN || -				rule->steps[step].op == -				CRUSH_RULE_CHOOSE_LEAF_INDEP; +				curstep->op == +				 CRUSH_RULE_CHOOSELEAF_FIRSTN || +				curstep->op == +				CRUSH_RULE_CHOOSELEAF_INDEP;  			/* reset output */  			osize = 0; @@ -545,42 +738,60 @@ int crush_do_rule(struct crush_map *map,  				 * basically, numrep <= 0 means relative to  				 * the provided result_max  				 */ -				numrep = rule->steps[step].arg1; +				numrep = curstep->arg1;  				if (numrep <= 0) {  					numrep += result_max;  					if (numrep <= 0)  						continue;  				}  				j = 0; -				if (osize == 0 && force_pos >= 0) { -					/* skip any intermediate types */ -					while (force_pos && -					       force_context[force_pos] < 0 && -					       rule->steps[step].arg2 != -					       map->buckets[-1 - -					       force_context[force_pos]]->type) -						force_pos--; -					o[osize] = force_context[force_pos]; -					if (recurse_to_leaf) -						c[osize] = force_context[0]; -					j++; -					force_pos--; +				if (firstn) { +					int recurse_tries; +					if (choose_leaf_tries) +						recurse_tries = +							choose_leaf_tries; +					else if (map->chooseleaf_descend_once) +						recurse_tries = 1; +					else +						recurse_tries = choose_tries; +					osize += crush_choose_firstn( +						map, +						map->buckets[-1-w[i]], +						weight, weight_max, +						x, numrep, +						curstep->arg2, +						o+osize, j, +						choose_tries, +						recurse_tries, +						choose_local_retries, +						choose_local_fallback_retries, +						recurse_to_leaf, +						vary_r, +						c+osize, +						0); +				} else { +					crush_choose_indep( +						map, +						map->buckets[-1-w[i]], +						
weight, weight_max, +						x, numrep, numrep, +						curstep->arg2, +						o+osize, j, +						choose_tries, +						choose_leaf_tries ? +						   choose_leaf_tries : 1, +						recurse_to_leaf, +						c+osize, +						0); +					osize += numrep;  				} -				osize += crush_choose(map, -						      map->buckets[-1-w[i]], -						      weight, -						      x, numrep, -						      rule->steps[step].arg2, -						      o+osize, j, -						      firstn, -						      recurse_to_leaf, c+osize);  			}  			if (recurse_to_leaf)  				/* copy final _leaf_ values to output set */  				memcpy(o, c, osize*sizeof(*o)); -			/* swap t and w arrays */ +			/* swap o and w arrays */  			tmp = o;  			o = w;  			w = tmp; @@ -597,13 +808,12 @@ int crush_do_rule(struct crush_map *map,  			break;  		default: -			BUG_ON(1); +			dprintk(" unknown op %d at step %d\n", +				curstep->op, step); +			break;  		}  	} -	rc = result_len; - -out: -	return rc; +	return result_len;  } diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 7b505b0c983..6e7a236525b 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -5,10 +5,22 @@  #include <linux/scatterlist.h>  #include <linux/slab.h>  #include <crypto/hash.h> +#include <linux/key-type.h> +#include <keys/ceph-type.h>  #include <linux/ceph/decode.h>  #include "crypto.h" +int ceph_crypto_key_clone(struct ceph_crypto_key *dst, +			  const struct ceph_crypto_key *src) +{ +	memcpy(dst, src, sizeof(struct ceph_crypto_key)); +	dst->key = kmemdup(src->key, src->len, GFP_NOFS); +	if (!dst->key) +		return -ENOMEM; +	return 0; +} +  int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)  {  	if (*p + sizeof(u16) + sizeof(key->created) + @@ -410,3 +422,66 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,  		return -EINVAL;  	}  } + +static int ceph_key_instantiate(struct key *key, +				struct key_preparsed_payload *prep) +{ +	struct ceph_crypto_key *ckey; +	size_t datalen = prep->datalen; +	int ret; +	void *p; + +	ret = -EINVAL; +	if (datalen <= 0 || datalen > 32767 || !prep->data) +		goto err; + +	ret = key_payload_reserve(key, datalen); +	if (ret < 0) +		goto err; + +	ret = -ENOMEM; +	ckey = kmalloc(sizeof(*ckey), GFP_KERNEL); +	if (!ckey) +		goto err; + +	/* TODO ceph_crypto_key_decode should really take const input */ +	p = (void *)prep->data; +	ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen); +	if (ret < 0) +		goto err_ckey; + +	key->payload.data = ckey; +	return 0; + +err_ckey: +	kfree(ckey); +err: +	return ret; +} + +static int ceph_key_match(const struct key *key, const void *description) +{ +	return strcmp(key->description, description) == 0; +} + +static void ceph_key_destroy(struct key *key) { +	struct ceph_crypto_key *ckey = key->payload.data; + +	ceph_crypto_key_destroy(ckey); +	kfree(ckey); +} + +struct key_type key_type_ceph = { +	.name		= "ceph", +	.instantiate	= ceph_key_instantiate, +	.match		= ceph_key_match, +	.destroy	= ceph_key_destroy, +}; + +int ceph_crypto_init(void) { +	return register_key_type(&key_type_ceph); +} + +void ceph_crypto_shutdown(void) { +	unregister_key_type(&key_type_ceph); +} diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index f9eccace592..d1498224c49 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -16,33 +16,36 @@ struct ceph_crypto_key {  static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)  { -	kfree(key->key); +	if (key) +		kfree(key->key);  } -extern int ceph_crypto_key_encode(struct ceph_crypto_key *key, -				  void **p, void *end); 
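/*
 * [Editorial aside -- not part of the patch. The crush/mapper.c changes
 * above rework the crush_do_rule() contract: the on-stack CRUSH_MAX_SET
 * work arrays are gone, and the caller now supplies the weight vector
 * plus a scratch area of at least 3 * result_max ints.  A hypothetical
 * caller under those assumptions; the wrapper name is illustrative:]
 */
#include <linux/slab.h>
#include <linux/crush/crush.h>
#include <linux/crush/mapper.h>

static int map_with_rule(const struct crush_map *map, int ruleno, int x,
			 const __u32 *weight, int weight_max,
			 int *result, int result_max)
{
	int *scratch;
	int len;

	/* crush_do_rule() documents scratch as >= 3 * result_max ints */
	scratch = kmalloc(3 * result_max * sizeof(*scratch), GFP_NOFS);
	if (!scratch)
		return -ENOMEM;

	len = crush_do_rule(map, ruleno, x, result, result_max,
			    weight, weight_max, scratch);

	kfree(scratch);
	return len;	/* number of entries written to result[] */
}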
-extern int ceph_crypto_key_decode(struct ceph_crypto_key *key, -				  void **p, void *end); -extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); +int ceph_crypto_key_clone(struct ceph_crypto_key *dst, +			  const struct ceph_crypto_key *src); +int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end); +int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end); +int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);  /* crypto.c */ -extern int ceph_decrypt(struct ceph_crypto_key *secret, -			void *dst, size_t *dst_len, -			const void *src, size_t src_len); -extern int ceph_encrypt(struct ceph_crypto_key *secret, -			void *dst, size_t *dst_len, -			const void *src, size_t src_len); -extern int ceph_decrypt2(struct ceph_crypto_key *secret, -			void *dst1, size_t *dst1_len, -			void *dst2, size_t *dst2_len, -			const void *src, size_t src_len); -extern int ceph_encrypt2(struct ceph_crypto_key *secret, -			 void *dst, size_t *dst_len, -			 const void *src1, size_t src1_len, -			 const void *src2, size_t src2_len); +int ceph_decrypt(struct ceph_crypto_key *secret, +		 void *dst, size_t *dst_len, +		 const void *src, size_t src_len); +int ceph_encrypt(struct ceph_crypto_key *secret, +		 void *dst, size_t *dst_len, +		 const void *src, size_t src_len); +int ceph_decrypt2(struct ceph_crypto_key *secret, +		  void *dst1, size_t *dst1_len, +		  void *dst2, size_t *dst2_len, +		  const void *src, size_t src_len); +int ceph_encrypt2(struct ceph_crypto_key *secret, +		  void *dst, size_t *dst_len, +		  const void *src1, size_t src1_len, +		  const void *src2, size_t src2_len); +int ceph_crypto_init(void); +void ceph_crypto_shutdown(void);  /* armor.c */ -extern int ceph_armor(char *dst, const char *src, const char *end); -extern int ceph_unarmor(char *dst, const char *src, const char *end); +int ceph_armor(char *dst, const char *src, const char *end); +int ceph_unarmor(char *dst, const char *src, const char *end);  #endif diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 27d4ea315d1..d1a62c69a9f 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -53,34 +53,55 @@ static int osdmap_show(struct seq_file *s, void *p)  {  	int i;  	struct ceph_client *client = s->private; +	struct ceph_osdmap *map = client->osdc.osdmap;  	struct rb_node *n; -	if (client->osdc.osdmap == NULL) +	if (map == NULL)  		return 0; -	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + +	seq_printf(s, "epoch %d\n", map->epoch);  	seq_printf(s, "flags%s%s\n", -		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? -		   " NEARFULL" : "", -		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? -		   " FULL" : ""); -	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { +		   (map->flags & CEPH_OSDMAP_NEARFULL) ?  " NEARFULL" : "", +		   (map->flags & CEPH_OSDMAP_FULL) ?  
" FULL" : ""); + +	for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {  		struct ceph_pg_pool_info *pool =  			rb_entry(n, struct ceph_pg_pool_info, node); -		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", -			   pool->id, pool->v.pg_num, pool->pg_num_mask, -			   pool->v.lpg_num, pool->lpg_num_mask); + +		seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", +			   pool->id, pool->pg_num, pool->pg_num_mask, +			   pool->read_tier, pool->write_tier);  	} -	for (i = 0; i < client->osdc.osdmap->max_osd; i++) { -		struct ceph_entity_addr *addr = -			&client->osdc.osdmap->osd_addr[i]; -		int state = client->osdc.osdmap->osd_state[i]; +	for (i = 0; i < map->max_osd; i++) { +		struct ceph_entity_addr *addr = &map->osd_addr[i]; +		int state = map->osd_state[i];  		char sb[64]; -		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", +		seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",  			   i, ceph_pr_addr(&addr->in_addr), -			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16), -			   ceph_osdmap_state_str(sb, sizeof(sb), state)); +			   ((map->osd_weight[i]*100) >> 16), +			   ceph_osdmap_state_str(sb, sizeof(sb), state), +			   ((ceph_get_primary_affinity(map, i)*100) >> 16));  	} +	for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { +		struct ceph_pg_mapping *pg = +			rb_entry(n, struct ceph_pg_mapping, node); + +		seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, +			   pg->pgid.seed); +		for (i = 0; i < pg->pg_temp.len; i++) +			seq_printf(s, "%s%d", (i == 0 ? "" : ","), +				   pg->pg_temp.osds[i]); +		seq_printf(s, "]\n"); +	} +	for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) { +		struct ceph_pg_mapping *pg = +			rb_entry(n, struct ceph_pg_mapping, node); + +		seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, +			   pg->pgid.seed, pg->primary_temp.osd); +	} +  	return 0;  } @@ -94,9 +115,9 @@ static int monc_show(struct seq_file *s, void *p)  	mutex_lock(&monc->mutex);  	if (monc->have_mdsmap) -		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); +		seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap);  	if (monc->have_osdmap) -		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); +		seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap);  	if (monc->want_next_osdmap)  		seq_printf(s, "want next osdmap\n"); @@ -105,9 +126,13 @@ static int monc_show(struct seq_file *s, void *p)  		req = rb_entry(rp, struct ceph_mon_generic_request, node);  		op = le16_to_cpu(req->request->hdr.type);  		if (op == CEPH_MSG_STATFS) -			seq_printf(s, "%lld statfs\n", req->tid); +			seq_printf(s, "%llu statfs\n", req->tid); +		else if (op == CEPH_MSG_POOLOP) +			seq_printf(s, "%llu poolop\n", req->tid); +		else if (op == CEPH_MSG_MON_GET_VERSION) +			seq_printf(s, "%llu mon_get_version", req->tid);  		else -			seq_printf(s, "%lld unknown\n", req->tid); +			seq_printf(s, "%llu unknown\n", req->tid);  	}  	mutex_unlock(&monc->mutex); @@ -123,38 +148,28 @@ static int osdc_show(struct seq_file *s, void *pp)  	mutex_lock(&osdc->request_mutex);  	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {  		struct ceph_osd_request *req; -		struct ceph_osd_request_head *head; -		struct ceph_osd_op *op; -		int num_ops; -		int opcode, olen; -		int i; +		unsigned int i; +		int opcode;  		req = rb_entry(p, struct ceph_osd_request, r_node); -		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, +		seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,  			   req->r_osd ? 
req->r_osd->o_osd : -1, -			   le32_to_cpu(req->r_pgid.pool), -			   le16_to_cpu(req->r_pgid.ps)); +			   req->r_pgid.pool, req->r_pgid.seed); -		head = req->r_request->front.iov_base; -		op = (void *)(head + 1); - -		num_ops = le16_to_cpu(head->num_ops); -		olen = le32_to_cpu(head->object_len); -		seq_printf(s, "%.*s", olen, -			   (const char *)(head->ops + num_ops)); +		seq_printf(s, "%.*s", req->r_base_oid.name_len, +			   req->r_base_oid.name);  		if (req->r_reassert_version.epoch)  			seq_printf(s, "\t%u'%llu", -			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch), +			   (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),  			   le64_to_cpu(req->r_reassert_version.version));  		else  			seq_printf(s, "\t"); -		for (i = 0; i < num_ops; i++) { -			opcode = le16_to_cpu(op->op); +		for (i = 0; i < req->r_num_ops; i++) { +			opcode = req->r_ops[i].op;  			seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); -			op++;  		}  		seq_printf(s, "\n"); @@ -189,6 +204,9 @@ int ceph_debugfs_client_init(struct ceph_client *client)  	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,  		 client->monc.auth->global_id); +	dout("ceph_debugfs_client_init %p %s\n", client, name); + +	BUG_ON(client->debugfs_dir);  	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);  	if (!client->debugfs_dir)  		goto out; @@ -234,6 +252,7 @@ out:  void ceph_debugfs_client_cleanup(struct ceph_client *client)  { +	dout("ceph_debugfs_client_cleanup %p\n", client);  	debugfs_remove(client->debugfs_osdmap);  	debugfs_remove(client->debugfs_monmap);  	debugfs_remove(client->osdc.debugfs_file); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0e8157ee5d4..1948d592aa5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -9,14 +9,21 @@  #include <linux/slab.h>  #include <linux/socket.h>  #include <linux/string.h> +#ifdef	CONFIG_BLOCK  #include <linux/bio.h> -#include <linux/blkdev.h> +#endif	/* CONFIG_BLOCK */ +#include <linux/dns_resolver.h>  #include <net/tcp.h> +#include <linux/ceph/ceph_features.h>  #include <linux/ceph/libceph.h>  #include <linux/ceph/messenger.h>  #include <linux/ceph/decode.h>  #include <linux/ceph/pagelist.h> +#include <linux/export.h> + +#define list_entry_next(pos, member)					\ +	list_entry(pos->member.next, typeof(*pos), member)  /*   * Ceph uses the messenger to exchange ceph_msg messages with other @@ -27,6 +34,130 @@   * the sender.   */ +/* + * We track the state of the socket on a given connection using + * values defined below.  The transition to a new socket state is + * handled by a function which verifies we aren't coming from an + * unexpected state. 
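+ * Transitions other than those shown in the diagram below are
+ * unexpected: each helper WARNs and prints the stale value if the
+ * state it replaces is not one of the allowed sources.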
+ * + *      -------- + *      | NEW* |  transient initial state + *      -------- + *          | con_sock_state_init() + *          v + *      ---------- + *      | CLOSED |  initialized, but no socket (and no + *      ----------  TCP connection) + *       ^      \ + *       |       \ con_sock_state_connecting() + *       |        ---------------------- + *       |                              \ + *       + con_sock_state_closed()       \ + *       |+---------------------------    \ + *       | \                          \    \ + *       |  -----------                \    \ + *       |  | CLOSING |  socket event;  \    \ + *       |  -----------  await close     \    \ + *       |       ^                        \   | + *       |       |                         \  | + *       |       + con_sock_state_closing() \ | + *       |      / \                         | | + *       |     /   ---------------          | | + *       |    /                   \         v v + *       |   /                    -------------- + *       |  /    -----------------| CONNECTING |  socket created, TCP + *       |  |   /                 --------------  connect initiated + *       |  |   | con_sock_state_connected() + *       |  |   v + *      ------------- + *      | CONNECTED |  TCP connection established + *      ------------- + * + * State values for ceph_connection->sock_state; NEW is assumed to be 0. + */ + +#define CON_SOCK_STATE_NEW		0	/* -> CLOSED */ +#define CON_SOCK_STATE_CLOSED		1	/* -> CONNECTING */ +#define CON_SOCK_STATE_CONNECTING	2	/* -> CONNECTED or -> CLOSING */ +#define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */ +#define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */ + +/* + * connection states + */ +#define CON_STATE_CLOSED        1  /* -> PREOPEN */ +#define CON_STATE_PREOPEN       2  /* -> CONNECTING, CLOSED */ +#define CON_STATE_CONNECTING    3  /* -> NEGOTIATING, CLOSED */ +#define CON_STATE_NEGOTIATING   4  /* -> OPEN, CLOSED */ +#define CON_STATE_OPEN          5  /* -> STANDBY, CLOSED */ +#define CON_STATE_STANDBY       6  /* -> PREOPEN, CLOSED */ + +/* + * ceph_connection flag bits + */ +#define CON_FLAG_LOSSYTX           0  /* we can close channel or drop +				       * messages on errors */ +#define CON_FLAG_KEEPALIVE_PENDING 1  /* we need to send a keepalive */ +#define CON_FLAG_WRITE_PENDING	   2  /* we have data ready to send */ +#define CON_FLAG_SOCK_CLOSED	   3  /* socket state changed to closed */ +#define CON_FLAG_BACKOFF           4  /* need to retry queuing delayed work */ + +static bool con_flag_valid(unsigned long con_flag) +{ +	switch (con_flag) { +	case CON_FLAG_LOSSYTX: +	case CON_FLAG_KEEPALIVE_PENDING: +	case CON_FLAG_WRITE_PENDING: +	case CON_FLAG_SOCK_CLOSED: +	case CON_FLAG_BACKOFF: +		return true; +	default: +		return false; +	} +} + +static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +{ +	BUG_ON(!con_flag_valid(con_flag)); + +	clear_bit(con_flag, &con->flags); +} + +static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +{ +	BUG_ON(!con_flag_valid(con_flag)); + +	set_bit(con_flag, &con->flags); +} + +static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +{ +	BUG_ON(!con_flag_valid(con_flag)); + +	return test_bit(con_flag, &con->flags); +} + +static bool con_flag_test_and_clear(struct ceph_connection *con, +					unsigned long con_flag) +{ +	BUG_ON(!con_flag_valid(con_flag)); + +	return test_and_clear_bit(con_flag, &con->flags); +} + +static bool con_flag_test_and_set(struct 
ceph_connection *con, +					unsigned long con_flag) +{ +	BUG_ON(!con_flag_valid(con_flag)); + +	return test_and_set_bit(con_flag, &con->flags); +} + +/* Slab caches for frequently-allocated structures */ + +static struct kmem_cache	*ceph_msg_cache; +static struct kmem_cache	*ceph_msg_data_cache; +  /* static tag bytes (protocol control messages) */  static char tag_msg = CEPH_MSGR_TAG_MSG;  static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -36,47 +167,54 @@ static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;  static struct lock_class_key socket_class;  #endif +/* + * When skipping (ignoring) a block of input we read it into a "skip + * buffer," which is this many bytes in size. + */ +#define SKIP_BUF_SIZE	1024  static void queue_con(struct ceph_connection *con);  static void con_work(struct work_struct *); -static void ceph_fault(struct ceph_connection *con); +static void con_fault(struct ceph_connection *con);  /* - * nicely render a sockaddr as a string. + * Nicely render a sockaddr as a string.  An array of formatted + * strings is used, to approximate reentrancy.   */ -#define MAX_ADDR_STR 20 -#define MAX_ADDR_STR_LEN 60 -static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; -static DEFINE_SPINLOCK(addr_str_lock); -static int last_addr_str; +#define ADDR_STR_COUNT_LOG	5	/* log2(# address strings in array) */ +#define ADDR_STR_COUNT		(1 << ADDR_STR_COUNT_LOG) +#define ADDR_STR_COUNT_MASK	(ADDR_STR_COUNT - 1) +#define MAX_ADDR_STR_LEN	64	/* 54 is enough */ + +static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; +static atomic_t addr_str_seq = ATOMIC_INIT(0); + +static struct page *zero_page;		/* used in certain error cases */  const char *ceph_pr_addr(const struct sockaddr_storage *ss)  {  	int i;  	char *s; -	struct sockaddr_in *in4 = (void *)ss; -	struct sockaddr_in6 *in6 = (void *)ss; - -	spin_lock(&addr_str_lock); -	i = last_addr_str++; -	if (last_addr_str == MAX_ADDR_STR) -		last_addr_str = 0; -	spin_unlock(&addr_str_lock); +	struct sockaddr_in *in4 = (struct sockaddr_in *) ss; +	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; + +	i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;  	s = addr_str[i];  	switch (ss->ss_family) {  	case AF_INET: -		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, -			 (unsigned int)ntohs(in4->sin_port)); +		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, +			 ntohs(in4->sin_port));  		break;  	case AF_INET6: -		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, -			 (unsigned int)ntohs(in6->sin6_port)); +		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr, +			 ntohs(in6->sin6_port));  		break;  	default: -		sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); +		snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", +			 ss->ss_family);  	}  	return s; @@ -92,24 +230,83 @@ static void encode_my_addr(struct ceph_messenger *msgr)  /*   * work queue for all reading and writing to/from the socket.   
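 * All per-connection work funnels through con_work() on this queue:
 * the socket callbacks below simply call queue_con(con), and
 * con_work() sorts out what to do under con->mutex.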
*/ -struct workqueue_struct *ceph_msgr_wq; +static struct workqueue_struct *ceph_msgr_wq; -int ceph_msgr_init(void) +static int ceph_msgr_slab_init(void)  { -	ceph_msgr_wq = create_workqueue("ceph-msgr"); -	if (IS_ERR(ceph_msgr_wq)) { -		int ret = PTR_ERR(ceph_msgr_wq); -		pr_err("msgr_init failed to create workqueue: %d\n", ret); +	BUG_ON(ceph_msg_cache); +	ceph_msg_cache = kmem_cache_create("ceph_msg", +					sizeof (struct ceph_msg), +					__alignof__(struct ceph_msg), 0, NULL); + +	if (!ceph_msg_cache) +		return -ENOMEM; + +	BUG_ON(ceph_msg_data_cache); +	ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", +					sizeof (struct ceph_msg_data), +					__alignof__(struct ceph_msg_data), +					0, NULL); +	if (ceph_msg_data_cache) +		return 0; + +	kmem_cache_destroy(ceph_msg_cache); +	ceph_msg_cache = NULL; + +	return -ENOMEM; +} + +static void ceph_msgr_slab_exit(void) +{ +	BUG_ON(!ceph_msg_data_cache); +	kmem_cache_destroy(ceph_msg_data_cache); +	ceph_msg_data_cache = NULL; + +	BUG_ON(!ceph_msg_cache); +	kmem_cache_destroy(ceph_msg_cache); +	ceph_msg_cache = NULL; +} + +static void _ceph_msgr_exit(void) +{ +	if (ceph_msgr_wq) { +		destroy_workqueue(ceph_msgr_wq);  		ceph_msgr_wq = NULL; -		return ret;  	} -	return 0; + +	ceph_msgr_slab_exit(); + +	BUG_ON(zero_page == NULL); +	kunmap(zero_page); +	page_cache_release(zero_page); +	zero_page = NULL; +} + +int ceph_msgr_init(void) +{ +	BUG_ON(zero_page != NULL); +	zero_page = ZERO_PAGE(0); +	page_cache_get(zero_page); + +	if (ceph_msgr_slab_init()) +		return -ENOMEM; + +	ceph_msgr_wq = alloc_workqueue("ceph-msgr", 0, 0); +	if (ceph_msgr_wq) +		return 0; + +	pr_err("msgr_init failed to create workqueue\n"); +	_ceph_msgr_exit(); + +	return -ENOMEM;  }  EXPORT_SYMBOL(ceph_msgr_init);  void ceph_msgr_exit(void)  { -	destroy_workqueue(ceph_msgr_wq); +	BUG_ON(ceph_msgr_wq == NULL); + +	_ceph_msgr_exit();  }  EXPORT_SYMBOL(ceph_msgr_exit); @@ -119,70 +316,134 @@ void ceph_msgr_flush(void)  }  EXPORT_SYMBOL(ceph_msgr_flush); +/* Connection socket state transition functions */ + +static void con_sock_state_init(struct ceph_connection *con) +{ +	int old_state; + +	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); +	if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) +		printk("%s: unexpected old state %d\n", __func__, old_state); +	dout("%s con %p sock %d -> %d\n", __func__, con, old_state, +	     CON_SOCK_STATE_CLOSED); +} + +static void con_sock_state_connecting(struct ceph_connection *con) +{ +	int old_state; + +	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); +	if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) +		printk("%s: unexpected old state %d\n", __func__, old_state); +	dout("%s con %p sock %d -> %d\n", __func__, con, old_state, +	     CON_SOCK_STATE_CONNECTING); +} + +static void con_sock_state_connected(struct ceph_connection *con) +{ +	int old_state; + +	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); +	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) +		printk("%s: unexpected old state %d\n", __func__, old_state); +	dout("%s con %p sock %d -> %d\n", __func__, con, old_state, +	     CON_SOCK_STATE_CONNECTED); +} + +static void con_sock_state_closing(struct ceph_connection *con) +{ +	int old_state; + +	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); +	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && +			old_state != CON_SOCK_STATE_CONNECTED && +			old_state != CON_SOCK_STATE_CLOSING)) +		printk("%s: unexpected old state %d\n", __func__, old_state); +	dout("%s 
con %p sock %d -> %d\n", __func__, con, old_state, +	     CON_SOCK_STATE_CLOSING); +} + +static void con_sock_state_closed(struct ceph_connection *con) +{ +	int old_state; + +	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); +	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && +		    old_state != CON_SOCK_STATE_CLOSING && +		    old_state != CON_SOCK_STATE_CONNECTING && +		    old_state != CON_SOCK_STATE_CLOSED)) +		printk("%s: unexpected old state %d\n", __func__, old_state); +	dout("%s con %p sock %d -> %d\n", __func__, con, old_state, +	     CON_SOCK_STATE_CLOSED); +}  /*   * socket callback functions   */  /* data available on socket, or listen socket received a connect */ -static void ceph_data_ready(struct sock *sk, int count_unused) +static void ceph_sock_data_ready(struct sock *sk)  { -	struct ceph_connection *con = -		(struct ceph_connection *)sk->sk_user_data; +	struct ceph_connection *con = sk->sk_user_data; +	if (atomic_read(&con->msgr->stopping)) { +		return; +	} +  	if (sk->sk_state != TCP_CLOSE_WAIT) { -		dout("ceph_data_ready on %p state = %lu, queueing work\n", +		dout("%s on %p state = %lu, queueing work\n", __func__,  		     con, con->state);  		queue_con(con);  	}  }  /* socket has buffer space for writing */ -static void ceph_write_space(struct sock *sk) +static void ceph_sock_write_space(struct sock *sk)  { -	struct ceph_connection *con = -		(struct ceph_connection *)sk->sk_user_data; +	struct ceph_connection *con = sk->sk_user_data; -	/* only queue to workqueue if there is data we want to write. */ -	if (test_bit(WRITE_PENDING, &con->state)) { -		dout("ceph_write_space %p queueing write work\n", con); -		queue_con(con); +	/* only queue to workqueue if there is data we want to write, +	 * and there is sufficient space in the socket buffer to accept +	 * more data.  clear SOCK_NOSPACE so that ceph_sock_write_space() +	 * doesn't get called again until try_write() fills the socket +	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space() +	 * and net/core/stream.c:sk_stream_write_space(). 
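+	 * sk_stream_is_writeable() wraps the "enough room" check,
+	 * sk_stream_wspace(sk) >= sk_stream_min_wspace(sk).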
+	 */ +	if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { +		if (sk_stream_is_writeable(sk)) { +			dout("%s %p queueing write work\n", __func__, con); +			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +			queue_con(con); +		}  	} else { -		dout("ceph_write_space %p nothing to write\n", con); +		dout("%s %p nothing to write\n", __func__, con);  	} - -	/* since we have our own write_space, clear the SOCK_NOSPACE flag */ -	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);  }  /* socket's state has changed */ -static void ceph_state_change(struct sock *sk) +static void ceph_sock_state_change(struct sock *sk)  { -	struct ceph_connection *con = -		(struct ceph_connection *)sk->sk_user_data; +	struct ceph_connection *con = sk->sk_user_data; -	dout("ceph_state_change %p state = %lu sk_state = %u\n", +	dout("%s %p state = %lu sk_state = %u\n", __func__,  	     con, con->state, sk->sk_state); -	if (test_bit(CLOSED, &con->state)) -		return; -  	switch (sk->sk_state) {  	case TCP_CLOSE: -		dout("ceph_state_change TCP_CLOSE\n"); +		dout("%s TCP_CLOSE\n", __func__);  	case TCP_CLOSE_WAIT: -		dout("ceph_state_change TCP_CLOSE_WAIT\n"); -		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { -			if (test_bit(CONNECTING, &con->state)) -				con->error_msg = "connection failed"; -			else -				con->error_msg = "socket closed"; -			queue_con(con); -		} +		dout("%s TCP_CLOSE_WAIT\n", __func__); +		con_sock_state_closing(con); +		con_flag_set(con, CON_FLAG_SOCK_CLOSED); +		queue_con(con);  		break;  	case TCP_ESTABLISHED: -		dout("ceph_state_change TCP_ESTABLISHED\n"); +		dout("%s TCP_ESTABLISHED\n", __func__); +		con_sock_state_connected(con);  		queue_con(con);  		break; +	default:	/* Everything else is uninteresting */ +		break;  	}  } @@ -193,10 +454,10 @@ static void set_sock_callbacks(struct socket *sock,  			       struct ceph_connection *con)  {  	struct sock *sk = sock->sk; -	sk->sk_user_data = (void *)con; -	sk->sk_data_ready = ceph_data_ready; -	sk->sk_write_space = ceph_write_space; -	sk->sk_state_change = ceph_state_change; +	sk->sk_user_data = con; +	sk->sk_data_ready = ceph_sock_data_ready; +	sk->sk_write_space = ceph_sock_write_space; +	sk->sk_state_change = ceph_sock_state_change;  } @@ -207,7 +468,7 @@ static void set_sock_callbacks(struct socket *sock,  /*   * initiate connection to a remote socket.   
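 * Returns 0 on success; an -EINPROGRESS result from the nonblocking
 * connect() also counts as success, with completion reported later
 * through the socket state callback.  Any other failure releases the
 * socket and returns the error, so a caller can simply do (sketch):
 *
 *	ret = ceph_tcp_connect(con);
 *	if (ret < 0)
 *		goto out;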
*/ -static struct socket *ceph_tcp_connect(struct ceph_connection *con) +static int ceph_tcp_connect(struct ceph_connection *con)  {  	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;  	struct socket *sock; @@ -217,8 +478,7 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)  	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,  			       IPPROTO_TCP, &sock);  	if (ret) -		return ERR_PTR(ret); -	con->sock = sock; +		return ret;  	sock->sk->sk_allocation = GFP_NOFS;  #ifdef CONFIG_LOCKDEP @@ -229,33 +489,51 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)  	dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); +	con_sock_state_connecting(con);  	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),  				 O_NONBLOCK);  	if (ret == -EINPROGRESS) {  		dout("connect %s EINPROGRESS sk_state = %u\n",  		     ceph_pr_addr(&con->peer_addr.in_addr),  		     sock->sk->sk_state); -		ret = 0; -	} -	if (ret < 0) { +	} else if (ret < 0) {  		pr_err("connect %s error %d\n",  		       ceph_pr_addr(&con->peer_addr.in_addr), ret);  		sock_release(sock); -		con->sock = NULL;  		con->error_msg = "connect error"; -	} -	if (ret < 0) -		return ERR_PTR(ret); -	return sock; +		return ret; +	} +	con->sock = sock; +	return 0;  }  static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)  {  	struct kvec iov = {buf, len};  	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; +	int r; -	return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); +	r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); +	if (r == -EAGAIN) +		r = 0; +	return r; +} + +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, +		     int page_offset, size_t length) +{ +	void *kaddr; +	int ret; + +	BUG_ON(page_offset + length > PAGE_SIZE); + +	kaddr = kmap(page); +	BUG_ON(!kaddr); +	ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length); +	kunmap(page); + +	return ret;  }  /* @@ -266,31 +544,74 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,  		     size_t kvlen, size_t len, int more)  {  	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; +	int r;  	if (more)  		msg.msg_flags |= MSG_MORE;  	else  		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */ -	return kernel_sendmsg(sock, &msg, iov, kvlen, len); +	r = kernel_sendmsg(sock, &msg, iov, kvlen, len); +	if (r == -EAGAIN) +		r = 0; +	return r;  } +static int __ceph_tcp_sendpage(struct socket *sock, struct page *page, +		     int offset, size_t size, bool more) +{ +	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); +	int ret; + +	ret = kernel_sendpage(sock, page, offset, size, flags); +	if (ret == -EAGAIN) +		ret = 0; + +	return ret; +} + +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, +		     int offset, size_t size, bool more) +{ +	int ret; +	struct kvec iov; + +	/* sendpage cannot properly handle pages with page_count == 0, +	 * we need to fallback to sendmsg if that's the case */ +	if (page_count(page) >= 1) +		return __ceph_tcp_sendpage(sock, page, offset, size, more); + +	iov.iov_base = kmap(page) + offset; +	iov.iov_len = size; +	ret = ceph_tcp_sendmsg(sock, &iov, 1, size, more); +	kunmap(page); + +	return ret; +}  /*   * Shutdown/close the socket for the given connection.   
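 * Safe to call with or without an attached socket: the SOCK_CLOSED
 * flag is cleared and sock_state is forced to CLOSED either way.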
*/  static int con_close_socket(struct ceph_connection *con)  { -	int rc; +	int rc = 0;  	dout("con_close_socket on %p sock %p\n", con, con->sock); -	if (!con->sock) -		return 0; -	set_bit(SOCK_CLOSED, &con->state); -	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); -	sock_release(con->sock); -	con->sock = NULL; -	clear_bit(SOCK_CLOSED, &con->state); +	if (con->sock) { +		rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); +		sock_release(con->sock); +		con->sock = NULL; +	} + +	/* +	 * Forcibly clear the SOCK_CLOSED flag.  It gets set +	 * independent of the connection mutex, and we could have +	 * received a socket close event before we had the chance to +	 * shut the socket down. +	 */ +	con_flag_clear(con, CON_FLAG_SOCK_CLOSED); + +	con_sock_state_closed(con);  	return rc;  } @@ -301,6 +622,10 @@ static int con_close_socket(struct ceph_connection *con)  static void ceph_msg_remove(struct ceph_msg *msg)  {  	list_del_init(&msg->list_head); +	BUG_ON(msg->con == NULL); +	msg->con->ops->put(msg->con); +	msg->con = NULL; +  	ceph_msg_put(msg);  }  static void ceph_msg_remove_list(struct list_head *head) @@ -316,12 +641,16 @@ static void reset_connection(struct ceph_connection *con)  {  	/* reset connection, out_queue, msg_ and connect_seq */  	/* discard existing out_queue and msg_seq */ +	dout("reset_connection %p\n", con);  	ceph_msg_remove_list(&con->out_queue);  	ceph_msg_remove_list(&con->out_sent);  	if (con->in_msg) { +		BUG_ON(con->in_msg->con != con); +		con->in_msg->con = NULL;  		ceph_msg_put(con->in_msg);  		con->in_msg = NULL; +		con->ops->put(con);  	}  	con->connect_seq = 0; @@ -330,7 +659,6 @@ static void reset_connection(struct ceph_connection *con)  		ceph_msg_put(con->out_msg);  		con->out_msg = NULL;  	} -	con->out_keepalive_pending = false;  	con->in_seq = 0;  	con->in_seq_acked = 0;  } @@ -340,32 +668,43 @@ static void reset_connection(struct ceph_connection *con)   */  void ceph_con_close(struct ceph_connection *con)  { +	mutex_lock(&con->mutex);  	dout("con_close %p peer %s\n", con,  	     ceph_pr_addr(&con->peer_addr.in_addr)); -	set_bit(CLOSED, &con->state);  /* in case there's queued work */ -	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */ -	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */ -	clear_bit(KEEPALIVE_PENDING, &con->state); -	clear_bit(WRITE_PENDING, &con->state); -	mutex_lock(&con->mutex); +	con->state = CON_STATE_CLOSED; + +	con_flag_clear(con, CON_FLAG_LOSSYTX);	/* so we retry next connect */ +	con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); +	con_flag_clear(con, CON_FLAG_WRITE_PENDING); +	con_flag_clear(con, CON_FLAG_BACKOFF); +  	reset_connection(con);  	con->peer_global_seq = 0;  	cancel_delayed_work(&con->work); +	con_close_socket(con);  	mutex_unlock(&con->mutex); -	queue_con(con);  }  EXPORT_SYMBOL(ceph_con_close);  /*   * Reopen a closed connection, with a new peer address.   
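 * Minimal caller sketch (identifiers illustrative, modeled on the
 * OSD client):
 *
 *	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &client->msgr);
 *	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
 *		      &osdmap->osd_addr[osd->o_osd]);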
*/ -void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) +void ceph_con_open(struct ceph_connection *con, +		   __u8 entity_type, __u64 entity_num, +		   struct ceph_entity_addr *addr)  { +	mutex_lock(&con->mutex);  	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); -	set_bit(OPENING, &con->state); -	clear_bit(CLOSED, &con->state); + +	WARN_ON(con->state != CON_STATE_CLOSED); +	con->state = CON_STATE_PREOPEN; + +	con->peer_name.type = (__u8) entity_type; +	con->peer_name.num = cpu_to_le64(entity_num); +  	memcpy(&con->peer_addr, addr, sizeof(*addr));  	con->delay = 0;      /* reset backoff memory */ +	mutex_unlock(&con->mutex);  	queue_con(con);  }  EXPORT_SYMBOL(ceph_con_open); @@ -379,41 +718,26 @@ bool ceph_con_opened(struct ceph_connection *con)  }  /* - * generic get/put - */ -struct ceph_connection *ceph_con_get(struct ceph_connection *con) -{ -	dout("con_get %p nref = %d -> %d\n", con, -	     atomic_read(&con->nref), atomic_read(&con->nref) + 1); -	if (atomic_inc_not_zero(&con->nref)) -		return con; -	return NULL; -} - -void ceph_con_put(struct ceph_connection *con) -{ -	dout("con_put %p nref = %d -> %d\n", con, -	     atomic_read(&con->nref), atomic_read(&con->nref) - 1); -	BUG_ON(atomic_read(&con->nref) == 0); -	if (atomic_dec_and_test(&con->nref)) { -		BUG_ON(con->sock); -		kfree(con); -	} -} - -/*   * initialize a new connection.   */ -void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) +void ceph_con_init(struct ceph_connection *con, void *private, +	const struct ceph_connection_operations *ops, +	struct ceph_messenger *msgr)  {  	dout("con_init %p\n", con);  	memset(con, 0, sizeof(*con)); -	atomic_set(&con->nref, 1); +	con->private = private; +	con->ops = ops;  	con->msgr = msgr; + +	con_sock_state_init(con); +  	mutex_init(&con->mutex);  	INIT_LIST_HEAD(&con->out_queue);  	INIT_LIST_HEAD(&con->out_sent);  	INIT_DELAYED_WORK(&con->work, con_work); + +	con->state = CON_STATE_CLOSED;  }  EXPORT_SYMBOL(ceph_con_init); @@ -434,14 +758,432 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)  	return ret;  } +static void con_out_kvec_reset(struct ceph_connection *con) +{ +	con->out_kvec_left = 0; +	con->out_kvec_bytes = 0; +	con->out_kvec_cur = &con->out_kvec[0]; +} + +static void con_out_kvec_add(struct ceph_connection *con, +				size_t size, void *data) +{ +	int index; + +	index = con->out_kvec_left; +	BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + +	con->out_kvec[index].iov_len = size; +	con->out_kvec[index].iov_base = data; +	con->out_kvec_left++; +	con->out_kvec_bytes += size; +} + +#ifdef CONFIG_BLOCK + +/* + * For a bio data item, a piece is whatever remains of the next + * entry in the current bio iovec, or the first entry in the next + * bio in the list. 
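+ * A piece never spans bios: once the current bvec_iter is exhausted,
+ * the advance routine steps to bi_next in the chain.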
+ */ +static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, +					size_t length) +{ +	struct ceph_msg_data *data = cursor->data; +	struct bio *bio; + +	BUG_ON(data->type != CEPH_MSG_DATA_BIO); + +	bio = data->bio; +	BUG_ON(!bio); + +	cursor->resid = min(length, data->bio_length); +	cursor->bio = bio; +	cursor->bvec_iter = bio->bi_iter; +	cursor->last_piece = +		cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); +} + +static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, +						size_t *page_offset, +						size_t *length) +{ +	struct ceph_msg_data *data = cursor->data; +	struct bio *bio; +	struct bio_vec bio_vec; + +	BUG_ON(data->type != CEPH_MSG_DATA_BIO); + +	bio = cursor->bio; +	BUG_ON(!bio); + +	bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); + +	*page_offset = (size_t) bio_vec.bv_offset; +	BUG_ON(*page_offset >= PAGE_SIZE); +	if (cursor->last_piece) /* pagelist offset is always 0 */ +		*length = cursor->resid; +	else +		*length = (size_t) bio_vec.bv_len; +	BUG_ON(*length > cursor->resid); +	BUG_ON(*page_offset + *length > PAGE_SIZE); + +	return bio_vec.bv_page; +} + +static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, +					size_t bytes) +{ +	struct bio *bio; +	struct bio_vec bio_vec; + +	BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); + +	bio = cursor->bio; +	BUG_ON(!bio); + +	bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); + +	/* Advance the cursor offset */ + +	BUG_ON(cursor->resid < bytes); +	cursor->resid -= bytes; + +	bio_advance_iter(bio, &cursor->bvec_iter, bytes); + +	if (bytes < bio_vec.bv_len) +		return false;	/* more bytes to process in this segment */ + +	/* Move on to the next segment, and possibly the next bio */ + +	if (!cursor->bvec_iter.bi_size) { +		bio = bio->bi_next; +		cursor->bio = bio; +		if (bio) +			cursor->bvec_iter = bio->bi_iter; +		else +			memset(&cursor->bvec_iter, 0, +			       sizeof(cursor->bvec_iter)); +	} + +	if (!cursor->last_piece) { +		BUG_ON(!cursor->resid); +		BUG_ON(!bio); +		/* A short read is OK, so use <= rather than == */ +		if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) +			cursor->last_piece = true; +	} + +	return true; +} +#endif /* CONFIG_BLOCK */ + +/* + * For a page array, a piece comes from the first page in the array + * that has not already been fully consumed. 
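+ *
+ * For example (assuming 4K pages), alignment 0x1800 with length
+ * 0x3000 gives a first piece of 0x800 bytes at offset 0x800 of
+ * pages[0], two full-page pieces, and a final 0x800-byte piece
+ * in pages[3].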
+ */ +static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, +					size_t length) +{ +	struct ceph_msg_data *data = cursor->data; +	int page_count; + +	BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + +	BUG_ON(!data->pages); +	BUG_ON(!data->length); + +	cursor->resid = min(length, data->length); +	page_count = calc_pages_for(data->alignment, (u64)data->length); +	cursor->page_offset = data->alignment & ~PAGE_MASK; +	cursor->page_index = 0; +	BUG_ON(page_count > (int)USHRT_MAX); +	cursor->page_count = (unsigned short)page_count; +	BUG_ON(length > SIZE_MAX - cursor->page_offset); +	cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; +} + +static struct page * +ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, +					size_t *page_offset, size_t *length) +{ +	struct ceph_msg_data *data = cursor->data; + +	BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + +	BUG_ON(cursor->page_index >= cursor->page_count); +	BUG_ON(cursor->page_offset >= PAGE_SIZE); + +	*page_offset = cursor->page_offset; +	if (cursor->last_piece) +		*length = cursor->resid; +	else +		*length = PAGE_SIZE - *page_offset; + +	return data->pages[cursor->page_index]; +} + +static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, +						size_t bytes) +{ +	BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); + +	BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); + +	/* Advance the cursor page offset */ + +	cursor->resid -= bytes; +	cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; +	if (!bytes || cursor->page_offset) +		return false;	/* more bytes to process in the current page */ + +	if (!cursor->resid) +		return false;   /* no more data */ + +	/* Move on to the next page; offset is already at 0 */ + +	BUG_ON(cursor->page_index >= cursor->page_count); +	cursor->page_index++; +	cursor->last_piece = cursor->resid <= PAGE_SIZE; + +	return true; +} + +/* + * For a pagelist, a piece is whatever remains to be consumed in the + * first page in the list, or the front of the next page. 
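+ * The first page of a pagelist always starts at offset 0, so the
+ * in-page offset is simply cursor->offset masked to the page size.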
+ */ +static void +ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, +					size_t length) +{ +	struct ceph_msg_data *data = cursor->data; +	struct ceph_pagelist *pagelist; +	struct page *page; + +	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + +	pagelist = data->pagelist; +	BUG_ON(!pagelist); + +	if (!length) +		return;		/* pagelist can be assigned but empty */ + +	BUG_ON(list_empty(&pagelist->head)); +	page = list_first_entry(&pagelist->head, struct page, lru); + +	cursor->resid = min(length, pagelist->length); +	cursor->page = page; +	cursor->offset = 0; +	cursor->last_piece = cursor->resid <= PAGE_SIZE; +} + +static struct page * +ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, +				size_t *page_offset, size_t *length) +{ +	struct ceph_msg_data *data = cursor->data; +	struct ceph_pagelist *pagelist; + +	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + +	pagelist = data->pagelist; +	BUG_ON(!pagelist); + +	BUG_ON(!cursor->page); +	BUG_ON(cursor->offset + cursor->resid != pagelist->length); + +	/* offset of first page in pagelist is always 0 */ +	*page_offset = cursor->offset & ~PAGE_MASK; +	if (cursor->last_piece) +		*length = cursor->resid; +	else +		*length = PAGE_SIZE - *page_offset; + +	return cursor->page; +} + +static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, +						size_t bytes) +{ +	struct ceph_msg_data *data = cursor->data; +	struct ceph_pagelist *pagelist; + +	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + +	pagelist = data->pagelist; +	BUG_ON(!pagelist); + +	BUG_ON(cursor->offset + cursor->resid != pagelist->length); +	BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); + +	/* Advance the cursor offset */ + +	cursor->resid -= bytes; +	cursor->offset += bytes; +	/* offset of first page in pagelist is always 0 */ +	if (!bytes || cursor->offset & ~PAGE_MASK) +		return false;	/* more bytes to process in the current page */ + +	if (!cursor->resid) +		return false;   /* no more data */ + +	/* Move on to the next page */ + +	BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); +	cursor->page = list_entry_next(cursor->page, lru); +	cursor->last_piece = cursor->resid <= PAGE_SIZE; + +	return true; +} + +/* + * Message data is handled (sent or received) in pieces, where each + * piece resides on a single page.  The network layer might not + * consume an entire piece at once.  A data item's cursor keeps + * track of which piece is next to process and how much remains to + * be processed in that piece.  It also tracks whether the current + * piece is the last one in the data item. 
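+ *
+ * In sketch form, a consumer loop over a cursor (the send path
+ * below is the real instance) looks like:
+ *
+ *	while (cursor->resid) {
+ *		page = ceph_msg_data_next(cursor, &off, &len, &last);
+ *		bytes = <send or receive up to len bytes at off>;
+ *		ceph_msg_data_advance(cursor, bytes);
+ *	}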
+ */ +static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) +{ +	size_t length = cursor->total_resid; + +	switch (cursor->data->type) { +	case CEPH_MSG_DATA_PAGELIST: +		ceph_msg_data_pagelist_cursor_init(cursor, length); +		break; +	case CEPH_MSG_DATA_PAGES: +		ceph_msg_data_pages_cursor_init(cursor, length); +		break; +#ifdef CONFIG_BLOCK +	case CEPH_MSG_DATA_BIO: +		ceph_msg_data_bio_cursor_init(cursor, length); +		break; +#endif /* CONFIG_BLOCK */ +	case CEPH_MSG_DATA_NONE: +	default: +		/* BUG(); */ +		break; +	} +	cursor->need_crc = true; +} + +static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +{ +	struct ceph_msg_data_cursor *cursor = &msg->cursor; +	struct ceph_msg_data *data; + +	BUG_ON(!length); +	BUG_ON(length > msg->data_length); +	BUG_ON(list_empty(&msg->data)); + +	cursor->data_head = &msg->data; +	cursor->total_resid = length; +	data = list_first_entry(&msg->data, struct ceph_msg_data, links); +	cursor->data = data; + +	__ceph_msg_data_cursor_init(cursor); +} + +/* + * Return the page containing the next piece to process for a given + * data item, and supply the page offset and length of that piece. + * Indicate whether this is the last piece in this data item. + */ +static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, +					size_t *page_offset, size_t *length, +					bool *last_piece) +{ +	struct page *page; + +	switch (cursor->data->type) { +	case CEPH_MSG_DATA_PAGELIST: +		page = ceph_msg_data_pagelist_next(cursor, page_offset, length); +		break; +	case CEPH_MSG_DATA_PAGES: +		page = ceph_msg_data_pages_next(cursor, page_offset, length); +		break; +#ifdef CONFIG_BLOCK +	case CEPH_MSG_DATA_BIO: +		page = ceph_msg_data_bio_next(cursor, page_offset, length); +		break; +#endif /* CONFIG_BLOCK */ +	case CEPH_MSG_DATA_NONE: +	default: +		page = NULL; +		break; +	} +	BUG_ON(!page); +	BUG_ON(*page_offset + *length > PAGE_SIZE); +	BUG_ON(!*length); +	if (last_piece) +		*last_piece = cursor->last_piece; + +	return page; +} + +/* + * Returns true if the result moves the cursor on to the next piece + * of the data item. + */ +static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, +				size_t bytes) +{ +	bool new_piece; + +	BUG_ON(bytes > cursor->resid); +	switch (cursor->data->type) { +	case CEPH_MSG_DATA_PAGELIST: +		new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); +		break; +	case CEPH_MSG_DATA_PAGES: +		new_piece = ceph_msg_data_pages_advance(cursor, bytes); +		break; +#ifdef CONFIG_BLOCK +	case CEPH_MSG_DATA_BIO: +		new_piece = ceph_msg_data_bio_advance(cursor, bytes); +		break; +#endif /* CONFIG_BLOCK */ +	case CEPH_MSG_DATA_NONE: +	default: +		BUG(); +		break; +	} +	cursor->total_resid -= bytes; + +	if (!cursor->resid && cursor->total_resid) { +		WARN_ON(!cursor->last_piece); +		BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); +		cursor->data = list_entry_next(cursor->data, links); +		__ceph_msg_data_cursor_init(cursor); +		new_piece = true; +	} +	cursor->need_crc = new_piece; + +	return new_piece; +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ +	BUG_ON(!msg); +	BUG_ON(!data_len); + +	/* Initialize data cursor */ + +	ceph_msg_data_cursor_init(msg, (size_t)data_len); +}  /*   * Prepare footer for currently outgoing message, and finish things   * off.  Assumes out_kvec* are already valid.. we just add on to the end.   
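 * The footer goes out as one more kvec entry, and the message is
 * flagged CEPH_MSG_FOOTER_COMPLETE at that point.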
*/ -static void prepare_write_message_footer(struct ceph_connection *con, int v) +static void prepare_write_message_footer(struct ceph_connection *con)  {  	struct ceph_msg *m = con->out_msg; +	int v = con->out_kvec_left; + +	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;  	dout("prepare_write_message_footer %p\n", con);  	con->out_kvec_is_msg = true; @@ -459,9 +1201,9 @@ static void prepare_write_message_footer(struct ceph_connection *con, int v)  static void prepare_write_message(struct ceph_connection *con)  {  	struct ceph_msg *m; -	int v = 0; +	u32 crc; -	con->out_kvec_bytes = 0; +	con_out_kvec_reset(con);  	con->out_kvec_is_msg = true;  	con->out_msg_done = false; @@ -469,24 +1211,20 @@ static void prepare_write_message(struct ceph_connection *con)  	 * TCP packet that's a good thing. */  	if (con->in_seq > con->in_seq_acked) {  		con->in_seq_acked = con->in_seq; -		con->out_kvec[v].iov_base = &tag_ack; -		con->out_kvec[v++].iov_len = 1; +		con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);  		con->out_temp_ack = cpu_to_le64(con->in_seq_acked); -		con->out_kvec[v].iov_base = &con->out_temp_ack; -		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); -		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); +		con_out_kvec_add(con, sizeof (con->out_temp_ack), +			&con->out_temp_ack);  	} -	m = list_first_entry(&con->out_queue, -		       struct ceph_msg, list_head); +	BUG_ON(list_empty(&con->out_queue)); +	m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);  	con->out_msg = m; -	if (test_bit(LOSSYTX, &con->state)) { -		list_del_init(&m->list_head); -	} else { -		/* put message on sent list */ -		ceph_msg_get(m); -		list_move_tail(&m->list_head, &con->out_sent); -	} +	BUG_ON(m->con != con); + +	/* put message on sent list */ +	ceph_msg_get(m); +	list_move_tail(&m->list_head, &con->out_sent);  	/*  	 * only assign outgoing seq # if we haven't sent this message @@ -496,63 +1234,51 @@ static void prepare_write_message(struct ceph_connection *con)  		m->hdr.seq = cpu_to_le64(++con->out_seq);  		m->needs_out_seq = false;  	} +	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); -	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", +	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",  	     m, con->out_seq, le16_to_cpu(m->hdr.type),  	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), -	     le32_to_cpu(m->hdr.data_len), -	     m->nr_pages); +	     m->data_length);  	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);  	/* tag + hdr + front + middle */ -	con->out_kvec[v].iov_base = &tag_msg; -	con->out_kvec[v++].iov_len = 1; -	con->out_kvec[v].iov_base = &m->hdr; -	con->out_kvec[v++].iov_len = sizeof(m->hdr); -	con->out_kvec[v++] = m->front; +	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); +	con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); +	con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); +  	if (m->middle) -		con->out_kvec[v++] = m->middle->vec; -	con->out_kvec_left = v; -	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + -		(m->middle ? 
m->middle->vec.iov_len : 0); -	con->out_kvec_cur = con->out_kvec; +		con_out_kvec_add(con, m->middle->vec.iov_len, +			m->middle->vec.iov_base);  	/* fill in crc (except data pages), footer */ -	con->out_msg->hdr.crc = -		cpu_to_le32(crc32c(0, (void *)&m->hdr, -				      sizeof(m->hdr) - sizeof(m->hdr.crc))); -	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; -	con->out_msg->footer.front_crc = -		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); -	if (m->middle) -		con->out_msg->footer.middle_crc = -			cpu_to_le32(crc32c(0, m->middle->vec.iov_base, -					   m->middle->vec.iov_len)); -	else +	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); +	con->out_msg->hdr.crc = cpu_to_le32(crc); +	con->out_msg->footer.flags = 0; + +	crc = crc32c(0, m->front.iov_base, m->front.iov_len); +	con->out_msg->footer.front_crc = cpu_to_le32(crc); +	if (m->middle) { +		crc = crc32c(0, m->middle->vec.iov_base, +				m->middle->vec.iov_len); +		con->out_msg->footer.middle_crc = cpu_to_le32(crc); +	} else  		con->out_msg->footer.middle_crc = 0; -	con->out_msg->footer.data_crc = 0; -	dout("prepare_write_message front_crc %u data_crc %u\n", +	dout("%s front_crc %u middle_crc %u\n", __func__,  	     le32_to_cpu(con->out_msg->footer.front_crc),  	     le32_to_cpu(con->out_msg->footer.middle_crc));  	/* is there a data payload? */ -	if (le32_to_cpu(m->hdr.data_len) > 0) { -		/* initialize page iterator */ -		con->out_msg_pos.page = 0; -		if (m->pages) -			con->out_msg_pos.page_pos = -				le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; -		else -			con->out_msg_pos.page_pos = 0; -		con->out_msg_pos.data_pos = 0; -		con->out_msg_pos.did_page_crc = 0; +	con->out_msg->footer.data_crc = 0; +	if (m->data_length) { +		prepare_message_data(con->out_msg, m->data_length);  		con->out_more = 1;  /* data + footer will follow */  	} else {  		/* no, queue up footer too and be done */ -		prepare_write_message_footer(con, v); +		prepare_write_message_footer(con);  	} -	set_bit(WRITE_PENDING, &con->state); +	con_flag_set(con, CON_FLAG_WRITE_PENDING);  }  /* @@ -564,16 +1290,34 @@ static void prepare_write_ack(struct ceph_connection *con)  	     con->in_seq_acked, con->in_seq);  	con->in_seq_acked = con->in_seq; -	con->out_kvec[0].iov_base = &tag_ack; -	con->out_kvec[0].iov_len = 1; +	con_out_kvec_reset(con); + +	con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); +  	con->out_temp_ack = cpu_to_le64(con->in_seq_acked); -	con->out_kvec[1].iov_base = &con->out_temp_ack; -	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); -	con->out_kvec_left = 2; -	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); -	con->out_kvec_cur = con->out_kvec; +	con_out_kvec_add(con, sizeof (con->out_temp_ack), +				&con->out_temp_ack); +  	con->out_more = 1;  /* more will follow.. eventually.. 
*/ -	set_bit(WRITE_PENDING, &con->state); +	con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ +	dout("prepare_write_seq %p %llu -> %llu\n", con, +	     con->in_seq_acked, con->in_seq); +	con->in_seq_acked = con->in_seq; + +	con_out_kvec_reset(con); + +	con->out_temp_ack = cpu_to_le64(con->in_seq_acked); +	con_out_kvec_add(con, sizeof (con->out_temp_ack), +			 &con->out_temp_ack); + +	con_flag_set(con, CON_FLAG_WRITE_PENDING);  }  /* @@ -582,66 +1326,60 @@ static void prepare_write_ack(struct ceph_connection *con)  static void prepare_write_keepalive(struct ceph_connection *con)  {  	dout("prepare_write_keepalive %p\n", con); -	con->out_kvec[0].iov_base = &tag_keepalive; -	con->out_kvec[0].iov_len = 1; -	con->out_kvec_left = 1; -	con->out_kvec_bytes = 1; -	con->out_kvec_cur = con->out_kvec; -	set_bit(WRITE_PENDING, &con->state); +	con_out_kvec_reset(con); +	con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); +	con_flag_set(con, CON_FLAG_WRITE_PENDING);  }  /*   * Connection negotiation.   */ -static void prepare_connect_authorizer(struct ceph_connection *con) +static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, +						int *auth_proto)  { -	void *auth_buf; -	int auth_len = 0; -	int auth_protocol = 0; +	struct ceph_auth_handshake *auth; +	if (!con->ops->get_authorizer) { +		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; +		con->out_connect.authorizer_len = 0; +		return NULL; +	} + +	/* Can't hold the mutex while getting authorizer */  	mutex_unlock(&con->mutex); -	if (con->ops->get_authorizer) -		con->ops->get_authorizer(con, &auth_buf, &auth_len, -					 &auth_protocol, &con->auth_reply_buf, -					 &con->auth_reply_buf_len, -					 con->auth_retry); +	auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);  	mutex_lock(&con->mutex); -	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); -	con->out_connect.authorizer_len = cpu_to_le32(auth_len); +	if (IS_ERR(auth)) +		return auth; +	if (con->state != CON_STATE_NEGOTIATING) +		return ERR_PTR(-EAGAIN); -	con->out_kvec[con->out_kvec_left].iov_base = auth_buf; -	con->out_kvec[con->out_kvec_left].iov_len = auth_len; -	con->out_kvec_left++; -	con->out_kvec_bytes += auth_len; +	con->auth_reply_buf = auth->authorizer_reply_buf; +	con->auth_reply_buf_len = auth->authorizer_reply_buf_len; +	return auth;  }  /*   * We connected to a peer and are saying hello.   
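 * The hello is the CEPH_BANNER string followed by our own encoded
 * entity address (msgr->my_enc_addr), queued as two kvec entries.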
*/ -static void prepare_write_banner(struct ceph_messenger *msgr, -				 struct ceph_connection *con) -{ -	int len = strlen(CEPH_BANNER); - -	con->out_kvec[0].iov_base = CEPH_BANNER; -	con->out_kvec[0].iov_len = len; -	con->out_kvec[1].iov_base = &msgr->my_enc_addr; -	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); -	con->out_kvec_left = 2; -	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); -	con->out_kvec_cur = con->out_kvec; +static void prepare_write_banner(struct ceph_connection *con) +{ +	con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); +	con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), +					&con->msgr->my_enc_addr); +  	con->out_more = 0; -	set_bit(WRITE_PENDING, &con->state); +	con_flag_set(con, CON_FLAG_WRITE_PENDING);  } -static void prepare_write_connect(struct ceph_messenger *msgr, -				  struct ceph_connection *con, -				  int after_banner) +static int prepare_write_connect(struct ceph_connection *con)  { -	unsigned global_seq = get_global_seq(con->msgr, 0); +	unsigned int global_seq = get_global_seq(con->msgr, 0);  	int proto; +	int auth_proto; +	struct ceph_auth_handshake *auth;  	switch (con->peer_name.type) {  	case CEPH_ENTITY_TYPE_MON: @@ -660,29 +1398,34 @@ static void prepare_write_connect(struct ceph_messenger *msgr,  	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,  	     con->connect_seq, global_seq, proto); -	con->out_connect.features = cpu_to_le64(msgr->supported_features); +	con->out_connect.features = cpu_to_le64(con->msgr->supported_features);  	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);  	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);  	con->out_connect.global_seq = cpu_to_le32(global_seq);  	con->out_connect.protocol_version = cpu_to_le32(proto);  	con->out_connect.flags = 0; -	if (!after_banner) { -		con->out_kvec_left = 0; -		con->out_kvec_bytes = 0; -	} -	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; -	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); -	con->out_kvec_left++; -	con->out_kvec_bytes += sizeof(con->out_connect); -	con->out_kvec_cur = con->out_kvec; +	auth_proto = CEPH_AUTH_UNKNOWN; +	auth = get_connect_authorizer(con, &auth_proto); +	if (IS_ERR(auth)) +		return PTR_ERR(auth); + +	con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); +	con->out_connect.authorizer_len = auth ? +		cpu_to_le32(auth->authorizer_buf_len) : 0; + +	con_out_kvec_add(con, sizeof (con->out_connect), +					&con->out_connect); +	if (auth && auth->authorizer_buf_len) +		con_out_kvec_add(con, auth->authorizer_buf_len, +					auth->authorizer_buf); +  	con->out_more = 0; -	set_bit(WRITE_PENDING, &con->state); +	con_flag_set(con, CON_FLAG_WRITE_PENDING); -	prepare_connect_authorizer(con); +	return 0;  } -  /*   * write as much of pending kvecs to the socket as we can.   
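 * Return values: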
*  1 -> done @@ -703,17 +1446,18 @@ static int write_partial_kvec(struct ceph_connection *con)  		con->out_kvec_bytes -= ret;  		if (con->out_kvec_bytes == 0)  			break;            /* done */ -		while (ret > 0) { -			if (ret >= con->out_kvec_cur->iov_len) { -				ret -= con->out_kvec_cur->iov_len; -				con->out_kvec_cur++; -				con->out_kvec_left--; -			} else { -				con->out_kvec_cur->iov_len -= ret; -				con->out_kvec_cur->iov_base += ret; -				ret = 0; -				break; -			} + +		/* account for full iov entries consumed */ +		while (ret >= con->out_kvec_cur->iov_len) { +			BUG_ON(!con->out_kvec_left); +			ret -= con->out_kvec_cur->iov_len; +			con->out_kvec_cur++; +			con->out_kvec_left--; +		} +		/* and for a partially-consumed entry */ +		if (ret) { +			con->out_kvec_cur->iov_len -= ret; +			con->out_kvec_cur->iov_base += ret;  		}  	}  	con->out_kvec_left = 0; @@ -725,31 +1469,19 @@ out:  	return ret;  /* done! */  } -#ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) -{ -	if (!bio) { -		*iter = NULL; -		*seg = 0; -		return; -	} -	*iter = bio; -	*seg = bio->bi_idx; -} - -static void iter_bio_next(struct bio **bio_iter, int *seg) +static u32 ceph_crc32c_page(u32 crc, struct page *page, +				unsigned int page_offset, +				unsigned int length)  { -	if (*bio_iter == NULL) -		return; +	char *kaddr; -	BUG_ON(*seg >= (*bio_iter)->bi_vcnt); +	kaddr = kmap(page); +	BUG_ON(kaddr == NULL); +	crc = crc32c(crc, kaddr + page_offset, length); +	kunmap(page); -	(*seg)++; -	if (*seg == (*bio_iter)->bi_vcnt) -		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); +	return crc;  } -#endif -  /*   * Write as much message data payload as we can.  If we finish, queue   * up the footer. @@ -757,133 +1489,61 @@ static void iter_bio_next(struct bio **bio_iter, int *seg)   *  0 -> socket full, but more to do   * <0 -> error   */ -static int write_partial_msg_pages(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con)  {  	struct ceph_msg *msg = con->out_msg; -	unsigned data_len = le32_to_cpu(msg->hdr.data_len); -	size_t len; -	int crc = con->msgr->nocrc; -	int ret; -	int total_max_write; -	int in_trail = 0; -	size_t trail_len = (msg->trail ? msg->trail->length : 0); +	struct ceph_msg_data_cursor *cursor = &msg->cursor; +	bool do_datacrc = !con->msgr->nocrc; +	u32 crc; -	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", -	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, -	     con->out_msg_pos.page_pos); +	dout("%s %p msg %p\n", __func__, con, msg); -#ifdef CONFIG_BLOCK -	if (msg->bio && !msg->bio_iter) -		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif +	if (list_empty(&msg->data)) +		return -EINVAL; -	while (data_len > con->out_msg_pos.data_pos) { -		struct page *page = NULL; -		void *kaddr = NULL; -		int max_write = PAGE_SIZE; -		int page_shift = 0; - -		total_max_write = data_len - trail_len - -			con->out_msg_pos.data_pos; - -		/* -		 * if we are calculating the data crc (the default), we need -		 * to map the page.  if our pages[] has been revoked, use the -		 * zero page. -		 */ - -		/* have we reached the trail part of the data? 
*/ -		if (con->out_msg_pos.data_pos >= data_len - trail_len) { -			in_trail = 1; - -			total_max_write = data_len - con->out_msg_pos.data_pos; - -			page = list_first_entry(&msg->trail->head, -						struct page, lru); -			if (crc) -				kaddr = kmap(page); -			max_write = PAGE_SIZE; -		} else if (msg->pages) { -			page = msg->pages[con->out_msg_pos.page]; -			if (crc) -				kaddr = kmap(page); -		} else if (msg->pagelist) { -			page = list_first_entry(&msg->pagelist->head, -						struct page, lru); -			if (crc) -				kaddr = kmap(page); -#ifdef CONFIG_BLOCK -		} else if (msg->bio) { -			struct bio_vec *bv; - -			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); -			page = bv->bv_page; -			page_shift = bv->bv_offset; -			if (crc) -				kaddr = kmap(page) + page_shift; -			max_write = bv->bv_len; -#endif -		} else { -			page = con->msgr->zero_page; -			if (crc) -				kaddr = page_address(con->msgr->zero_page); -		} -		len = min_t(int, max_write - con->out_msg_pos.page_pos, -			    total_max_write); - -		if (crc && !con->out_msg_pos.did_page_crc) { -			void *base = kaddr + con->out_msg_pos.page_pos; -			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); - -			BUG_ON(kaddr == NULL); -			con->out_msg->footer.data_crc = -				cpu_to_le32(crc32c(tmpcrc, base, len)); -			con->out_msg_pos.did_page_crc = 1; -		} -		ret = kernel_sendpage(con->sock, page, -				      con->out_msg_pos.page_pos + page_shift, -				      len, -				      MSG_DONTWAIT | MSG_NOSIGNAL | -				      MSG_MORE); - -		if (crc && -		    (msg->pages || msg->pagelist || msg->bio || in_trail)) -			kunmap(page); - -		if (ret <= 0) -			goto out; +	/* +	 * Iterate through each page that contains data to be +	 * written, and send as much as possible for each. +	 * +	 * If we are calculating the data crc (the default), we will +	 * need to map the page.  If we have no pages, they have +	 * been revoked, so use the zero page. +	 */ +	crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; +	while (cursor->resid) { +		struct page *page; +		size_t page_offset; +		size_t length; +		bool last_piece; +		bool need_crc; +		int ret; + +		page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, +							&last_piece); +		ret = ceph_tcp_sendpage(con->sock, page, page_offset, +				      length, last_piece); +		if (ret <= 0) { +			if (do_datacrc) +				msg->footer.data_crc = cpu_to_le32(crc); -		con->out_msg_pos.data_pos += ret; -		con->out_msg_pos.page_pos += ret; -		if (ret == len) { -			con->out_msg_pos.page_pos = 0; -			con->out_msg_pos.page++; -			con->out_msg_pos.did_page_crc = 0; -			if (in_trail) -				list_move_tail(&page->lru, -					       &msg->trail->head); -			else if (msg->pagelist) -				list_move_tail(&page->lru, -					       &msg->pagelist->head); -#ifdef CONFIG_BLOCK -			else if (msg->bio) -				iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif +			return ret;  		} +		if (do_datacrc && cursor->need_crc) +			crc = ceph_crc32c_page(crc, page, page_offset, length); +		need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret);  	} -	dout("write_partial_msg_pages %p msg %p done\n", con, msg); +	dout("%s %p msg %p done\n", __func__, con, msg);  	/* prepare and queue up footer, too */ -	if (!crc) -		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; -	con->out_kvec_bytes = 0; -	con->out_kvec_left = 0; -	con->out_kvec_cur = con->out_kvec; -	prepare_write_message_footer(con, 0); -	ret = 1; -out: -	return ret; +	if (do_datacrc) +		msg->footer.data_crc = cpu_to_le32(crc); +	else +		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; +	con_out_kvec_reset(con); +	prepare_write_message_footer(con); + +	return 1;	/* must return > 0 to indicate success */  }  /* @@ -894,12 +1554,9 @@ static int write_partial_skip(struct ceph_connection *con)  	int ret;  	while (con->out_skip > 0) { -		struct kvec iov = { -			.iov_base = page_address(con->msgr->zero_page), -			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) -		}; +		size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); -		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); +		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);  		if (ret <= 0)  			goto out;  		con->out_skip -= ret; @@ -930,6 +1587,13 @@ static void prepare_read_ack(struct ceph_connection *con)  	con->in_base_pos = 0;  } +static void prepare_read_seq(struct ceph_connection *con) +{ +	dout("prepare_read_seq %p\n", con); +	con->in_base_pos = 0; +	con->in_tag = CEPH_MSGR_TAG_SEQ; +} +  static void prepare_read_tag(struct ceph_connection *con)  {  	dout("prepare_read_tag %p\n", con); @@ -951,11 +1615,10 @@ static int prepare_read_message(struct ceph_connection *con)  static int read_partial(struct ceph_connection *con, -			int *to, int size, void *object) +			int end, int size, void *object)  { -	*to += size; -	while (con->in_base_pos < *to) { -		int left = *to - con->in_base_pos; +	while (con->in_base_pos < end) { +		int left = end - con->in_base_pos;  		int have = size - left;  		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);  		if (ret <= 0) @@ -971,37 +1634,52 @@ static int read_partial(struct ceph_connection *con,   */  static int read_partial_banner(struct ceph_connection *con)  { -	int ret, to = 0; +	int size; +	int end; +	int ret;  	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);  	/* peer's banner */ -	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); +	size = strlen(CEPH_BANNER); +	end = size; +	ret = read_partial(con, end, size, con->in_banner);  	if (ret 
<= 0)  		goto out; -	ret = read_partial(con, &to, sizeof(con->actual_peer_addr), -			   &con->actual_peer_addr); + +	size = sizeof (con->actual_peer_addr); +	end += size; +	ret = read_partial(con, end, size, &con->actual_peer_addr);  	if (ret <= 0)  		goto out; -	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), -			   &con->peer_addr_for_me); + +	size = sizeof (con->peer_addr_for_me); +	end += size; +	ret = read_partial(con, end, size, &con->peer_addr_for_me);  	if (ret <= 0)  		goto out; +  out:  	return ret;  }  static int read_partial_connect(struct ceph_connection *con)  { -	int ret, to = 0; +	int size; +	int end; +	int ret;  	dout("read_partial_connect %p at %d\n", con, con->in_base_pos); -	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); +	size = sizeof (con->in_reply); +	end = size; +	ret = read_partial(con, end, size, &con->in_reply);  	if (ret <= 0)  		goto out; -	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), -			   con->auth_reply_buf); + +	size = le32_to_cpu(con->in_reply.authorizer_len); +	end += size; +	ret = read_partial(con, end, size, con->auth_reply_buf);  	if (ret <= 0)  		goto out; @@ -1059,12 +1737,109 @@ static void addr_set_port(struct sockaddr_storage *ss, int p)  	switch (ss->ss_family) {  	case AF_INET:  		((struct sockaddr_in *)ss)->sin_port = htons(p); +		break;  	case AF_INET6:  		((struct sockaddr_in6 *)ss)->sin6_port = htons(p); +		break;  	}  }  /* + * Unlike other *_pton function semantics, zero indicates success. + */ +static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, +		char delim, const char **ipend) +{ +	struct sockaddr_in *in4 = (struct sockaddr_in *) ss; +	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; + +	memset(ss, 0, sizeof(*ss)); + +	if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) { +		ss->ss_family = AF_INET; +		return 0; +	} + +	if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) { +		ss->ss_family = AF_INET6; +		return 0; +	} + +	return -EINVAL; +} + +/* + * Extract hostname string and resolve using kernel DNS facility. + */ +#ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER +static int ceph_dns_resolve_name(const char *name, size_t namelen, +		struct sockaddr_storage *ss, char delim, const char **ipend) +{ +	const char *end, *delim_p; +	char *colon_p, *ip_addr = NULL; +	int ip_len, ret; + +	/* +	 * The end of the hostname occurs immediately preceding the delimiter or +	 * the port marker (':') where the delimiter takes precedence. +	 */ +	delim_p = memchr(name, delim, namelen); +	colon_p = memchr(name, ':', namelen); + +	if (delim_p && colon_p) +		end = delim_p < colon_p ? delim_p : colon_p; +	else if (!delim_p && colon_p) +		end = colon_p; +	else { +		end = delim_p; +		if (!end) /* case: hostname:/ */ +			end = name + namelen; +	} + +	if (end <= name) +		return -EINVAL; + +	/* do dns_resolve upcall */ +	ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL); +	if (ip_len > 0) +		ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL); +	else +		ret = -ESRCH; + +	kfree(ip_addr); + +	*ipend = end; + +	pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, +			ret, ret ? "failed" : ceph_pr_addr(ss)); + +	return ret; +} +#else +static inline int ceph_dns_resolve_name(const char *name, size_t namelen, +		struct sockaddr_storage *ss, char delim, const char **ipend) +{ +	return -EINVAL; +} +#endif + +/* + * Parse a server name (IP or hostname). 
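+ * In a monitor list this accepts entries such as "1.2.3.4:6789",
+ * "[::1]:6789" or, when CONFIG_CEPH_LIB_USE_DNS_RESOLVER is enabled,
+ * "mon.example.com" (hypothetical addresses).  Caller-side sketch,
+ * via ceph_parse_ips() below:
+ *
+ *	const char *s = "1.2.3.4:6789,[::1]:6789,mon.example.com";
+ *	struct ceph_entity_addr addr[3];
+ *	int count;
+ *	int err = ceph_parse_ips(s, s + strlen(s), addr, 3, &count);
+ *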
If a valid IP address is not found + * then try to extract a hostname to resolve using userspace DNS upcall. + */ +static int ceph_parse_server_name(const char *name, size_t namelen, +			struct sockaddr_storage *ss, char delim, const char **ipend) +{ +	int ret; + +	ret = ceph_pton(name, namelen, ss, delim, ipend); +	if (ret) +		ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend); + +	return ret; +} + +/*   * Parse an ip[:port] list into an addr array.  Use the default   * monitor port if a port isn't specified.   */ @@ -1072,15 +1847,13 @@ int ceph_parse_ips(const char *c, const char *end,  		   struct ceph_entity_addr *addr,  		   int max_count, int *count)  { -	int i; +	int i, ret = -EINVAL;  	const char *p = c;  	dout("parse_ips on '%.*s'\n", (int)(end-c), c);  	for (i = 0; i < max_count; i++) {  		const char *ipend;  		struct sockaddr_storage *ss = &addr[i].in_addr; -		struct sockaddr_in *in4 = (void *)ss; -		struct sockaddr_in6 *in6 = (void *)ss;  		int port;  		char delim = ','; @@ -1089,15 +1862,11 @@ int ceph_parse_ips(const char *c, const char *end,  			p++;  		} -		memset(ss, 0, sizeof(*ss)); -		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, -			     delim, &ipend)) -			ss->ss_family = AF_INET; -		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, -				  delim, &ipend)) -			ss->ss_family = AF_INET6; -		else +		ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend); +		if (ret)  			goto bad; +		ret = -EINVAL; +  		p = ipend;  		if (delim == ']') { @@ -1116,7 +1885,9 @@ int ceph_parse_ips(const char *c, const char *end,  				port = (port * 10) + (*p - '0');  				p++;  			} -			if (port > 65535 || port == 0) +			if (port == 0) +				port = CEPH_MON_PORT; +			else if (port > 65535)  				goto bad;  		} else {  			port = CEPH_MON_PORT; @@ -1142,7 +1913,7 @@ int ceph_parse_ips(const char *c, const char *end,  bad:  	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); -	return -EINVAL; +	return ret;  }  EXPORT_SYMBOL(ceph_parse_ips); @@ -1189,27 +1960,16 @@ static int process_banner(struct ceph_connection *con)  		     ceph_pr_addr(&con->msgr->inst.addr.in_addr));  	} -	set_bit(NEGOTIATING, &con->state); -	prepare_read_connect(con);  	return 0;  } -static void fail_protocol(struct ceph_connection *con) -{ -	reset_connection(con); -	set_bit(CLOSED, &con->state);  /* in case there's queued work */ - -	mutex_unlock(&con->mutex); -	if (con->ops->bad_proto) -		con->ops->bad_proto(con); -	mutex_lock(&con->mutex); -} -  static int process_connect(struct ceph_connection *con)  {  	u64 sup_feat = con->msgr->supported_features;  	u64 req_feat = con->msgr->required_features; -	u64 server_feat = le64_to_cpu(con->in_reply.features); +	u64 server_feat = ceph_sanitize_features( +				le64_to_cpu(con->in_reply.features)); +	int ret;  	dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1221,7 +1981,7 @@ static int process_connect(struct ceph_connection *con)  		       ceph_pr_addr(&con->peer_addr.in_addr),  		       sup_feat, server_feat, server_feat & ~sup_feat);  		con->error_msg = "missing required protocol features"; -		fail_protocol(con); +		reset_connection(con);  		return -1;  	case CEPH_MSGR_TAG_BADPROTOVER: @@ -1232,7 +1992,7 @@ static int process_connect(struct ceph_connection *con)  		       le32_to_cpu(con->out_connect.protocol_version),  		       le32_to_cpu(con->in_reply.protocol_version));  		con->error_msg = "protocol version mismatch"; -		fail_protocol(con); +		reset_connection(con);  		return -1;  	case CEPH_MSGR_TAG_BADAUTHORIZER: @@ 
-1241,12 +2001,12 @@ static int process_connect(struct ceph_connection *con)  		     con->auth_retry);  		if (con->auth_retry == 2) {  			con->error_msg = "connect authorization failure"; -			reset_connection(con); -			set_bit(CLOSED, &con->state);  			return -1;  		} -		con->auth_retry = 1; -		prepare_write_connect(con->msgr, con, 0); +		con_out_kvec_reset(con); +		ret = prepare_write_connect(con); +		if (ret < 0) +			return ret;  		prepare_read_connect(con);  		break; @@ -1259,12 +2019,15 @@ static int process_connect(struct ceph_connection *con)  		 * dropped messages.  		 */  		dout("process_connect got RESET peer seq %u\n", -		     le32_to_cpu(con->in_connect.connect_seq)); +		     le32_to_cpu(con->in_reply.connect_seq));  		pr_err("%s%lld %s connection reset\n",  		       ENTITY_NAME(con->peer_name),  		       ceph_pr_addr(&con->peer_addr.in_addr));  		reset_connection(con); -		prepare_write_connect(con->msgr, con, 0); +		con_out_kvec_reset(con); +		ret = prepare_write_connect(con); +		if (ret < 0) +			return ret;  		prepare_read_connect(con);  		/* Tell ceph about it. */ @@ -1273,6 +2036,8 @@ static int process_connect(struct ceph_connection *con)  		if (con->ops->peer_reset)  			con->ops->peer_reset(con);  		mutex_lock(&con->mutex); +		if (con->state != CON_STATE_NEGOTIATING) +			return -EAGAIN;  		break;  	case CEPH_MSGR_TAG_RETRY_SESSION: @@ -1280,11 +2045,14 @@ static int process_connect(struct ceph_connection *con)  		 * If we sent a smaller connect_seq than the peer has, try  		 * again with a larger value.  		 */ -		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", +		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",  		     le32_to_cpu(con->out_connect.connect_seq), -		     le32_to_cpu(con->in_connect.connect_seq)); -		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); -		prepare_write_connect(con->msgr, con, 0); +		     le32_to_cpu(con->in_reply.connect_seq)); +		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); +		con_out_kvec_reset(con); +		ret = prepare_write_connect(con); +		if (ret < 0) +			return ret;  		prepare_read_connect(con);  		break; @@ -1295,13 +2063,17 @@ static int process_connect(struct ceph_connection *con)  		 */  		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",  		     con->peer_global_seq, -		     le32_to_cpu(con->in_connect.global_seq)); +		     le32_to_cpu(con->in_reply.global_seq));  		get_global_seq(con->msgr, -			       le32_to_cpu(con->in_connect.global_seq)); -		prepare_write_connect(con->msgr, con, 0); +			       le32_to_cpu(con->in_reply.global_seq)); +		con_out_kvec_reset(con); +		ret = prepare_write_connect(con); +		if (ret < 0) +			return ret;  		prepare_read_connect(con);  		break; +	case CEPH_MSGR_TAG_SEQ:  	case CEPH_MSGR_TAG_READY:  		if (req_feat & ~server_feat) {  			pr_err("%s%lld %s protocol feature mismatch," @@ -1310,10 +2082,13 @@ static int process_connect(struct ceph_connection *con)  			       ceph_pr_addr(&con->peer_addr.in_addr),  			       req_feat, server_feat, req_feat & ~server_feat);  			con->error_msg = "missing required protocol features"; -			fail_protocol(con); +			reset_connection(con);  			return -1;  		} -		clear_bit(CONNECTING, &con->state); + +		WARN_ON(con->state != CON_STATE_NEGOTIATING); +		con->state = CON_STATE_OPEN; +		con->auth_retry = 0;    /* we authenticated; clear flag */  		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);  		con->connect_seq++;  		con->peer_features = server_feat; @@ -1325,9 +2100,16 @@ static int 
process_connect(struct ceph_connection *con)  			le32_to_cpu(con->in_reply.connect_seq));  		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) -			set_bit(LOSSYTX, &con->state); +			con_flag_set(con, CON_FLAG_LOSSYTX); + +		con->delay = 0;      /* reset backoff memory */ -		prepare_read_tag(con); +		if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { +			prepare_write_seq(con); +			prepare_read_seq(con); +		} else { +			prepare_read_tag(con); +		}  		break;  	case CEPH_MSGR_TAG_WAIT: @@ -1337,7 +2119,9 @@ static int process_connect(struct ceph_connection *con)  		 * to WAIT.  This shouldn't happen if we are the  		 * client.  		 */ -		pr_err("process_connect peer connecting WAIT\n"); +		pr_err("process_connect got WAIT as client\n"); +		con->error_msg = "protocol error, got WAIT as client"; +		return -1;  	default:  		pr_err("connect protocol error, will retry\n"); @@ -1353,13 +2137,12 @@ static int process_connect(struct ceph_connection *con)   */  static int read_partial_ack(struct ceph_connection *con)  { -	int to = 0; +	int size = sizeof (con->in_temp_ack); +	int end = size; -	return read_partial(con, &to, sizeof(con->in_temp_ack), -			    &con->in_temp_ack); +	return read_partial(con, end, size, &con->in_temp_ack);  } -  /*   * We can finally discard anything that's been acked.   */ @@ -1377,14 +2160,13 @@ static void process_ack(struct ceph_connection *con)  			break;  		dout("got ack for seq %llu type %d at %p\n", seq,  		     le16_to_cpu(m->hdr.type), m); +		m->ack_stamp = jiffies;  		ceph_msg_remove(m);  	}  	prepare_read_tag(con);  } - -  static int read_partial_message_section(struct ceph_connection *con,  					struct kvec *section,  					unsigned int sec_len, u32 *crc) @@ -1401,145 +2183,104 @@ static int read_partial_message_section(struct ceph_connection *con,  		if (ret <= 0)  			return ret;  		section->iov_len += ret; -		if (section->iov_len == sec_len) -			*crc = crc32c(0, section->iov_base, -				      section->iov_len);  	} +	if (section->iov_len == sec_len) +		*crc = crc32c(0, section->iov_base, section->iov_len);  	return 1;  } -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, -				struct ceph_msg_header *hdr, -				int *skip); - - -static int read_partial_message_pages(struct ceph_connection *con, -				      struct page **pages, -				      unsigned data_len, int datacrc) +static int read_partial_msg_data(struct ceph_connection *con)  { -	void *p; +	struct ceph_msg *msg = con->in_msg; +	struct ceph_msg_data_cursor *cursor = &msg->cursor; +	const bool do_datacrc = !con->msgr->nocrc; +	struct page *page; +	size_t page_offset; +	size_t length; +	u32 crc = 0;  	int ret; -	int left; - -	left = min((int)(data_len - con->in_msg_pos.data_pos), -		   (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); -	/* (page) data */ -	BUG_ON(pages == NULL); -	p = kmap(pages[con->in_msg_pos.page]); -	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, -			       left); -	if (ret > 0 && datacrc) -		con->in_data_crc = -			crc32c(con->in_data_crc, -				  p + con->in_msg_pos.page_pos, ret); -	kunmap(pages[con->in_msg_pos.page]); -	if (ret <= 0) -		return ret; -	con->in_msg_pos.data_pos += ret; -	con->in_msg_pos.page_pos += ret; -	if (con->in_msg_pos.page_pos == PAGE_SIZE) { -		con->in_msg_pos.page_pos = 0; -		con->in_msg_pos.page++; -	} -	return ret; -} - -#ifdef CONFIG_BLOCK -static int read_partial_message_bio(struct ceph_connection *con, -				    struct bio **bio_iter, int *bio_seg, -				    unsigned data_len, int datacrc) -{ -	struct bio_vec *bv = bio_iovec_idx(*bio_iter, 
*bio_seg); -	void *p; -	int ret, left; - -	if (IS_ERR(bv)) -		return PTR_ERR(bv); +	BUG_ON(!msg); +	if (list_empty(&msg->data)) +		return -EIO; -	left = min((int)(data_len - con->in_msg_pos.data_pos), -		   (int)(bv->bv_len - con->in_msg_pos.page_pos)); +	if (do_datacrc) +		crc = con->in_data_crc; +	while (cursor->resid) { +		page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, +							NULL); +		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); +		if (ret <= 0) { +			if (do_datacrc) +				con->in_data_crc = crc; -	p = kmap(bv->bv_page) + bv->bv_offset; +			return ret; +		} -	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, -			       left); -	if (ret > 0 && datacrc) -		con->in_data_crc = -			crc32c(con->in_data_crc, -				  p + con->in_msg_pos.page_pos, ret); -	kunmap(bv->bv_page); -	if (ret <= 0) -		return ret; -	con->in_msg_pos.data_pos += ret; -	con->in_msg_pos.page_pos += ret; -	if (con->in_msg_pos.page_pos == bv->bv_len) { -		con->in_msg_pos.page_pos = 0; -		iter_bio_next(bio_iter, bio_seg); +		if (do_datacrc) +			crc = ceph_crc32c_page(crc, page, page_offset, ret); +		(void) ceph_msg_data_advance(&msg->cursor, (size_t)ret);  	} +	if (do_datacrc) +		con->in_data_crc = crc; -	return ret; +	return 1;	/* must return > 0 to indicate success */  } -#endif  /*   * read (part of) a message.   */ +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); +  static int read_partial_message(struct ceph_connection *con)  {  	struct ceph_msg *m = con->in_msg; +	int size; +	int end;  	int ret; -	int to, left; -	unsigned front_len, middle_len, data_len, data_off; -	int datacrc = con->msgr->nocrc; -	int skip; +	unsigned int front_len, middle_len, data_len; +	bool do_datacrc = !con->msgr->nocrc;  	u64 seq; +	u32 crc;  	dout("read_partial_message con %p msg %p\n", con, m);  	/* header */ -	while (con->in_base_pos < sizeof(con->in_hdr)) { -		left = sizeof(con->in_hdr) - con->in_base_pos; -		ret = ceph_tcp_recvmsg(con->sock, -				       (char *)&con->in_hdr + con->in_base_pos, -				       left); -		if (ret <= 0) -			return ret; -		con->in_base_pos += ret; -		if (con->in_base_pos == sizeof(con->in_hdr)) { -			u32 crc = crc32c(0, (void *)&con->in_hdr, -				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); -			if (crc != le32_to_cpu(con->in_hdr.crc)) { -				pr_err("read_partial_message bad hdr " -				       " crc %u != expected %u\n", -				       crc, con->in_hdr.crc); -				return -EBADMSG; -			} -		} +	size = sizeof (con->in_hdr); +	end = size; +	ret = read_partial(con, end, size, &con->in_hdr); +	if (ret <= 0) +		return ret; + +	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); +	if (cpu_to_le32(crc) != con->in_hdr.crc) { +		pr_err("read_partial_message bad hdr " +		       " crc %u != expected %u\n", +		       crc, con->in_hdr.crc); +		return -EBADMSG;  	} +  	front_len = le32_to_cpu(con->in_hdr.front_len);  	if (front_len > CEPH_MSG_MAX_FRONT_LEN)  		return -EIO;  	middle_len = le32_to_cpu(con->in_hdr.middle_len); -	if (middle_len > CEPH_MSG_MAX_DATA_LEN) +	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)  		return -EIO;  	data_len = le32_to_cpu(con->in_hdr.data_len);  	if (data_len > CEPH_MSG_MAX_DATA_LEN)  		return -EIO; -	data_off = le16_to_cpu(con->in_hdr.data_off);  	/* verify seq# */  	seq = le64_to_cpu(con->in_hdr.seq);  	if ((s64)seq - (s64)con->in_seq < 1) { -		pr_info("skipping %s%lld %s seq %lld, expected %lld\n", +		pr_info("skipping %s%lld %s seq %lld expected %lld\n",  			ENTITY_NAME(con->peer_name),  			
ceph_pr_addr(&con->peer_addr.in_addr),  			seq, con->in_seq + 1);  		con->in_base_pos = -front_len - middle_len - data_len -  			sizeof(m->footer);  		con->in_tag = CEPH_MSGR_TAG_READY; -		con->in_seq++;  		return 0;  	} else if ((s64)seq - (s64)con->in_seq > 1) {  		pr_err("read_partial_message bad seq %lld expected %lld\n", @@ -1550,36 +2291,43 @@ static int read_partial_message(struct ceph_connection *con)  	/* allocate message? */  	if (!con->in_msg) { +		int skip = 0; +  		dout("got hdr type %d front %d data %d\n", con->in_hdr.type, -		     con->in_hdr.front_len, con->in_hdr.data_len); -		skip = 0; -		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); +		     front_len, data_len); +		ret = ceph_con_in_msg_alloc(con, &skip); +		if (ret < 0) +			return ret; + +		BUG_ON(!con->in_msg ^ skip); +		if (con->in_msg && data_len > con->in_msg->data_length) { +			pr_warning("%s skipping long message (%u > %zd)\n", +				__func__, data_len, con->in_msg->data_length); +			ceph_msg_put(con->in_msg); +			con->in_msg = NULL; +			skip = 1; +		}  		if (skip) {  			/* skip this message */  			dout("alloc_msg said skip message\n"); -			BUG_ON(con->in_msg);  			con->in_base_pos = -front_len - middle_len - data_len -  				sizeof(m->footer);  			con->in_tag = CEPH_MSGR_TAG_READY;  			con->in_seq++;  			return 0;  		} -		if (!con->in_msg) { -			con->error_msg = -				"error allocating memory for incoming message"; -			return -ENOMEM; -		} + +		BUG_ON(!con->in_msg); +		BUG_ON(con->in_msg->con != con);  		m = con->in_msg;  		m->front.iov_len = 0;    /* haven't read it yet */  		if (m->middle)  			m->middle->vec.iov_len = 0; -		con->in_msg_pos.page = 0; -		if (m->pages) -			con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; -		else -			con->in_msg_pos.page_pos = 0; -		con->in_msg_pos.data_pos = 0; +		/* prepare for data payload, if any */ + +		if (data_len) +			prepare_message_data(con->in_msg, data_len);  	}  	/* front */ @@ -1596,43 +2344,21 @@ static int read_partial_message(struct ceph_connection *con)  		if (ret <= 0)  			return ret;  	} -#ifdef CONFIG_BLOCK -	if (m->bio && !m->bio_iter) -		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif  	/* (page) data */ -	while (con->in_msg_pos.data_pos < data_len) { -		if (m->pages) { -			ret = read_partial_message_pages(con, m->pages, -						 data_len, datacrc); -			if (ret <= 0) -				return ret; -#ifdef CONFIG_BLOCK -		} else if (m->bio) { - -			ret = read_partial_message_bio(con, -						 &m->bio_iter, &m->bio_seg, -						 data_len, datacrc); -			if (ret <= 0) -				return ret; -#endif -		} else { -			BUG_ON(1); -		} -	} - -	/* footer */ -	to = sizeof(m->hdr) + sizeof(m->footer); -	while (con->in_base_pos < to) { -		left = to - con->in_base_pos; -		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + -				       (con->in_base_pos - sizeof(m->hdr)), -				       left); +	if (data_len) { +		ret = read_partial_msg_data(con);  		if (ret <= 0)  			return ret; -		con->in_base_pos += ret;  	} + +	/* footer */ +	size = sizeof (m->footer); +	end += size; +	ret = read_partial(con, end, size, &m->footer); +	if (ret <= 0) +		return ret; +  	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",  	     m, front_len, m->footer.front_crc, middle_len,  	     m->footer.middle_crc, data_len, m->footer.data_crc); @@ -1648,7 +2374,7 @@ static int read_partial_message(struct ceph_connection *con)  		       m, con->in_middle_crc, m->footer.middle_crc);  		return -EBADMSG;  	} -	if (datacrc && +	if (do_datacrc &&  	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) 
== 0 &&  	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {  		pr_err("read_partial_message %p data crc %u != exp. %u\n", m, @@ -1668,8 +2394,11 @@ static void process_message(struct ceph_connection *con)  {  	struct ceph_msg *msg; +	BUG_ON(con->in_msg->con != con); +	con->in_msg->con = NULL;  	msg = con->in_msg;  	con->in_msg = NULL; +	con->ops->put(con);  	/* if first message, set peer_name */  	if (con->peer_name.type == 0) @@ -1689,7 +2418,6 @@ static void process_message(struct ceph_connection *con)  	con->ops->dispatch(con, msg);  	mutex_lock(&con->mutex); -	prepare_read_tag(con);  } @@ -1699,40 +2427,29 @@ static void process_message(struct ceph_connection *con)   */  static int try_write(struct ceph_connection *con)  { -	struct ceph_messenger *msgr = con->msgr;  	int ret = 1; -	dout("try_write start %p state %lu nref %d\n", con, con->state, -	     atomic_read(&con->nref)); +	dout("try_write start %p state %lu\n", con, con->state);  more:  	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);  	/* open the socket first? */ -	if (con->sock == NULL) { -		/* -		 * if we were STANDBY and are reconnecting _this_ -		 * connection, bump connect_seq now.  Always bump -		 * global_seq. -		 */ -		if (test_and_clear_bit(STANDBY, &con->state)) -			con->connect_seq++; +	if (con->state == CON_STATE_PREOPEN) { +		BUG_ON(con->sock); +		con->state = CON_STATE_CONNECTING; -		prepare_write_banner(msgr, con); -		prepare_write_connect(msgr, con, 1); +		con_out_kvec_reset(con); +		prepare_write_banner(con);  		prepare_read_banner(con); -		set_bit(CONNECTING, &con->state); -		clear_bit(NEGOTIATING, &con->state);  		BUG_ON(con->in_msg);  		con->in_tag = CEPH_MSGR_TAG_READY;  		dout("try_write initiating connect on %p new state %lu\n",  		     con, con->state); -		con->sock = ceph_tcp_connect(con); -		if (IS_ERR(con->sock)) { -			con->sock = NULL; +		ret = ceph_tcp_connect(con); +		if (ret < 0) {  			con->error_msg = "connect error"; -			ret = -1;  			goto out;  		}  	} @@ -1742,16 +2459,12 @@ more_kvec:  	if (con->out_skip) {  		ret = write_partial_skip(con);  		if (ret <= 0) -			goto done; -		if (ret < 0) { -			dout("try_write write_partial_skip err %d\n", ret); -			goto done; -		} +			goto out;  	}  	if (con->out_kvec_left) {  		ret = write_partial_kvec(con);  		if (ret <= 0) -			goto done; +			goto out;  	}  	/* msg pages? */ @@ -1762,20 +2475,20 @@ more_kvec:  			goto do_next;  		} -		ret = write_partial_msg_pages(con); +		ret = write_partial_message_data(con);  		if (ret == 1)  			goto more_kvec;  /* we need to send the footer, too! */  		if (ret == 0) -			goto done; +			goto out;  		if (ret < 0) { -			dout("try_write write_partial_msg_pages err %d\n", +			dout("try_write write_partial_message_data err %d\n",  			     ret); -			goto done; +			goto out;  		}  	}  do_next: -	if (!test_bit(CONNECTING, &con->state)) { +	if (con->state == CON_STATE_OPEN) {  		/* is anything else pending? */  		if (!list_empty(&con->out_queue)) {  			prepare_write_message(con); @@ -1785,19 +2498,18 @@ do_next:  			prepare_write_ack(con);  			goto more;  		} -		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { +		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {  			prepare_write_keepalive(con);  			goto more;  		}  	}  	/* Nothing to do! 
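	 * (All queued messages, acks and keepalives have been written;
	 * clearing WRITE_PENDING below lets a later ceph_con_send() or
	 * ceph_con_keepalive() re-queue the work item.)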
*/ -	clear_bit(WRITE_PENDING, &con->state); +	con_flag_clear(con, CON_FLAG_WRITE_PENDING);  	dout("try_write nothing else to write.\n"); -done:  	ret = 0;  out: -	dout("try_write done on %p\n", con); +	dout("try_write done on %p ret %d\n", con, ret);  	return ret;  } @@ -1810,50 +2522,69 @@ static int try_read(struct ceph_connection *con)  {  	int ret = -1; -	if (!con->sock) -		return 0; - -	if (test_bit(STANDBY, &con->state)) +more: +	dout("try_read start on %p state %lu\n", con, con->state); +	if (con->state != CON_STATE_CONNECTING && +	    con->state != CON_STATE_NEGOTIATING && +	    con->state != CON_STATE_OPEN)  		return 0; -	dout("try_read start on %p\n", con); +	BUG_ON(!con->sock); -more:  	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,  	     con->in_base_pos); -	if (test_bit(CONNECTING, &con->state)) { -		if (!test_bit(NEGOTIATING, &con->state)) { -			dout("try_read connecting\n"); -			ret = read_partial_banner(con); -			if (ret <= 0) -				goto done; -			if (process_banner(con) < 0) { -				ret = -1; -				goto out; -			} -		} + +	if (con->state == CON_STATE_CONNECTING) { +		dout("try_read connecting\n"); +		ret = read_partial_banner(con); +		if (ret <= 0) +			goto out; +		ret = process_banner(con); +		if (ret < 0) +			goto out; + +		con->state = CON_STATE_NEGOTIATING; + +		/* +		 * Received banner is good, exchange connection info. +		 * Do not reset out_kvec, as sending our banner raced +		 * with receiving peer banner after connect completed. +		 */ +		ret = prepare_write_connect(con); +		if (ret < 0) +			goto out; +		prepare_read_connect(con); + +		/* Send connection info before awaiting response */ +		goto out; +	} + +	if (con->state == CON_STATE_NEGOTIATING) { +		dout("try_read negotiating\n");  		ret = read_partial_connect(con);  		if (ret <= 0) -			goto done; -		if (process_connect(con) < 0) { -			ret = -1;  			goto out; -		} +		ret = process_connect(con); +		if (ret < 0) +			goto out;  		goto more;  	} +	WARN_ON(con->state != CON_STATE_OPEN); +  	if (con->in_base_pos < 0) {  		/*  		 * skipping + discarding content.  		 *  		 * FIXME: there must be a better way to do this!  		 
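		 * In sketch form, the code below just drains the socket
		 * in bounded chunks until in_base_pos climbs back to 0:
		 *
		 *	skip = min(sizeof(buf), (size_t)-con->in_base_pos);
		 *	ret = ceph_tcp_recvmsg(con->sock, buf, skip);
		 *	if (ret > 0)
		 *		con->in_base_pos += ret;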
*/ -		static char buf[1024]; -		int skip = min(1024, -con->in_base_pos); +		static char buf[SKIP_BUF_SIZE]; +		int skip = min((int) sizeof (buf), -con->in_base_pos); +  		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);  		ret = ceph_tcp_recvmsg(con->sock, buf, skip);  		if (ret <= 0) -			goto done; +			goto out;  		con->in_base_pos += ret;  		if (con->in_base_pos)  			goto more; @@ -1864,7 +2595,7 @@ more:  		 */  		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);  		if (ret <= 0) -			goto done; +			goto out;  		dout("try_read got tag %d\n", (int)con->in_tag);  		switch (con->in_tag) {  		case CEPH_MSGR_TAG_MSG: @@ -1874,8 +2605,9 @@ more:  			prepare_read_ack(con);  			break;  		case CEPH_MSGR_TAG_CLOSE: -			set_bit(CLOSED, &con->state);   /* fixme */ -			goto done; +			con_close_socket(con); +			con->state = CON_STATE_CLOSED; +			goto out;  		default:  			goto bad_tag;  		} @@ -1887,31 +2619,35 @@ more:  			case -EBADMSG:  				con->error_msg = "bad crc";  				ret = -EIO; -				goto out; +				break;  			case -EIO:  				con->error_msg = "io error"; -				goto out; -			default: -				goto done; +				break;  			} +			goto out;  		}  		if (con->in_tag == CEPH_MSGR_TAG_READY)  			goto more;  		process_message(con); +		if (con->state == CON_STATE_OPEN) +			prepare_read_tag(con);  		goto more;  	} -	if (con->in_tag == CEPH_MSGR_TAG_ACK) { +	if (con->in_tag == CEPH_MSGR_TAG_ACK || +	    con->in_tag == CEPH_MSGR_TAG_SEQ) { +		/* +		 * the final handshake seq exchange is semantically +		 * equivalent to an ACK +		 */  		ret = read_partial_ack(con);  		if (ret <= 0) -			goto done; +			goto out;  		process_ack(con);  		goto more;  	} -done: -	ret = 0;  out: -	dout("try_read done on %p\n", con); +	dout("try_read done on %p ret %d\n", con, ret);  	return ret;  bad_tag: @@ -1923,46 +2659,97 @@ bad_tag:  /* - * Atomically queue work on a connection.  Bump @con reference to - * avoid races with connection teardown. - * - * There is some trickery going on with QUEUED and BUSY because we - * only want a _single_ thread operating on each connection at any - * point in time, but we want to use all available CPUs. - * - * The worker thread only proceeds if it can atomically set BUSY.  It - * clears QUEUED and does it's thing.  When it thinks it's done, it - * clears BUSY, then rechecks QUEUED.. if it's set again, it loops - * (tries again to set BUSY). - * - * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we - * try to queue work.  If that fails (work is already queued, or BUSY) - * we give up (work also already being done or is queued) but leave QUEUED - * set so that the worker thread will loop if necessary. + * Atomically queue work on a connection after the specified delay. + * Bump @con reference to avoid races with connection teardown. + * Returns 0 if work was queued, or an error code otherwise.   
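+ * Usage sketch, as in the backoff path further below:
+ *
+ *	ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+ *
+ * where -ENOENT means the connection is being torn down (its ref
+ * count was zero) and -EBUSY means work was already queued, so the
+ * queued run will pick up our state.
+ *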
*/ +static int queue_con_delay(struct ceph_connection *con, unsigned long delay) +{ +	if (!con->ops->get(con)) { +		dout("%s %p ref count 0\n", __func__, con); + +		return -ENOENT; +	} + +	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { +		dout("%s %p - already queued\n", __func__, con); +		con->ops->put(con); + +		return -EBUSY; +	} + +	dout("%s %p %lu\n", __func__, con, delay); + +	return 0; +} +  static void queue_con(struct ceph_connection *con)  { -	if (test_bit(DEAD, &con->state)) { -		dout("queue_con %p ignoring: DEAD\n", -		     con); -		return; +	(void) queue_con_delay(con, 0); +} + +static bool con_sock_closed(struct ceph_connection *con) +{ +	if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) +		return false; + +#define CASE(x)								\ +	case CON_STATE_ ## x:						\ +		con->error_msg = "socket closed (con state " #x ")";	\ +		break; + +	switch (con->state) { +	CASE(CLOSED); +	CASE(PREOPEN); +	CASE(CONNECTING); +	CASE(NEGOTIATING); +	CASE(OPEN); +	CASE(STANDBY); +	default: +		pr_warning("%s con %p unrecognized state %lu\n", +			__func__, con, con->state); +		con->error_msg = "unrecognized con state"; +		BUG(); +		break;  	} +#undef CASE -	if (!con->ops->get(con)) { -		dout("queue_con %p ref count 0\n", con); -		return; +	return true; +} + +static bool con_backoff(struct ceph_connection *con) +{ +	int ret; + +	if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) +		return false; + +	ret = queue_con_delay(con, round_jiffies_relative(con->delay)); +	if (ret) { +		dout("%s: con %p FAILED to back off %lu\n", __func__, +			con, con->delay); +		BUG_ON(ret == -ENOENT); +		con_flag_set(con, CON_FLAG_BACKOFF);  	} -	set_bit(QUEUED, &con->state); -	if (test_bit(BUSY, &con->state)) { -		dout("queue_con %p - already BUSY\n", con); -		con->ops->put(con); -	} else if (!queue_work(ceph_msgr_wq, &con->work.work)) { -		dout("queue_con %p - already queued\n", con); -		con->ops->put(con); -	} else { -		dout("queue_con %p\n", con); +	return true; +} + +/* Finish fault handling; con->mutex must *not* be held here */ + +static void con_fault_finish(struct ceph_connection *con) +{ +	/* +	 * in case we faulted due to authentication, invalidate our +	 * current tickets so that we can get new ones. +	 */ +	if (con->auth_retry && con->ops->invalidate_authorizer) { +		dout("calling invalidate_authorizer()\n"); +		con->ops->invalidate_authorizer(con);  	} + +	if (con->ops->fault) +		con->ops->fault(con);  }  /* @@ -1972,152 +2759,132 @@ static void con_work(struct work_struct *work)  {  	struct ceph_connection *con = container_of(work, struct ceph_connection,  						   work.work); -	int backoff = 0; - -more: -	if (test_and_set_bit(BUSY, &con->state) != 0) { -		dout("con_work %p BUSY already set\n", con); -		goto out; -	} -	dout("con_work %p start, clearing QUEUED\n", con); -	clear_bit(QUEUED, &con->state); +	bool fault;  	mutex_lock(&con->mutex); +	while (true) { +		int ret; -	if (test_bit(CLOSED, &con->state)) { /* e.g. 
if we are replaced */ -		dout("con_work CLOSED\n"); -		con_close_socket(con); -		goto done; -	} -	if (test_and_clear_bit(OPENING, &con->state)) { -		/* reopen w/ new peer */ -		dout("con_work OPENING\n"); -		con_close_socket(con); -	} - -	if (test_and_clear_bit(SOCK_CLOSED, &con->state) || -	    try_read(con) < 0 || -	    try_write(con) < 0) { -		mutex_unlock(&con->mutex); -		backoff = 1; -		ceph_fault(con);     /* error/fault path */ -		goto done_unlocked; -	} +		if ((fault = con_sock_closed(con))) { +			dout("%s: con %p SOCK_CLOSED\n", __func__, con); +			break; +		} +		if (con_backoff(con)) { +			dout("%s: con %p BACKOFF\n", __func__, con); +			break; +		} +		if (con->state == CON_STATE_STANDBY) { +			dout("%s: con %p STANDBY\n", __func__, con); +			break; +		} +		if (con->state == CON_STATE_CLOSED) { +			dout("%s: con %p CLOSED\n", __func__, con); +			BUG_ON(con->sock); +			break; +		} +		if (con->state == CON_STATE_PREOPEN) { +			dout("%s: con %p PREOPEN\n", __func__, con); +			BUG_ON(con->sock); +		} -done: -	mutex_unlock(&con->mutex); +		ret = try_read(con); +		if (ret < 0) { +			if (ret == -EAGAIN) +				continue; +			con->error_msg = "socket error on read"; +			fault = true; +			break; +		} -done_unlocked: -	clear_bit(BUSY, &con->state); -	dout("con->state=%lu\n", con->state); -	if (test_bit(QUEUED, &con->state)) { -		if (!backoff || test_bit(OPENING, &con->state)) { -			dout("con_work %p QUEUED reset, looping\n", con); -			goto more; +		ret = try_write(con); +		if (ret < 0) { +			if (ret == -EAGAIN) +				continue; +			con->error_msg = "socket error on write"; +			fault = true;  		} -		dout("con_work %p QUEUED reset, but just faulted\n", con); -		clear_bit(QUEUED, &con->state); + +		break;	/* If we make it to here, we're done */  	} -	dout("con_work %p done\n", con); +	if (fault) +		con_fault(con); +	mutex_unlock(&con->mutex); + +	if (fault) +		con_fault_finish(con); -out:  	con->ops->put(con);  } -  /*   * Generic error/fault handler.  A retry mechanism is used with   * exponential backoff   */ -static void ceph_fault(struct ceph_connection *con) +static void con_fault(struct ceph_connection *con)  { -	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), +	pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),  	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);  	dout("fault %p state %lu to peer %s\n",  	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); -	if (test_bit(LOSSYTX, &con->state)) { -		dout("fault on LOSSYTX channel\n"); -		goto out; -	} - -	mutex_lock(&con->mutex); -	if (test_bit(CLOSED, &con->state)) -		goto out_unlock; +	WARN_ON(con->state != CON_STATE_CONNECTING && +	       con->state != CON_STATE_NEGOTIATING && +	       con->state != CON_STATE_OPEN);  	con_close_socket(con); +	if (con_flag_test(con, CON_FLAG_LOSSYTX)) { +		dout("fault on LOSSYTX channel, marking CLOSED\n"); +		con->state = CON_STATE_CLOSED; +		return; +	} +  	if (con->in_msg) { +		BUG_ON(con->in_msg->con != con); +		con->in_msg->con = NULL;  		ceph_msg_put(con->in_msg);  		con->in_msg = NULL; +		con->ops->put(con);  	}  	/* Requeue anything that hasn't been acked */  	list_splice_init(&con->out_sent, &con->out_queue); -	/* If there are no messages in the queue, place the connection -	 * in a STANDBY state (i.e., don't try to reconnect just yet). 
*/ -	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { -		dout("fault setting STANDBY\n"); -		set_bit(STANDBY, &con->state); +	/* If there are no messages queued or keepalive pending, place +	 * the connection in a STANDBY state */ +	if (list_empty(&con->out_queue) && +	    !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { +		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); +		con_flag_clear(con, CON_FLAG_WRITE_PENDING); +		con->state = CON_STATE_STANDBY;  	} else {  		/* retry after a delay. */ +		con->state = CON_STATE_PREOPEN;  		if (con->delay == 0)  			con->delay = BASE_DELAY_INTERVAL;  		else if (con->delay < MAX_DELAY_INTERVAL)  			con->delay *= 2; -		dout("fault queueing %p delay %lu\n", con, con->delay); -		con->ops->get(con); -		if (queue_delayed_work(ceph_msgr_wq, &con->work, -				       round_jiffies_relative(con->delay)) == 0) -			con->ops->put(con); -	} - -out_unlock: -	mutex_unlock(&con->mutex); -out: -	/* -	 * in case we faulted due to authentication, invalidate our -	 * current tickets so that we can get new ones. -	 */ -	if (con->auth_retry && con->ops->invalidate_authorizer) { -		dout("calling invalidate_authorizer()\n"); -		con->ops->invalidate_authorizer(con); +		con_flag_set(con, CON_FLAG_BACKOFF); +		queue_con(con);  	} - -	if (con->ops->fault) -		con->ops->fault(con);  }  /* - * create a new messenger instance + * initialize a new messenger instance   */ -struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, -					     u32 supported_features, -					     u32 required_features) +void ceph_messenger_init(struct ceph_messenger *msgr, +			struct ceph_entity_addr *myaddr, +			u64 supported_features, +			u64 required_features, +			bool nocrc)  { -	struct ceph_messenger *msgr; - -	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); -	if (msgr == NULL) -		return ERR_PTR(-ENOMEM); -  	msgr->supported_features = supported_features;  	msgr->required_features = required_features;  	spin_lock_init(&msgr->global_seq_lock); -	/* the zero page is needed if a request is "canceled" while the message -	 * is being written over the socket */ -	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); -	if (!msgr->zero_page) { -		kfree(msgr); -		return ERR_PTR(-ENOMEM); -	} -	kmap(msgr->zero_page); -  	if (myaddr)  		msgr->inst.addr = *myaddr; @@ -2125,42 +2892,49 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,  	msgr->inst.addr.type = 0;  	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));  	encode_my_addr(msgr); +	msgr->nocrc = nocrc; -	dout("messenger_create %p\n", msgr); -	return msgr; +	atomic_set(&msgr->stopping, 0); + +	dout("%s %p\n", __func__, msgr);  } -EXPORT_SYMBOL(ceph_messenger_create); +EXPORT_SYMBOL(ceph_messenger_init); -void ceph_messenger_destroy(struct ceph_messenger *msgr) +static void clear_standby(struct ceph_connection *con)  { -	dout("destroy %p\n", msgr); -	kunmap(msgr->zero_page); -	__free_page(msgr->zero_page); -	kfree(msgr); -	dout("destroyed messenger %p\n", msgr); +	/* come back from STANDBY? */ +	if (con->state == CON_STATE_STANDBY) { +		dout("clear_standby %p and ++connect_seq\n", con); +		con->state = CON_STATE_PREOPEN; +		con->connect_seq++; +		WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); +		WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); +	}  } -EXPORT_SYMBOL(ceph_messenger_destroy);  /*   * Queue up an outgoing message on the given connection.   
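 * The connection takes over the caller's message reference; callers
 * that need to reuse the message afterwards (mon_client, for example)
 * take an extra reference first.  Sketch:
 *
 *	ceph_msg_get(msg);		(optional: keep our own ref)
 *	ceph_con_send(con, msg);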
*/  void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)  { -	if (test_bit(CLOSED, &con->state)) { -		dout("con_send %p closed, dropping %p\n", con, msg); -		ceph_msg_put(msg); -		return; -	} -  	/* set src+dst */  	msg->hdr.src = con->msgr->inst.name; -  	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); -  	msg->needs_out_seq = true; -	/* queue */  	mutex_lock(&con->mutex); + +	if (con->state == CON_STATE_CLOSED) { +		dout("con_send %p closed, dropping %p\n", con, msg); +		ceph_msg_put(msg); +		mutex_unlock(&con->mutex); +		return; +	} + +	BUG_ON(msg->con != NULL); +	msg->con = con->ops->get(con); +	BUG_ON(msg->con == NULL); +  	BUG_ON(!list_empty(&msg->list_head));  	list_add_tail(&msg->list_head, &con->out_queue);  	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, @@ -2169,11 +2943,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)  	     le32_to_cpu(msg->hdr.front_len),  	     le32_to_cpu(msg->hdr.middle_len),  	     le32_to_cpu(msg->hdr.data_len)); + +	clear_standby(con);  	mutex_unlock(&con->mutex);  	/* if there wasn't anything waiting to send before, queue  	 * new work */ -	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) +	if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)  		queue_con(con);  }  EXPORT_SYMBOL(ceph_con_send); @@ -2181,24 +2957,34 @@ EXPORT_SYMBOL(ceph_con_send);  /*   * Revoke a message that was previously queued for send   */ -void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) +void ceph_msg_revoke(struct ceph_msg *msg)  { +	struct ceph_connection *con = msg->con; + +	if (!con) +		return;		/* Message not in our possession */ +  	mutex_lock(&con->mutex);  	if (!list_empty(&msg->list_head)) { -		dout("con_revoke %p msg %p - was on queue\n", con, msg); +		dout("%s %p msg %p - was on queue\n", __func__, con, msg);  		list_del_init(&msg->list_head); -		ceph_msg_put(msg); +		BUG_ON(msg->con == NULL); +		msg->con->ops->put(msg->con); +		msg->con = NULL;  		msg->hdr.seq = 0; + +		ceph_msg_put(msg);  	}  	if (con->out_msg == msg) { -		dout("con_revoke %p msg %p - was sending\n", con, msg); +		dout("%s %p msg %p - was sending\n", __func__, con, msg);  		con->out_msg = NULL;  		if (con->out_kvec_is_msg) {  			con->out_skip = con->out_kvec_bytes;  			con->out_kvec_is_msg = false;  		} -		ceph_msg_put(msg);  		msg->hdr.seq = 0; + +		ceph_msg_put(msg);  	}  	mutex_unlock(&con->mutex);  } @@ -2206,17 +2992,27 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)  /*   * Revoke a message that we may be reading data into   */ -void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) +void ceph_msg_revoke_incoming(struct ceph_msg *msg)  { +	struct ceph_connection *con; + +	BUG_ON(msg == NULL); +	if (!msg->con) { +		dout("%s msg %p null con\n", __func__, msg); + +		return;		/* Message not in our possession */ +	} + +	con = msg->con;  	mutex_lock(&con->mutex); -	if (con->in_msg && con->in_msg == msg) { -		unsigned front_len = le32_to_cpu(con->in_hdr.front_len); -		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len); -		unsigned data_len = le32_to_cpu(con->in_hdr.data_len); +	if (con->in_msg == msg) { +		unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); +		unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); +		unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);  		/* skip rest of message */ -		dout("con_revoke_pages %p msg %p revoked\n", con, msg); -			con->in_base_pos = con->in_base_pos - +		dout("%s %p 
msg %p revoked\n", __func__, con, msg); +		con->in_base_pos = con->in_base_pos -  				sizeof(struct ceph_msg_header) -  				front_len -  				middle_len - @@ -2227,8 +3023,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)  		con->in_tag = CEPH_MSGR_TAG_READY;  		con->in_seq++;  	} else { -		dout("con_revoke_pages %p msg %p pages %p no-op\n", -		     con, con->in_msg, msg); +		dout("%s %p in_msg %p msg %p no-op\n", +		     __func__, con, con->in_msg, msg);  	}  	mutex_unlock(&con->mutex);  } @@ -2238,75 +3034,132 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)   */  void ceph_con_keepalive(struct ceph_connection *con)  { -	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && -	    test_and_set_bit(WRITE_PENDING, &con->state) == 0) +	dout("con_keepalive %p\n", con); +	mutex_lock(&con->mutex); +	clear_standby(con); +	mutex_unlock(&con->mutex); +	if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 && +	    con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)  		queue_con(con);  }  EXPORT_SYMBOL(ceph_con_keepalive); +static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) +{ +	struct ceph_msg_data *data; + +	if (WARN_ON(!ceph_msg_data_type_valid(type))) +		return NULL; + +	data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); +	if (data) +		data->type = type; +	INIT_LIST_HEAD(&data->links); + +	return data; +} + +static void ceph_msg_data_destroy(struct ceph_msg_data *data) +{ +	if (!data) +		return; + +	WARN_ON(!list_empty(&data->links)); +	if (data->type == CEPH_MSG_DATA_PAGELIST) { +		ceph_pagelist_release(data->pagelist); +		kfree(data->pagelist); +	} +	kmem_cache_free(ceph_msg_data_cache, data); +} + +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, +		size_t length, size_t alignment) +{ +	struct ceph_msg_data *data; + +	BUG_ON(!pages); +	BUG_ON(!length); + +	data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); +	BUG_ON(!data); +	data->pages = pages; +	data->length = length; +	data->alignment = alignment & ~PAGE_MASK; + +	list_add_tail(&data->links, &msg->data); +	msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pages); + +void ceph_msg_data_add_pagelist(struct ceph_msg *msg, +				struct ceph_pagelist *pagelist) +{ +	struct ceph_msg_data *data; + +	BUG_ON(!pagelist); +	BUG_ON(!pagelist->length); + +	data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); +	BUG_ON(!data); +	data->pagelist = pagelist; + +	list_add_tail(&data->links, &msg->data); +	msg->data_length += pagelist->length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pagelist); + +#ifdef	CONFIG_BLOCK +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, +		size_t length) +{ +	struct ceph_msg_data *data; + +	BUG_ON(!bio); + +	data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); +	BUG_ON(!data); +	data->bio = bio; +	data->bio_length = length; + +	list_add_tail(&data->links, &msg->data); +	msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_bio); +#endif	/* CONFIG_BLOCK */  /*   * construct a new message with given type, size   * the new msg has a ref count of 1.   
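 * Data payloads are attached separately after allocation, e.g. in
 * sketch form (pages/length/alignment as in ceph_msg_data_add_pages()
 * above):
 *
 *	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
 *	if (msg)
 *		ceph_msg_data_add_pages(msg, pages, length, alignment);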
*/ -struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, +			      bool can_fail)  {  	struct ceph_msg *m; -	m = kmalloc(sizeof(*m), flags); +	m = kmem_cache_zalloc(ceph_msg_cache, flags);  	if (m == NULL)  		goto out; -	kref_init(&m->kref); -	INIT_LIST_HEAD(&m->list_head); -	m->hdr.tid = 0;  	m->hdr.type = cpu_to_le16(type);  	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); -	m->hdr.version = 0;  	m->hdr.front_len = cpu_to_le32(front_len); -	m->hdr.middle_len = 0; -	m->hdr.data_len = 0; -	m->hdr.data_off = 0; -	m->hdr.reserved = 0; -	m->footer.front_crc = 0; -	m->footer.middle_crc = 0; -	m->footer.data_crc = 0; -	m->footer.flags = 0; -	m->front_max = front_len; -	m->front_is_vmalloc = false; -	m->more_to_follow = false; -	m->pool = NULL; + +	INIT_LIST_HEAD(&m->list_head); +	kref_init(&m->kref); +	INIT_LIST_HEAD(&m->data);  	/* front */  	if (front_len) { -		if (front_len > PAGE_CACHE_SIZE) { -			m->front.iov_base = __vmalloc(front_len, flags, -						      PAGE_KERNEL); -			m->front_is_vmalloc = true; -		} else { -			m->front.iov_base = kmalloc(front_len, flags); -		} +		m->front.iov_base = ceph_kvmalloc(front_len, flags);  		if (m->front.iov_base == NULL) { -			pr_err("msg_new can't allocate %d bytes\n", +			dout("ceph_msg_new can't allocate %d bytes\n",  			     front_len);  			goto out2;  		}  	} else {  		m->front.iov_base = NULL;  	} -	m->front.iov_len = front_len; - -	/* middle */ -	m->middle = NULL; - -	/* data */ -	m->nr_pages = 0; -	m->pages = NULL; -	m->pagelist = NULL; -	m->bio = NULL; -	m->bio_iter = NULL; -	m->bio_seg = 0; -	m->trail = NULL; +	m->front_alloc_len = m->front.iov_len = front_len;  	dout("ceph_msg_new %p front %d\n", m, front_len);  	return m; @@ -2314,7 +3167,14 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)  out2:  	ceph_msg_put(m);  out: -	pr_err("msg_new can't create type %d front %d\n", type, front_len); +	if (!can_fail) { +		pr_err("msg_new can't create type %d front %d\n", type, +		       front_len); +		WARN_ON(1); +	} else { +		dout("msg_new can't create type %d front %d\n", type, +		     front_len); +	}  	return NULL;  }  EXPORT_SYMBOL(ceph_msg_new); @@ -2343,45 +3203,66 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)  }  /* - * Generic message allocator, for incoming messages. + * Allocate a message for receiving an incoming message on a + * connection, and save the result in con->in_msg.  Uses the + * connection's private alloc_msg op if available. + * + * Returns 0 on success, or a negative error code. + * + * On success, if we set *skip = 1: + *  - the next message should be skipped and ignored. + *  - con->in_msg == NULL + * or if we set *skip = 0: + *  - con->in_msg is non-null. 
+ * On error (ENOMEM, EAGAIN, ...), + *  - con->in_msg == NULL   */ -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, -				struct ceph_msg_header *hdr, -				int *skip) +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)  { -	int type = le16_to_cpu(hdr->type); -	int front_len = le32_to_cpu(hdr->front_len); +	struct ceph_msg_header *hdr = &con->in_hdr;  	int middle_len = le32_to_cpu(hdr->middle_len); -	struct ceph_msg *msg = NULL; -	int ret; +	struct ceph_msg *msg; +	int ret = 0; -	if (con->ops->alloc_msg) { -		mutex_unlock(&con->mutex); -		msg = con->ops->alloc_msg(con, hdr, skip); -		mutex_lock(&con->mutex); -		if (!msg || *skip) -			return NULL; -	} -	if (!msg) { -		*skip = 0; -		msg = ceph_msg_new(type, front_len, GFP_NOFS); -		if (!msg) { -			pr_err("unable to allocate msg type %d len %d\n", -			       type, front_len); -			return NULL; -		} +	BUG_ON(con->in_msg != NULL); +	BUG_ON(!con->ops->alloc_msg); + +	mutex_unlock(&con->mutex); +	msg = con->ops->alloc_msg(con, hdr, skip); +	mutex_lock(&con->mutex); +	if (con->state != CON_STATE_OPEN) { +		if (msg) +			ceph_msg_put(msg); +		return -EAGAIN;  	} -	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); +	if (msg) { +		BUG_ON(*skip); +		con->in_msg = msg; +		con->in_msg->con = con->ops->get(con); +		BUG_ON(con->in_msg->con == NULL); +	} else { +		/* +		 * Null message pointer means either we should skip +		 * this message or we couldn't allocate memory.  The +		 * former is not an error. +		 */ +		if (*skip) +			return 0; +		con->error_msg = "error allocating memory for incoming message"; + +		return -ENOMEM; +	} +	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); -	if (middle_len && !msg->middle) { -		ret = ceph_alloc_middle(con, msg); +	if (middle_len && !con->in_msg->middle) { +		ret = ceph_alloc_middle(con, con->in_msg);  		if (ret < 0) { -			ceph_msg_put(msg); -			return NULL; +			ceph_msg_put(con->in_msg); +			con->in_msg = NULL;  		}  	} -	return msg; +	return ret;  } @@ -2391,11 +3272,8 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,  void ceph_msg_kfree(struct ceph_msg *m)  {  	dout("msg_kfree %p\n", m); -	if (m->front_is_vmalloc) -		vfree(m->front.iov_base); -	else -		kfree(m->front.iov_base); -	kfree(m); +	ceph_kvfree(m->front.iov_base); +	kmem_cache_free(ceph_msg_cache, m);  }  /* @@ -2404,6 +3282,9 @@ void ceph_msg_kfree(struct ceph_msg *m)  void ceph_msg_last_put(struct kref *kref)  {  	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); +	LIST_HEAD(data); +	struct list_head *links; +	struct list_head *next;  	dout("ceph_msg_put last one on %p\n", m);  	WARN_ON(!list_empty(&m->list_head)); @@ -2413,16 +3294,16 @@ void ceph_msg_last_put(struct kref *kref)  		ceph_buffer_put(m->middle);  		m->middle = NULL;  	} -	m->nr_pages = 0; -	m->pages = NULL; -	if (m->pagelist) { -		ceph_pagelist_release(m->pagelist); -		kfree(m->pagelist); -		m->pagelist = NULL; -	} +	list_splice_init(&m->data, &data); +	list_for_each_safe(links, next, &data) { +		struct ceph_msg_data *data; -	m->trail = NULL; +		data = list_entry(links, struct ceph_msg_data, links); +		list_del_init(links); +		ceph_msg_data_destroy(data); +	} +	m->data_length = 0;  	if (m->pool)  		ceph_msgpool_put(m->pool, m); @@ -2433,8 +3314,8 @@ EXPORT_SYMBOL(ceph_msg_last_put);  void ceph_msg_dump(struct ceph_msg *msg)  { -	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, -		 msg->front_max, msg->nr_pages); +	pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, +		 
msg->front_alloc_len, msg->data_length);
 	print_hex_dump(KERN_DEBUG, "header: ",
 		       DUMP_PREFIX_OFFSET, 16, 1,
 		       &msg->hdr, sizeof(msg->hdr), true);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 8a079399174..067d3af2eaf 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -8,8 +8,8 @@
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
 #include <linux/ceph/decode.h>
-
 #include <linux/ceph/auth.h>
 
 /*
@@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 	monc->pending_auth = 1;
 	monc->m_auth->front.iov_len = len;
 	monc->m_auth->hdr.front_len = cpu_to_le32(len);
-	ceph_con_revoke(monc->con, monc->m_auth);
+	ceph_msg_revoke(monc->m_auth);
 	ceph_msg_get(monc->m_auth);  /* keep our ref */
-	ceph_con_send(monc->con, monc->m_auth);
+	ceph_con_send(&monc->con, monc->m_auth);
 }
 
 /*
@@ -116,14 +116,15 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
  */
 static void __close_session(struct ceph_mon_client *monc)
 {
-	if (monc->con) {
-		dout("__close_session closing mon%d\n", monc->cur_mon);
-		ceph_con_revoke(monc->con, monc->m_auth);
-		ceph_con_close(monc->con);
-		monc->cur_mon = -1;
-		monc->pending_auth = 0;
-		ceph_auth_reset(monc->auth);
-	}
+	dout("__close_session closing mon%d\n", monc->cur_mon);
+	ceph_msg_revoke(monc->m_auth);
+	ceph_msg_revoke_incoming(monc->m_auth_reply);
+	ceph_msg_revoke(monc->m_subscribe);
+	ceph_msg_revoke_incoming(monc->m_subscribe_ack);
+	ceph_con_close(&monc->con);
+	monc->cur_mon = -1;
+	monc->pending_auth = 0;
+	ceph_auth_reset(monc->auth);
 }
 
 /*
@@ -144,15 +145,14 @@ static int __open_session(struct ceph_mon_client *monc)
 		monc->want_next_osdmap = !!monc->want_next_osdmap;
 
 		dout("open_session mon%d opening\n", monc->cur_mon);
-		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
-		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
-		ceph_con_open(monc->con,
+		ceph_con_open(&monc->con,
+			      CEPH_ENTITY_TYPE_MON, monc->cur_mon,
 			      &monc->monmap->mon_inst[monc->cur_mon].addr);
 
 		/* initiate authentication handshake */
 		ret = ceph_auth_build_hello(monc->auth,
 					    monc->m_auth->front.iov_base,
-					    monc->m_auth->front_max);
+					    monc->m_auth->front_alloc_len);
 		__send_prepared_auth_request(monc, ret);
 	} else {
 		dout("open_session mon%d already open\n", monc->cur_mon);
@@ -170,7 +170,7 @@ static bool __sub_expired(struct ceph_mon_client *monc)
  */
 static void __schedule_delayed(struct ceph_mon_client *monc)
 {
-	unsigned delay;
+	unsigned int delay;
 
 	if (monc->cur_mon < 0 || __sub_expired(monc))
 		delay = 10 * HZ;
@@ -186,7 +186,7 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
 static void __send_subscribe(struct ceph_mon_client *monc)
 {
 	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
-	     (unsigned)monc->sub_sent, __sub_expired(monc),
+	     (unsigned int)monc->sub_sent, __sub_expired(monc),
 	     monc->want_next_osdmap);
 	if ((__sub_expired(monc) && !monc->sub_sent) ||
 	    monc->want_next_osdmap == 1) {
@@ -196,14 +196,14 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 		int num;
 
 		p = msg->front.iov_base;
-		end = p + msg->front_max;
+		end = p + msg->front_alloc_len;
 
 		num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
 		ceph_encode_32(&p, num);
 
 		if (monc->want_next_osdmap) {
 			dout("__send_subscribe to 'osdmap' %u\n",
-			     (unsigned)monc->have_osdmap);
+			     (unsigned int)monc->have_osdmap);
 			ceph_encode_string(&p, end, "osdmap", 6);
 			i = p;
 			i->have = cpu_to_le64(monc->have_osdmap);
@@ -213,7 +213,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 		}
 		if (monc->want_mdsmap) {
 			dout("__send_subscribe to 'mdsmap' %u+\n",
-			     (unsigned)monc->have_mdsmap);
+			     (unsigned int)monc->have_mdsmap);
 			ceph_encode_string(&p, end, "mdsmap", 6);
 			i = p;
 			i->have = cpu_to_le64(monc->have_mdsmap);
@@ -228,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 		msg->front.iov_len = p - msg->front.iov_base;
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-		ceph_con_revoke(monc->con, msg);
-		ceph_con_send(monc->con, ceph_msg_get(msg));
+		ceph_msg_revoke(msg);
+		ceph_con_send(&monc->con, ceph_msg_get(msg));
 
 		monc->sub_sent = jiffies | 1;  /* never 0 */
 	}
@@ -238,7 +238,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 				 struct ceph_msg *msg)
 {
-	unsigned seconds;
+	unsigned int seconds;
 	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
 
 	if (msg->front.iov_len < sizeof(*h))
@@ -249,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 	if (monc->hunting) {
 		pr_info("mon%d %s session established\n",
 			monc->cur_mon,
-			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+			ceph_pr_addr(&monc->con.peer_addr.in_addr));
 		monc->hunting = false;
 	}
 	dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -296,21 +296,39 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 		__send_subscribe(monc);
 	mutex_unlock(&monc->mutex);
 }
+EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+
+int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
+			  unsigned long timeout)
+{
+	unsigned long started = jiffies;
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	while (monc->have_osdmap < epoch) {
+		mutex_unlock(&monc->mutex);
+
+		if (timeout != 0 && time_after_eq(jiffies, started + timeout))
+			return -ETIMEDOUT;
+
+		ret = wait_event_interruptible_timeout(monc->client->auth_wq,
+					 monc->have_osdmap >= epoch, timeout);
+		if (ret < 0)
+			return ret;
+
+		mutex_lock(&monc->mutex);
+	}
+
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_monc_wait_osdmap);
 
 /*
  *
  */
 int ceph_monc_open_session(struct ceph_mon_client *monc)
 {
-	if (!monc->con) {
-		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
-		if (!monc->con)
-			return -ENOMEM;
-		ceph_con_init(monc->client->msgr, monc->con);
-		monc->con->private = monc;
-		monc->con->ops = &mon_con_ops;
-	}
-
 	mutex_lock(&monc->mutex);
 	__open_session(monc);
 	__schedule_delayed(monc);
@@ -320,6 +338,17 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
 EXPORT_SYMBOL(ceph_monc_open_session);
 
 /*
+ * We require the fsid and global_id in order to initialize our
+ * debugfs dir.
+ */
+static bool have_debugfs_info(struct ceph_mon_client *monc)
+{
+	dout("have_debugfs_info fsid %d globalid %lld\n",
+	     (int)monc->client->have_fsid, monc->auth->global_id);
+	return monc->client->have_fsid && monc->auth->global_id > 0;
+}
+
+/*
  * The monitor responds with a mount ack to indicate mount success.  The
  * included client ticket allows the client to talk to MDSs and OSDs.
*/ @@ -329,9 +358,12 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,  	struct ceph_client *client = monc->client;  	struct ceph_monmap *monmap = NULL, *old = monc->monmap;  	void *p, *end; +	int had_debugfs_info, init_debugfs = 0;  	mutex_lock(&monc->mutex); +	had_debugfs_info = have_debugfs_info(monc); +  	dout("handle_monmap\n");  	p = msg->front.iov_base;  	end = p + msg->front.iov_len; @@ -351,8 +383,29 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,  	client->monc.monmap = monmap;  	kfree(old); +	if (!client->have_fsid) { +		client->have_fsid = true; +		if (!had_debugfs_info && have_debugfs_info(monc)) { +			pr_info("client%lld fsid %pU\n", +				ceph_client_id(monc->client), +				&monc->client->fsid); +			init_debugfs = 1; +		} +		mutex_unlock(&monc->mutex); + +		if (init_debugfs) { +			/* +			 * do debugfs initialization without mutex to avoid +			 * creating a locking dependency +			 */ +			ceph_debugfs_client_init(monc->client); +		} + +		goto out_unlocked; +	}  out:  	mutex_unlock(&monc->mutex); +out_unlocked:  	wake_up_all(&client->auth_wq);  } @@ -439,6 +492,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,  		m = NULL;  	} else {  		dout("get_generic_reply %lld got %p\n", tid, req->reply); +		*skip = 0;  		m = ceph_msg_get(req->reply);  		/*  		 * we don't need to track the connection reading into @@ -450,18 +504,17 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,  	return m;  } -static int do_generic_request(struct ceph_mon_client *monc, -			      struct ceph_mon_generic_request *req) +static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, +				struct ceph_mon_generic_request *req)  {  	int err;  	/* register request */ -	mutex_lock(&monc->mutex); -	req->tid = ++monc->last_tid; +	req->tid = tid != 0 ? 
tid : ++monc->last_tid;  	req->request->hdr.tid = cpu_to_le64(req->tid);  	__insert_generic_request(monc, req);  	monc->num_generic_requests++; -	ceph_con_send(monc->con, ceph_msg_get(req->request)); +	ceph_con_send(&monc->con, ceph_msg_get(req->request));  	mutex_unlock(&monc->mutex);  	err = wait_for_completion_interruptible(&req->completion); @@ -469,13 +522,24 @@ static int do_generic_request(struct ceph_mon_client *monc,  	mutex_lock(&monc->mutex);  	rb_erase(&req->node, &monc->generic_request_tree);  	monc->num_generic_requests--; -	mutex_unlock(&monc->mutex);  	if (!err)  		err = req->result;  	return err;  } +static int do_generic_request(struct ceph_mon_client *monc, +			      struct ceph_mon_generic_request *req) +{ +	int err; + +	mutex_lock(&monc->mutex); +	err = __do_generic_request(monc, 0, req); +	mutex_unlock(&monc->mutex); + +	return err; +} +  /*   * statfs   */ @@ -528,10 +592,12 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)  	init_completion(&req->completion);  	err = -ENOMEM; -	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); +	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, +				    true);  	if (!req->request)  		goto out; -	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); +	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, +				  true);  	if (!req->reply)  		goto out; @@ -550,6 +616,96 @@ out:  }  EXPORT_SYMBOL(ceph_monc_do_statfs); +static void handle_get_version_reply(struct ceph_mon_client *monc, +				     struct ceph_msg *msg) +{ +	struct ceph_mon_generic_request *req; +	u64 tid = le64_to_cpu(msg->hdr.tid); +	void *p = msg->front.iov_base; +	void *end = p + msg->front_alloc_len; +	u64 handle; + +	dout("%s %p tid %llu\n", __func__, msg, tid); + +	ceph_decode_need(&p, end, 2*sizeof(u64), bad); +	handle = ceph_decode_64(&p); +	if (tid != 0 && tid != handle) +		goto bad; + +	mutex_lock(&monc->mutex); +	req = __lookup_generic_req(monc, handle); +	if (req) { +		*(u64 *)req->buf = ceph_decode_64(&p); +		req->result = 0; +		get_generic_request(req); +	} +	mutex_unlock(&monc->mutex); +	if (req) { +		complete_all(&req->completion); +		put_generic_request(req); +	} + +	return; +bad: +	pr_err("corrupt mon_get_version reply\n"); +	ceph_msg_dump(msg); +} + +/* + * Send MMonGetVersion and wait for the reply. 
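+ *
+ * A minimal usage sketch (illustrative only, not part of this
+ * change; "monc" is assumed to be an initialized ceph_mon_client):
+ *
+ *	u64 newest = 0;
+ *	int err = ceph_monc_do_get_version(monc, "osdmap", &newest);
+ *	if (err == 0)
+ *		dout("newest osdmap epoch %llu\n", newest);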
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
+			     u64 *newest)
+{
+	struct ceph_mon_generic_request *req;
+	void *p, *end;
+	u64 tid;
+	int err;
+
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = newest;
+	req->buf_len = sizeof(*newest);
+	init_completion(&req->completion);
+
+	req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
+				    sizeof(u64) + sizeof(u32) + strlen(what),
+				    GFP_NOFS, true);
+	if (!req->request) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
+				  GFP_NOFS, true);
+	if (!req->reply) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	p = req->request->front.iov_base;
+	end = p + req->request->front_alloc_len;
+
+	/* fill out request */
+	mutex_lock(&monc->mutex);
+	tid = ++monc->last_tid;
+	ceph_encode_64(&p, tid); /* handle */
+	ceph_encode_string(&p, end, what, strlen(what));
+
+	err = __do_generic_request(monc, tid, req);
+
+	mutex_unlock(&monc->mutex);
+out:
+	kref_put(&req->kref, release_generic_request);
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_get_version);
+
 /*
  * pool ops
  */
@@ -608,7 +764,7 @@ bad:
 /*
  * Do a synchronous pool op.
  */
-int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+static int do_poolop(struct ceph_mon_client *monc, u32 op,
 			u32 pool, u64 snapid,
 			char *buf, int len)
 {
@@ -626,10 +782,12 @@ int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
 	init_completion(&req->completion);
 
 	err = -ENOMEM;
-	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
+	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS,
+				    true);
 	if (!req->request)
 		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
+	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS,
+				  true);
 	if (!req->reply)
 		goto out;
 
@@ -656,7 +814,7 @@ out:
 int ceph_monc_create_snapid(struct ceph_mon_client *monc,
 			    u32 pool, u64 *snapid)
 {
-	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
+	return do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
 				   pool, 0, (char *)snapid, sizeof(*snapid));
 }
 
@@ -665,8 +823,8 @@ EXPORT_SYMBOL(ceph_monc_create_snapid);
 
 int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
 			    u32 pool, u64 snapid)
 {
-	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
-				   pool, snapid, 0, 0);
+	return do_poolop(monc,  POOL_OP_DELETE_UNMANAGED_SNAP,
+				   pool, snapid, NULL, 0);
 }
 
 
@@ -680,8 +838,9 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
 
 	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
 		req = rb_entry(p, struct ceph_mon_generic_request, node);
-		ceph_con_revoke(monc->con, req->request);
-		ceph_con_send(monc->con, ceph_msg_get(req->request));
+		ceph_msg_revoke(req->request);
+		ceph_msg_revoke_incoming(req->reply);
+		ceph_con_send(&monc->con, ceph_msg_get(req->request));
 	}
 }
 
@@ -701,11 +860,11 @@ static void delayed_work(struct work_struct *work)
 		__close_session(monc);
 		__open_session(monc);  /* continue hunting */
 	} else {
-		ceph_con_keepalive(monc->con);
+		ceph_con_keepalive(&monc->con);
 
 		__validate_auth(monc);
 
-		if (monc->auth->ops->is_authenticated(monc->auth))
+		if (ceph_auth_is_authenticated(monc->auth))
 			__send_subscribe(monc);
 	}
 	__schedule_delayed(monc);
 }
 
@@ -737,7 +896,6 @@ static int 
build_initial_monmap(struct ceph_mon_client *monc)  		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);  	}  	monc->monmap->num_mon = num_mon; -	monc->have_fsid = false;  	return 0;  } @@ -755,13 +913,14 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)  	if (err)  		goto out; -	monc->con = NULL; - +	/* connection */  	/* authentication */  	monc->auth = ceph_auth_init(cl->options->name, -				    cl->options->secret); -	if (IS_ERR(monc->auth)) -		return PTR_ERR(monc->auth); +				    cl->options->key); +	if (IS_ERR(monc->auth)) { +		err = PTR_ERR(monc->auth); +		goto out_monmap; +	}  	monc->auth->want_keys =  		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |  		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; @@ -770,23 +929,28 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)  	err = -ENOMEM;  	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,  				     sizeof(struct ceph_mon_subscribe_ack), -				     GFP_NOFS); +				     GFP_NOFS, true);  	if (!monc->m_subscribe_ack) -		goto out_monmap; +		goto out_auth; -	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); +	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, +					 true);  	if (!monc->m_subscribe)  		goto out_subscribe_ack; -	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); +	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, +					  true);  	if (!monc->m_auth_reply)  		goto out_subscribe; -	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); +	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);  	monc->pending_auth = 0;  	if (!monc->m_auth)  		goto out_auth_reply; +	ceph_con_init(&monc->con, monc, &mon_con_ops, +		      &monc->client->msgr); +  	monc->cur_mon = -1;  	monc->hunting = true;  	monc->sub_renew_after = jiffies; @@ -808,6 +972,8 @@ out_subscribe:  	ceph_msg_put(monc->m_subscribe);  out_subscribe_ack:  	ceph_msg_put(monc->m_subscribe_ack); +out_auth: +	ceph_auth_destroy(monc->auth);  out_monmap:  	kfree(monc->monmap);  out: @@ -822,13 +988,17 @@ void ceph_monc_stop(struct ceph_mon_client *monc)  	mutex_lock(&monc->mutex);  	__close_session(monc); -	if (monc->con) { -		monc->con->private = NULL; -		monc->con->ops->put(monc->con); -		monc->con = NULL; -	} +  	mutex_unlock(&monc->mutex); +	/* +	 * flush msgr queue before we destroy ourselves to ensure that: +	 *  - any work that references our embedded con is finished. +	 *  - any osd_client or other work that may reference an authorizer +	 *    finishes before we shut down the auth subsystem. 
+	 */
+	ceph_msgr_flush();
+
 	ceph_auth_destroy(monc->auth);
 
 	ceph_msg_put(monc->m_auth);
@@ -845,31 +1015,47 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 {
 	int ret;
 	int was_auth = 0;
+	int had_debugfs_info, init_debugfs = 0;
 
 	mutex_lock(&monc->mutex);
-	if (monc->auth->ops)
-		was_auth = monc->auth->ops->is_authenticated(monc->auth);
+	had_debugfs_info = have_debugfs_info(monc);
+	was_auth = ceph_auth_is_authenticated(monc->auth);
 	monc->pending_auth = 0;
 	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
 				     msg->front.iov_len,
 				     monc->m_auth->front.iov_base,
-				     monc->m_auth->front_max);
+				     monc->m_auth->front_alloc_len);
 	if (ret < 0) {
 		monc->client->auth_err = ret;
 		wake_up_all(&monc->client->auth_wq);
 	} else if (ret > 0) {
 		__send_prepared_auth_request(monc, ret);
-	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
+	} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
 		dout("authenticated, starting session\n");
 
-		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-		monc->client->msgr->inst.name.num =
+		monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr.inst.name.num =
 					cpu_to_le64(monc->auth->global_id);
 
 		__send_subscribe(monc);
 		__resend_generic_request(monc);
 	}
+
+	if (!had_debugfs_info && have_debugfs_info(monc)) {
+		pr_info("client%lld fsid %pU\n",
+			ceph_client_id(monc->client),
+			&monc->client->fsid);
+		init_debugfs = 1;
+	}
 	mutex_unlock(&monc->mutex);
+
+	if (init_debugfs) {
+		/*
+		 * do debugfs initialization without mutex to avoid
+		 * creating a locking dependency
+		 */
+		ceph_debugfs_client_init(monc->client);
+	}
 }
 
 static int __validate_auth(struct ceph_mon_client *monc)
@@ -880,7 +1066,7 @@ static int __validate_auth(struct ceph_mon_client *monc)
 		return 0;
 
 	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
-			      monc->m_auth->front_max);
+			      monc->m_auth->front_alloc_len);
 	if (ret <= 0)
 		return ret; /* either an error, or no need to authenticate */
 	__send_prepared_auth_request(monc, ret);
@@ -922,6 +1108,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		handle_statfs_reply(monc, msg);
 		break;
 
+	case CEPH_MSG_MON_GET_VERSION_REPLY:
+		handle_get_version_reply(monc, msg);
+		break;
+
 	case CEPH_MSG_POOLOP_REPLY:
 		handle_poolop_reply(monc, msg);
 		break;
@@ -970,10 +1160,21 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	case CEPH_MSG_AUTH_REPLY:
 		m = ceph_msg_get(monc->m_auth_reply);
 		break;
+	case CEPH_MSG_MON_GET_VERSION_REPLY:
+		if (le64_to_cpu(hdr->tid) != 0)
+			return get_generic_reply(con, hdr, skip);
+
+		/*
+		 * Older OSDs don't set reply tid even if the original
+		 * request had a non-zero tid.  Work around this weirdness
+		 * by falling through to the allocate case.
+		 */  	case CEPH_MSG_MON_MAP:  	case CEPH_MSG_MDS_MAP:  	case CEPH_MSG_OSD_MAP: -		m = ceph_msg_new(type, front_len, GFP_NOFS); +		m = ceph_msg_new(type, front_len, GFP_NOFS, false); +		if (!m) +			return NULL;	/* ENOMEM--return skip == 0 */  		break;  	} @@ -1000,10 +1201,10 @@ static void mon_fault(struct ceph_connection *con)  	if (!con->private)  		goto out; -	if (monc->con && !monc->hunting) +	if (!monc->hunting)  		pr_info("mon%d %s session lost, "  			"hunting for new mon\n", monc->cur_mon, -			ceph_pr_addr(&monc->con->peer_addr.in_addr)); +			ceph_pr_addr(&monc->con.peer_addr.in_addr));  	__close_session(monc);  	if (!monc->hunting) { @@ -1018,9 +1219,23 @@ out:  	mutex_unlock(&monc->mutex);  } +/* + * We can ignore refcounting on the connection struct, as all references + * will come from the messenger workqueue, which is drained prior to + * mon_client destruction. + */ +static struct ceph_connection *con_get(struct ceph_connection *con) +{ +	return con; +} + +static void con_put(struct ceph_connection *con) +{ +} +  static const struct ceph_connection_operations mon_con_ops = { -	.get = ceph_con_get, -	.put = ceph_con_put, +	.get = con_get, +	.put = con_put,  	.dispatch = dispatch,  	.fault = mon_fault,  	.alloc_msg = mon_alloc_msg, diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index d5f2d97ac05..ddec1c10ac8 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -7,27 +7,38 @@  #include <linux/ceph/msgpool.h> -static void *alloc_fn(gfp_t gfp_mask, void *arg) +static void *msgpool_alloc(gfp_t gfp_mask, void *arg)  {  	struct ceph_msgpool *pool = arg; -	void *p; +	struct ceph_msg *msg; -	p = ceph_msg_new(0, pool->front_len, gfp_mask); -	if (!p) -		pr_err("msgpool %s alloc failed\n", pool->name); -	return p; +	msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); +	if (!msg) { +		dout("msgpool_alloc %s failed\n", pool->name); +	} else { +		dout("msgpool_alloc %s %p\n", pool->name, msg); +		msg->pool = pool; +	} +	return msg;  } -static void free_fn(void *element, void *arg) +static void msgpool_free(void *element, void *arg)  { -	ceph_msg_put(element); +	struct ceph_msgpool *pool = arg; +	struct ceph_msg *msg = element; + +	dout("msgpool_release %s %p\n", pool->name, msg); +	msg->pool = NULL; +	ceph_msg_put(msg);  } -int ceph_msgpool_init(struct ceph_msgpool *pool, +int ceph_msgpool_init(struct ceph_msgpool *pool, int type,  		      int front_len, int size, bool blocking, const char *name)  { +	dout("msgpool %s init\n", name); +	pool->type = type;  	pool->front_len = front_len; -	pool->pool = mempool_create(size, alloc_fn, free_fn, pool); +	pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);  	if (!pool->pool)  		return -ENOMEM;  	pool->name = name; @@ -36,29 +47,37 @@ int ceph_msgpool_init(struct ceph_msgpool *pool,  void ceph_msgpool_destroy(struct ceph_msgpool *pool)  { +	dout("msgpool %s destroy\n", pool->name);  	mempool_destroy(pool->pool);  }  struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,  				  int front_len)  { +	struct ceph_msg *msg; +  	if (front_len > pool->front_len) { -		pr_err("msgpool_get pool %s need front %d, pool size is %d\n", +		dout("msgpool_get %s need front %d, pool size is %d\n",  		       pool->name, front_len, pool->front_len);  		WARN_ON(1);  		/* try to alloc a fresh message */ -		return ceph_msg_new(0, front_len, GFP_NOFS); +		return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);  	} -	return mempool_alloc(pool->pool, GFP_NOFS); +	msg = mempool_alloc(pool->pool, GFP_NOFS); +	
dout("msgpool_get %s %p\n", pool->name, msg); +	return msg;  }  void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)  { +	dout("msgpool_put %s %p\n", pool->name, msg); +  	/* reset msg front_len; user may have changed it */  	msg->front.iov_len = pool->front_len;  	msg->hdr.front_len = cpu_to_le32(pool->front_len);  	kref_init(&msg->kref);  /* retake single ref */ +	mempool_free(msg, pool->pool);  } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 79391994b3e..05be0c18169 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,3 +1,4 @@ +  #include <linux/ceph/ceph_debug.h>  #include <linux/module.h> @@ -21,64 +22,18 @@  #define OSD_OP_FRONT_LEN	4096  #define OSD_OPREPLY_FRONT_LEN	512 -static const struct ceph_connection_operations osd_con_ops; -static int __kick_requests(struct ceph_osd_client *osdc, -			  struct ceph_osd *kickosd); - -static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); - -static int op_needs_trail(int op) -{ -	switch (op) { -	case CEPH_OSD_OP_GETXATTR: -	case CEPH_OSD_OP_SETXATTR: -	case CEPH_OSD_OP_CMPXATTR: -	case CEPH_OSD_OP_CALL: -		return 1; -	default: -		return 0; -	} -} - -static int op_has_extent(int op) -{ -	return (op == CEPH_OSD_OP_READ || -		op == CEPH_OSD_OP_WRITE); -} - -void ceph_calc_raw_layout(struct ceph_osd_client *osdc, -			struct ceph_file_layout *layout, -			u64 snapid, -			u64 off, u64 *plen, u64 *bno, -			struct ceph_osd_request *req, -			struct ceph_osd_req_op *op) -{ -	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; -	u64 orig_len = *plen; -	u64 objoff, objlen;    /* extent in object */ - -	reqhead->snapid = cpu_to_le64(snapid); - -	/* object extent? */ -	ceph_calc_file_object_mapping(layout, off, plen, bno, -				      &objoff, &objlen); -	if (*plen < orig_len) -		dout(" skipping last %llu, final file extent %llu~%llu\n", -		     orig_len - *plen, off, *plen); - -	if (op_has_extent(op->op)) { -		op->extent.offset = objoff; -		op->extent.length = objlen; -	} -	req->r_num_pages = calc_pages_for(off, *plen); -	if (op->op == CEPH_OSD_OP_WRITE) -		op->payload_len = *plen; +static struct kmem_cache	*ceph_osd_request_cache; -	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", -	     *bno, objoff, objlen, req->r_num_pages); +static const struct ceph_connection_operations osd_con_ops; -} -EXPORT_SYMBOL(ceph_calc_raw_layout); +static void __send_queued(struct ceph_osd_client *osdc); +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void __register_request(struct ceph_osd_client *osdc, +			       struct ceph_osd_request *req); +static void __unregister_linger_request(struct ceph_osd_client *osdc, +					struct ceph_osd_request *req); +static void __send_request(struct ceph_osd_client *osdc, +			   struct ceph_osd_request *req);  /*   * Implement client access to distributed object storage cluster. @@ -105,20 +60,238 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);   *   * fill osd op in request message.   */ -static void calc_layout(struct ceph_osd_client *osdc, -			struct ceph_vino vino, -			struct ceph_file_layout *layout, -			u64 off, u64 *plen, -			struct ceph_osd_request *req, -			struct ceph_osd_req_op *op) +static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, +			u64 *objnum, u64 *objoff, u64 *objlen) +{ +	u64 orig_len = *plen; +	int r; + +	/* object extent? 
*/ +	r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum, +					  objoff, objlen); +	if (r < 0) +		return r; +	if (*objlen < orig_len) { +		*plen = *objlen; +		dout(" skipping last %llu, final file extent %llu~%llu\n", +		     orig_len - *plen, off, *plen); +	} + +	dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); + +	return 0; +} + +static void ceph_osd_data_init(struct ceph_osd_data *osd_data) +{ +	memset(osd_data, 0, sizeof (*osd_data)); +	osd_data->type = CEPH_OSD_DATA_TYPE_NONE; +} + +static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, +			struct page **pages, u64 length, u32 alignment, +			bool pages_from_pool, bool own_pages) +{ +	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; +	osd_data->pages = pages; +	osd_data->length = length; +	osd_data->alignment = alignment; +	osd_data->pages_from_pool = pages_from_pool; +	osd_data->own_pages = own_pages; +} + +static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, +			struct ceph_pagelist *pagelist) +{ +	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; +	osd_data->pagelist = pagelist; +} + +#ifdef CONFIG_BLOCK +static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, +			struct bio *bio, size_t bio_length) +{ +	osd_data->type = CEPH_OSD_DATA_TYPE_BIO; +	osd_data->bio = bio; +	osd_data->bio_length = bio_length; +} +#endif /* CONFIG_BLOCK */ + +#define osd_req_op_data(oreq, whch, typ, fld)	\ +	({						\ +		BUG_ON(whch >= (oreq)->r_num_ops);	\ +		&(oreq)->r_ops[whch].typ.fld;		\ +	}) + +static struct ceph_osd_data * +osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) +{ +	BUG_ON(which >= osd_req->r_num_ops); + +	return &osd_req->r_ops[which].raw_data_in; +} + +struct ceph_osd_data * +osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, +			unsigned int which) +{ +	return osd_req_op_data(osd_req, which, extent, osd_data); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + +struct ceph_osd_data * +osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, +			unsigned int which) +{ +	return osd_req_op_data(osd_req, which, cls, response_data); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data);	/* ??? 
*/ + +void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, +			unsigned int which, struct page **pages, +			u64 length, u32 alignment, +			bool pages_from_pool, bool own_pages) +{ +	struct ceph_osd_data *osd_data; + +	osd_data = osd_req_op_raw_data_in(osd_req, which); +	ceph_osd_data_pages_init(osd_data, pages, length, alignment, +				pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_raw_data_in_pages); + +void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, +			unsigned int which, struct page **pages, +			u64 length, u32 alignment, +			bool pages_from_pool, bool own_pages) +{ +	struct ceph_osd_data *osd_data; + +	osd_data = osd_req_op_data(osd_req, which, extent, osd_data); +	ceph_osd_data_pages_init(osd_data, pages, length, alignment, +				pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); + +void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, +			unsigned int which, struct ceph_pagelist *pagelist) +{ +	struct ceph_osd_data *osd_data; + +	osd_data = osd_req_op_data(osd_req, which, extent, osd_data); +	ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); + +#ifdef CONFIG_BLOCK +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, +			unsigned int which, struct bio *bio, size_t bio_length) +{ +	struct ceph_osd_data *osd_data; + +	osd_data = osd_req_op_data(osd_req, which, extent, osd_data); +	ceph_osd_data_bio_init(osd_data, bio, bio_length); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); +#endif /* CONFIG_BLOCK */ + +static void osd_req_op_cls_request_info_pagelist( +			struct ceph_osd_request *osd_req, +			unsigned int which, struct ceph_pagelist *pagelist) +{ +	struct ceph_osd_data *osd_data; + +	osd_data = osd_req_op_data(osd_req, which, cls, request_info); +	ceph_osd_data_pagelist_init(osd_data, pagelist); +} + +void osd_req_op_cls_request_data_pagelist( +			struct ceph_osd_request *osd_req, +			unsigned int which, struct ceph_pagelist *pagelist) +{ +	struct ceph_osd_data *osd_data; + +	osd_data = osd_req_op_data(osd_req, which, cls, request_data); +	ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); + +void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, +			unsigned int which, struct page **pages, u64 length, +			u32 alignment, bool pages_from_pool, bool own_pages) +{ +	struct ceph_osd_data *osd_data; + +	osd_data = osd_req_op_data(osd_req, which, cls, request_data); +	ceph_osd_data_pages_init(osd_data, pages, length, alignment, +				pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); + +void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, +			unsigned int which, struct page **pages, u64 length, +			u32 alignment, bool pages_from_pool, bool own_pages)  { -	u64 bno; +	struct ceph_osd_data *osd_data; + +	osd_data = osd_req_op_data(osd_req, which, cls, response_data); +	ceph_osd_data_pages_init(osd_data, pages, length, alignment, +				pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); -	ceph_calc_raw_layout(osdc, layout, vino.snap, off, -			     plen, &bno, req, op); +static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) +{ +	switch (osd_data->type) { +	case CEPH_OSD_DATA_TYPE_NONE: +		return 0; +	case CEPH_OSD_DATA_TYPE_PAGES: +		return osd_data->length; +	case CEPH_OSD_DATA_TYPE_PAGELIST: +		return (u64)osd_data->pagelist->length; +#ifdef CONFIG_BLOCK 
+	case CEPH_OSD_DATA_TYPE_BIO: +		return (u64)osd_data->bio_length; +#endif /* CONFIG_BLOCK */ +	default: +		WARN(true, "unrecognized data type %d\n", (int)osd_data->type); +		return 0; +	} +} -	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); -	req->r_oid_len = strlen(req->r_oid); +static void ceph_osd_data_release(struct ceph_osd_data *osd_data) +{ +	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) { +		int num_pages; + +		num_pages = calc_pages_for((u64)osd_data->alignment, +						(u64)osd_data->length); +		ceph_release_page_vector(osd_data->pages, num_pages); +	} +	ceph_osd_data_init(osd_data); +} + +static void osd_req_op_data_release(struct ceph_osd_request *osd_req, +			unsigned int which) +{ +	struct ceph_osd_req_op *op; + +	BUG_ON(which >= osd_req->r_num_ops); +	op = &osd_req->r_ops[which]; + +	switch (op->op) { +	case CEPH_OSD_OP_READ: +	case CEPH_OSD_OP_WRITE: +		ceph_osd_data_release(&op->extent.osd_data); +		break; +	case CEPH_OSD_OP_CALL: +		ceph_osd_data_release(&op->cls.request_info); +		ceph_osd_data_release(&op->cls.request_data); +		ceph_osd_data_release(&op->cls.response_data); +		break; +	default: +		break; +	}  }  /* @@ -126,277 +299,429 @@ static void calc_layout(struct ceph_osd_client *osdc,   */  void ceph_osdc_release_request(struct kref *kref)  { -	struct ceph_osd_request *req = container_of(kref, -						    struct ceph_osd_request, -						    r_kref); +	struct ceph_osd_request *req; +	unsigned int which; +	req = container_of(kref, struct ceph_osd_request, r_kref);  	if (req->r_request)  		ceph_msg_put(req->r_request); -	if (req->r_reply) +	if (req->r_reply) { +		ceph_msg_revoke_incoming(req->r_reply);  		ceph_msg_put(req->r_reply); -	if (req->r_con_filling_msg) { -		dout("release_request revoking pages %p from con %p\n", -		     req->r_pages, req->r_con_filling_msg); -		ceph_con_revoke_message(req->r_con_filling_msg, -				      req->r_reply); -		ceph_con_put(req->r_con_filling_msg); -	} -	if (req->r_own_pages) -		ceph_release_page_vector(req->r_pages, -					 req->r_num_pages); -#ifdef CONFIG_BLOCK -	if (req->r_bio) -		bio_put(req->r_bio); -#endif -	ceph_put_snap_context(req->r_snapc); -	if (req->r_trail) { -		ceph_pagelist_release(req->r_trail); -		kfree(req->r_trail);  	} + +	for (which = 0; which < req->r_num_ops; which++) +		osd_req_op_data_release(req, which); + +	ceph_put_snap_context(req->r_snapc);  	if (req->r_mempool)  		mempool_free(req, req->r_osdc->req_mempool);  	else -		kfree(req); -} -EXPORT_SYMBOL(ceph_osdc_release_request); - -static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail) -{ -	int i = 0; +		kmem_cache_free(ceph_osd_request_cache, req); -	if (needs_trail) -		*needs_trail = 0; -	while (ops[i].op) { -		if (needs_trail && op_needs_trail(ops[i].op)) -			*needs_trail = 1; -		i++; -	} - -	return i;  } +EXPORT_SYMBOL(ceph_osdc_release_request);  struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, -					       int flags,  					       struct ceph_snap_context *snapc, -					       struct ceph_osd_req_op *ops, +					       unsigned int num_ops,  					       bool use_mempool, -					       gfp_t gfp_flags, -					       struct page **pages, -					       struct bio *bio) +					       gfp_t gfp_flags)  {  	struct ceph_osd_request *req;  	struct ceph_msg *msg; -	int needs_trail; -	int num_op = get_num_ops(ops, &needs_trail); -	size_t msg_size = sizeof(struct ceph_osd_request_head); +	size_t msg_size; -	msg_size += num_op*sizeof(struct ceph_osd_op); +	BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); 
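+	/*
+	 * Editor's note (assumed from the encoding): the op vector is
+	 * preceded by a 16-bit count on the wire -- the "2 +" in the
+	 * msg_size arithmetic below -- hence the U16_MAX check above
+	 * and the bound check that follows.
+	 */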
+	BUG_ON(num_ops > CEPH_OSD_MAX_OP); + +	msg_size = 4 + 4 + 8 + 8 + 4+8; +	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ +	msg_size += 1 + 8 + 4 + 4;     /* pg_t */ +	msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ +	msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); +	msg_size += 8;  /* snapid */ +	msg_size += 8;  /* snap_seq */ +	msg_size += 8 * (snapc ? snapc->num_snaps : 0);  /* snaps */ +	msg_size += 4;  	if (use_mempool) {  		req = mempool_alloc(osdc->req_mempool, gfp_flags);  		memset(req, 0, sizeof(*req));  	} else { -		req = kzalloc(sizeof(*req), gfp_flags); +		req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);  	}  	if (req == NULL)  		return NULL;  	req->r_osdc = osdc;  	req->r_mempool = use_mempool; +	req->r_num_ops = num_ops;  	kref_init(&req->r_kref);  	init_completion(&req->r_completion);  	init_completion(&req->r_safe_completion); +	RB_CLEAR_NODE(&req->r_node);  	INIT_LIST_HEAD(&req->r_unsafe_item); -	req->r_flags = flags; +	INIT_LIST_HEAD(&req->r_linger_item); +	INIT_LIST_HEAD(&req->r_linger_osd); +	INIT_LIST_HEAD(&req->r_req_lru_item); +	INIT_LIST_HEAD(&req->r_osd_item); -	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); +	req->r_base_oloc.pool = -1; +	req->r_target_oloc.pool = -1;  	/* create reply message */  	if (use_mempool)  		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);  	else  		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, -				   OSD_OPREPLY_FRONT_LEN, gfp_flags); +				   OSD_OPREPLY_FRONT_LEN, gfp_flags, true);  	if (!msg) {  		ceph_osdc_put_request(req);  		return NULL;  	}  	req->r_reply = msg; -	/* allocate space for the trailing data */ -	if (needs_trail) { -		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags); -		if (!req->r_trail) { -			ceph_osdc_put_request(req); -			return NULL; -		} -		ceph_pagelist_init(req->r_trail); -	}  	/* create request message; allow space for oid */ -	msg_size += 40; -	if (snapc) -		msg_size += sizeof(u64) * snapc->num_snaps;  	if (use_mempool)  		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);  	else -		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); +		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);  	if (!msg) {  		ceph_osdc_put_request(req);  		return NULL;  	} -	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);  	memset(msg->front.iov_base, 0, msg->front.iov_len);  	req->r_request = msg; -	req->r_pages = pages; -#ifdef CONFIG_BLOCK -	if (bio) { -		req->r_bio = bio; -		bio_get(req->r_bio); -	} -#endif  	return req;  }  EXPORT_SYMBOL(ceph_osdc_alloc_request); -static void osd_req_encode_op(struct ceph_osd_request *req, -			      struct ceph_osd_op *dst, -			      struct ceph_osd_req_op *src) +static bool osd_req_opcode_valid(u16 opcode)  { -	dst->op = cpu_to_le16(src->op); - -	switch (dst->op) { +	switch (opcode) {  	case CEPH_OSD_OP_READ: +	case CEPH_OSD_OP_STAT: +	case CEPH_OSD_OP_MAPEXT: +	case CEPH_OSD_OP_MASKTRUNC: +	case CEPH_OSD_OP_SPARSE_READ: +	case CEPH_OSD_OP_NOTIFY: +	case CEPH_OSD_OP_NOTIFY_ACK: +	case CEPH_OSD_OP_ASSERT_VER:  	case CEPH_OSD_OP_WRITE: -		dst->extent.offset = -			cpu_to_le64(src->extent.offset); -		dst->extent.length = -			cpu_to_le64(src->extent.length); -		dst->extent.truncate_size = -			cpu_to_le64(src->extent.truncate_size); -		dst->extent.truncate_seq = -			cpu_to_le32(src->extent.truncate_seq); -		break; - +	case CEPH_OSD_OP_WRITEFULL: +	case CEPH_OSD_OP_TRUNCATE: +	case CEPH_OSD_OP_ZERO: +	case CEPH_OSD_OP_DELETE: +	case CEPH_OSD_OP_APPEND: +	case CEPH_OSD_OP_STARTSYNC: +	case CEPH_OSD_OP_SETTRUNC: +	case CEPH_OSD_OP_TRIMTRUNC: +	
case CEPH_OSD_OP_TMAPUP: +	case CEPH_OSD_OP_TMAPPUT: +	case CEPH_OSD_OP_TMAPGET: +	case CEPH_OSD_OP_CREATE: +	case CEPH_OSD_OP_ROLLBACK: +	case CEPH_OSD_OP_WATCH: +	case CEPH_OSD_OP_OMAPGETKEYS: +	case CEPH_OSD_OP_OMAPGETVALS: +	case CEPH_OSD_OP_OMAPGETHEADER: +	case CEPH_OSD_OP_OMAPGETVALSBYKEYS: +	case CEPH_OSD_OP_OMAPSETVALS: +	case CEPH_OSD_OP_OMAPSETHEADER: +	case CEPH_OSD_OP_OMAPCLEAR: +	case CEPH_OSD_OP_OMAPRMKEYS: +	case CEPH_OSD_OP_OMAP_CMP: +	case CEPH_OSD_OP_SETALLOCHINT: +	case CEPH_OSD_OP_CLONERANGE: +	case CEPH_OSD_OP_ASSERT_SRC_VERSION: +	case CEPH_OSD_OP_SRC_CMPXATTR:  	case CEPH_OSD_OP_GETXATTR: -	case CEPH_OSD_OP_SETXATTR: +	case CEPH_OSD_OP_GETXATTRS:  	case CEPH_OSD_OP_CMPXATTR: -		BUG_ON(!req->r_trail); - -		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); -		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); -		dst->xattr.cmp_op = src->xattr.cmp_op; -		dst->xattr.cmp_mode = src->xattr.cmp_mode; -		ceph_pagelist_append(req->r_trail, src->xattr.name, -				     src->xattr.name_len); -		ceph_pagelist_append(req->r_trail, src->xattr.val, -				     src->xattr.value_len); -		break; +	case CEPH_OSD_OP_SETXATTR: +	case CEPH_OSD_OP_SETXATTRS: +	case CEPH_OSD_OP_RESETXATTRS: +	case CEPH_OSD_OP_RMXATTR: +	case CEPH_OSD_OP_PULL: +	case CEPH_OSD_OP_PUSH: +	case CEPH_OSD_OP_BALANCEREADS: +	case CEPH_OSD_OP_UNBALANCEREADS: +	case CEPH_OSD_OP_SCRUB: +	case CEPH_OSD_OP_SCRUB_RESERVE: +	case CEPH_OSD_OP_SCRUB_UNRESERVE: +	case CEPH_OSD_OP_SCRUB_STOP: +	case CEPH_OSD_OP_SCRUB_MAP: +	case CEPH_OSD_OP_WRLOCK: +	case CEPH_OSD_OP_WRUNLOCK: +	case CEPH_OSD_OP_RDLOCK: +	case CEPH_OSD_OP_RDUNLOCK: +	case CEPH_OSD_OP_UPLOCK: +	case CEPH_OSD_OP_DNLOCK:  	case CEPH_OSD_OP_CALL: -		BUG_ON(!req->r_trail); - -		dst->cls.class_len = src->cls.class_len; -		dst->cls.method_len = src->cls.method_len; -		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); - -		ceph_pagelist_append(req->r_trail, src->cls.class_name, -				     src->cls.class_len); -		ceph_pagelist_append(req->r_trail, src->cls.method_name, -				     src->cls.method_len); -		ceph_pagelist_append(req->r_trail, src->cls.indata, -				     src->cls.indata_len); -		break; -	case CEPH_OSD_OP_ROLLBACK: -		dst->snap.snapid = cpu_to_le64(src->snap.snapid); -		break; -	case CEPH_OSD_OP_STARTSYNC: -		break; +	case CEPH_OSD_OP_PGLS: +	case CEPH_OSD_OP_PGLS_FILTER: +		return true;  	default: -		pr_err("unrecognized osd opcode %d\n", dst->op); -		WARN_ON(1); -		break; +		return false;  	} -	dst->payload_len = cpu_to_le32(src->payload_len);  }  /* - * build new request AND message - * + * This is an osd op init function for opcodes that have no data or + * other information associated with them.  It also serves as a + * common init routine for all the other init functions, below.   
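+ *
+ * For example (an illustrative sketch, mirroring how
+ * ceph_osdc_new_request() below uses these helpers), a write
+ * followed by a startsync would be set up as:
+ *
+ *	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_WRITE, objoff, objlen,
+ *			       truncate_size, truncate_seq);
+ *	osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);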
*/ -void ceph_osdc_build_request(struct ceph_osd_request *req, -			     u64 off, u64 *plen, -			     struct ceph_osd_req_op *src_ops, -			     struct ceph_snap_context *snapc, -			     struct timespec *mtime, -			     const char *oid, -			     int oid_len) +static struct ceph_osd_req_op * +_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, +				u16 opcode)  { -	struct ceph_msg *msg = req->r_request; -	struct ceph_osd_request_head *head; -	struct ceph_osd_req_op *src_op; -	struct ceph_osd_op *op; -	void *p; -	int num_op = get_num_ops(src_ops, NULL); -	size_t msg_size = sizeof(*head) + num_op*sizeof(*op); -	int flags = req->r_flags; -	u64 data_len = 0; -	int i; +	struct ceph_osd_req_op *op; -	head = msg->front.iov_base; -	op = (void *)(head + 1); -	p = (void *)(op + num_op); +	BUG_ON(which >= osd_req->r_num_ops); +	BUG_ON(!osd_req_opcode_valid(opcode)); -	req->r_snapc = ceph_get_snap_context(snapc); +	op = &osd_req->r_ops[which]; +	memset(op, 0, sizeof (*op)); +	op->op = opcode; + +	return op; +} + +void osd_req_op_init(struct ceph_osd_request *osd_req, +				unsigned int which, u16 opcode) +{ +	(void)_osd_req_op_init(osd_req, which, opcode); +} +EXPORT_SYMBOL(osd_req_op_init); + +void osd_req_op_extent_init(struct ceph_osd_request *osd_req, +				unsigned int which, u16 opcode, +				u64 offset, u64 length, +				u64 truncate_size, u32 truncate_seq) +{ +	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); +	size_t payload_len = 0; + +	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && +	       opcode != CEPH_OSD_OP_DELETE && opcode != CEPH_OSD_OP_ZERO && +	       opcode != CEPH_OSD_OP_TRUNCATE); + +	op->extent.offset = offset; +	op->extent.length = length; +	op->extent.truncate_size = truncate_size; +	op->extent.truncate_seq = truncate_seq; +	if (opcode == CEPH_OSD_OP_WRITE) +		payload_len += length; + +	op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_extent_init); + +void osd_req_op_extent_update(struct ceph_osd_request *osd_req, +				unsigned int which, u64 length) +{ +	struct ceph_osd_req_op *op; +	u64 previous; + +	BUG_ON(which >= osd_req->r_num_ops); +	op = &osd_req->r_ops[which]; +	previous = op->extent.length; + +	if (length == previous) +		return;		/* Nothing to do */ +	BUG_ON(length > previous); + +	op->extent.length = length; +	op->payload_len -= previous - length; +} +EXPORT_SYMBOL(osd_req_op_extent_update); + +void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, +			u16 opcode, const char *class, const char *method) +{ +	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); +	struct ceph_pagelist *pagelist; +	size_t payload_len = 0; +	size_t size; + +	BUG_ON(opcode != CEPH_OSD_OP_CALL); + +	pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); +	BUG_ON(!pagelist); +	ceph_pagelist_init(pagelist); -	head->client_inc = cpu_to_le32(1); /* always, for now. 
*/ -	head->flags = cpu_to_le32(flags); -	if (flags & CEPH_OSD_FLAG_WRITE) -		ceph_encode_timespec(&head->mtime, mtime); -	head->num_ops = cpu_to_le16(num_op); +	op->cls.class_name = class; +	size = strlen(class); +	BUG_ON(size > (size_t) U8_MAX); +	op->cls.class_len = size; +	ceph_pagelist_append(pagelist, class, size); +	payload_len += size; +	op->cls.method_name = method; +	size = strlen(method); +	BUG_ON(size > (size_t) U8_MAX); +	op->cls.method_len = size; +	ceph_pagelist_append(pagelist, method, size); +	payload_len += size; -	/* fill in oid */ -	head->object_len = cpu_to_le32(oid_len); -	memcpy(p, oid, oid_len); -	p += oid_len; +	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); + +	op->cls.argc = 0;	/* currently unused */ + +	op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_cls_init); + +void osd_req_op_watch_init(struct ceph_osd_request *osd_req, +				unsigned int which, u16 opcode, +				u64 cookie, u64 version, int flag) +{ +	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + +	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + +	op->watch.cookie = cookie; +	op->watch.ver = version; +	if (opcode == CEPH_OSD_OP_WATCH && flag) +		op->watch.flag = (u8)1; +} +EXPORT_SYMBOL(osd_req_op_watch_init); + +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, +				unsigned int which, +				u64 expected_object_size, +				u64 expected_write_size) +{ +	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, +						      CEPH_OSD_OP_SETALLOCHINT); + +	op->alloc_hint.expected_object_size = expected_object_size; +	op->alloc_hint.expected_write_size = expected_write_size; + +	/* +	 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed +	 * not worth a feature bit.  Set FAILOK per-op flag to make +	 * sure older osds don't trip over an unsupported opcode. 
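+	 *
+	 * Illustrative use (not part of this change): hinting 4 MB
+	 * objects written in 4 MB chunks would be
+	 *
+	 *	osd_req_op_alloc_hint_init(req, 0, 1 << 22, 1 << 22);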
+	 */ +	op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); -	src_op = src_ops; -	while (src_op->op) { -		osd_req_encode_op(req, op, src_op); -		src_op++; -		op++; +static void ceph_osdc_msg_data_add(struct ceph_msg *msg, +				struct ceph_osd_data *osd_data) +{ +	u64 length = ceph_osd_data_length(osd_data); + +	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { +		BUG_ON(length > (u64) SIZE_MAX); +		if (length) +			ceph_msg_data_add_pages(msg, osd_data->pages, +					length, osd_data->alignment); +	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { +		BUG_ON(!length); +		ceph_msg_data_add_pagelist(msg, osd_data->pagelist); +#ifdef CONFIG_BLOCK +	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { +		ceph_msg_data_add_bio(msg, osd_data->bio, length); +#endif +	} else { +		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);  	} +} -	if (req->r_trail) -		data_len += req->r_trail->length; +static u64 osd_req_encode_op(struct ceph_osd_request *req, +			      struct ceph_osd_op *dst, unsigned int which) +{ +	struct ceph_osd_req_op *src; +	struct ceph_osd_data *osd_data; +	u64 request_data_len = 0; +	u64 data_length; -	if (snapc) { -		head->snap_seq = cpu_to_le64(snapc->seq); -		head->num_snaps = cpu_to_le32(snapc->num_snaps); -		for (i = 0; i < snapc->num_snaps; i++) { -			put_unaligned_le64(snapc->snaps[i], p); -			p += sizeof(u64); -		} +	BUG_ON(which >= req->r_num_ops); +	src = &req->r_ops[which]; +	if (WARN_ON(!osd_req_opcode_valid(src->op))) { +		pr_err("unrecognized osd opcode %d\n", src->op); + +		return 0;  	} -	if (flags & CEPH_OSD_FLAG_WRITE) { -		req->r_request->hdr.data_off = cpu_to_le16(off); -		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); -	} else if (data_len) { -		req->r_request->hdr.data_off = 0; -		req->r_request->hdr.data_len = cpu_to_le32(data_len); +	switch (src->op) { +	case CEPH_OSD_OP_STAT: +		osd_data = &src->raw_data_in; +		ceph_osdc_msg_data_add(req->r_reply, osd_data); +		break; +	case CEPH_OSD_OP_READ: +	case CEPH_OSD_OP_WRITE: +	case CEPH_OSD_OP_ZERO: +	case CEPH_OSD_OP_DELETE: +	case CEPH_OSD_OP_TRUNCATE: +		if (src->op == CEPH_OSD_OP_WRITE) +			request_data_len = src->extent.length; +		dst->extent.offset = cpu_to_le64(src->extent.offset); +		dst->extent.length = cpu_to_le64(src->extent.length); +		dst->extent.truncate_size = +			cpu_to_le64(src->extent.truncate_size); +		dst->extent.truncate_seq = +			cpu_to_le32(src->extent.truncate_seq); +		osd_data = &src->extent.osd_data; +		if (src->op == CEPH_OSD_OP_WRITE) +			ceph_osdc_msg_data_add(req->r_request, osd_data); +		else +			ceph_osdc_msg_data_add(req->r_reply, osd_data); +		break; +	case CEPH_OSD_OP_CALL: +		dst->cls.class_len = src->cls.class_len; +		dst->cls.method_len = src->cls.method_len; +		osd_data = &src->cls.request_info; +		ceph_osdc_msg_data_add(req->r_request, osd_data); +		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); +		request_data_len = osd_data->pagelist->length; + +		osd_data = &src->cls.request_data; +		data_length = ceph_osd_data_length(osd_data); +		if (data_length) { +			BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); +			dst->cls.indata_len = cpu_to_le32(data_length); +			ceph_osdc_msg_data_add(req->r_request, osd_data); +			src->payload_len += data_length; +			request_data_len += data_length; +		} +		osd_data = &src->cls.response_data; +		ceph_osdc_msg_data_add(req->r_reply, osd_data); +		break; +	case CEPH_OSD_OP_STARTSYNC: +		break; +	case CEPH_OSD_OP_NOTIFY_ACK: +	case CEPH_OSD_OP_WATCH: +		dst->watch.cookie = 
cpu_to_le64(src->watch.cookie);
+		dst->watch.ver = cpu_to_le64(src->watch.ver);
+		dst->watch.flag = src->watch.flag;
+		break;
+	case CEPH_OSD_OP_SETALLOCHINT:
+		dst->alloc_hint.expected_object_size =
+		    cpu_to_le64(src->alloc_hint.expected_object_size);
+		dst->alloc_hint.expected_write_size =
+		    cpu_to_le64(src->alloc_hint.expected_write_size);
+		break;
+	default:
+		pr_err("unsupported osd opcode %s\n",
+			ceph_osd_op_name(src->op));
+		WARN_ON(1);
+
+		return 0;
 	}
-	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-	msg_size = p - msg->front.iov_base;
-	msg->front.iov_len = msg_size;
-	msg->hdr.front_len = cpu_to_le32(msg_size);
-	return;
+
+	dst->op = cpu_to_le16(src->op);
+	dst->flags = cpu_to_le32(src->flags);
+	dst->payload_len = cpu_to_le32(src->payload_len);
+
+	return request_data_len;
 }
-EXPORT_SYMBOL(ceph_osdc_build_request);
 
 /*
  * build new request AND message, calculate layout, and adjust file
@@ -412,45 +737,67 @@ EXPORT_SYMBOL(ceph_osdc_build_request);
 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					       struct ceph_file_layout *layout,
 					       struct ceph_vino vino,
-					       u64 off, u64 *plen,
+					       u64 off, u64 *plen, int num_ops,
 					       int opcode, int flags,
 					       struct ceph_snap_context *snapc,
-					       int do_sync,
 					       u32 truncate_seq,
 					       u64 truncate_size,
-					       struct timespec *mtime,
-					       bool use_mempool, int num_reply)
+					       bool use_mempool)
 {
-	struct ceph_osd_req_op ops[3];
 	struct ceph_osd_request *req;
+	u64 objnum = 0;
+	u64 objoff = 0;
+	u64 objlen = 0;
+	u32 object_size;
+	u64 object_base;
+	int r;
+
+	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
+	       opcode != CEPH_OSD_OP_DELETE && opcode != CEPH_OSD_OP_ZERO &&
+	       opcode != CEPH_OSD_OP_TRUNCATE);
+
+	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
+					GFP_NOFS);
+	if (!req)
+		return ERR_PTR(-ENOMEM);
 
-	ops[0].op = opcode;
-	ops[0].extent.truncate_seq = truncate_seq;
-	ops[0].extent.truncate_size = truncate_size;
-	ops[0].payload_len = 0;
-
-	if (do_sync) {
-		ops[1].op = CEPH_OSD_OP_STARTSYNC;
-		ops[1].payload_len = 0;
-		ops[2].op = 0;
-	} else
-		ops[1].op = 0;
-
-	req = ceph_osdc_alloc_request(osdc, flags,
-					 snapc, ops,
-					 use_mempool,
-					 GFP_NOFS, NULL, NULL);
-	if (IS_ERR(req))
-		return req;
+	req->r_flags = flags;
 
 	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req, ops);
-	req->r_file_layout = *layout;  /* keep a copy */
+	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
+	if (r < 0) {
+		ceph_osdc_put_request(req);
+		return ERR_PTR(r);
+	}
+
+	object_size = le32_to_cpu(layout->fl_object_size);
+	object_base = off - objoff;
+	if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
+		if (truncate_size <= object_base) {
+			truncate_size = 0;
+		} else {
+			truncate_size -= object_base;
+			if (truncate_size > object_size)
+				truncate_size = object_size;
+		}
+	}
 
-	ceph_osdc_build_request(req, off, plen, ops,
-				snapc,
-				mtime,
-				req->r_oid, req->r_oid_len);
+	osd_req_op_extent_init(req, 0, opcode, objoff, objlen,
+				truncate_size, truncate_seq);
+
+	/*
+	 * A second op in the ops array means the caller wants to
+	 * also include a 'startsync' command so that the
+	 * osd will flush data quickly.
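+	 *
+	 * An illustrative call (not part of the original change; the
+	 * arguments are caller-supplied):
+	 *
+	 *	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+	 *				    2, CEPH_OSD_OP_WRITE,
+	 *				    CEPH_OSD_FLAG_WRITE, snapc,
+	 *				    truncate_seq, truncate_size,
+	 *				    false);
+	 *
+	 * where num_ops == 2 asks for the extra startsync op.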
+	 */ +	if (num_ops > 1) +		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + +	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); + +	snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), +		 "%llx.%08llx", vino.ino, objnum); +	req->r_base_oid.name_len = strlen(req->r_base_oid.name);  	return req;  } @@ -521,6 +868,68 @@ __lookup_request_ge(struct ceph_osd_client *osdc,  	return NULL;  } +/* + * Resubmit requests pending on the given osd. + */ +static void __kick_osd_requests(struct ceph_osd_client *osdc, +				struct ceph_osd *osd) +{ +	struct ceph_osd_request *req, *nreq; +	LIST_HEAD(resend); +	int err; + +	dout("__kick_osd_requests osd%d\n", osd->o_osd); +	err = __reset_osd(osdc, osd); +	if (err) +		return; +	/* +	 * Build up a list of requests to resend by traversing the +	 * osd's list of requests.  Requests for a given object are +	 * sent in tid order, and that is also the order they're +	 * kept on this list.  Therefore all requests that are in +	 * flight will be found first, followed by all requests that +	 * have not yet been sent.  And to resend requests while +	 * preserving this order we will want to put any sent +	 * requests back on the front of the osd client's unsent +	 * list. +	 * +	 * So we build a separate ordered list of already-sent +	 * requests for the affected osd and splice it onto the +	 * front of the osd client's unsent list.  Once we've seen a +	 * request that has not yet been sent we're done.  Those +	 * requests are already sitting right where they belong. +	 */ +	list_for_each_entry(req, &osd->o_requests, r_osd_item) { +		if (!req->r_sent) +			break; +		list_move_tail(&req->r_req_lru_item, &resend); +		dout("requeueing %p tid %llu osd%d\n", req, req->r_tid, +		     osd->o_osd); +		if (!req->r_linger) +			req->r_flags |= CEPH_OSD_FLAG_RETRY; +	} +	list_splice(&resend, &osdc->req_unsent); + +	/* +	 * Linger requests are re-registered before sending, which +	 * sets up a new tid for each.  We add them to the unsent +	 * list at the end to keep things in tid order. +	 */ +	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, +				 r_linger_osd) { +		/* +		 * reregister request prior to unregistering linger so +		 * that r_osd is preserved. +		 */ +		BUG_ON(!list_empty(&req->r_req_lru_item)); +		__register_request(osdc, req); +		list_add_tail(&req->r_req_lru_item, &osdc->req_unsent); +		list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); +		__unregister_linger_request(osdc, req); +		dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, +		     osd->o_osd); +	} +}  /*   * If the osd connection drops, we need to resubmit all requests. @@ -535,14 +944,17 @@ static void osd_reset(struct ceph_connection *con)  	dout("osd_reset osd%d\n", osd->o_osd);  	osdc = osd->o_osdc;  	down_read(&osdc->map_sem); -	kick_requests(osdc, osd); +	mutex_lock(&osdc->request_mutex); +	__kick_osd_requests(osdc, osd); +	__send_queued(osdc); +	mutex_unlock(&osdc->request_mutex);  	up_read(&osdc->map_sem);  }  /*   * Track open sessions with osds.   
*/ -static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) +static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)  {  	struct ceph_osd *osd; @@ -552,14 +964,14 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)  	atomic_set(&osd->o_ref, 1);  	osd->o_osdc = osdc; +	osd->o_osd = onum; +	RB_CLEAR_NODE(&osd->o_node);  	INIT_LIST_HEAD(&osd->o_requests); +	INIT_LIST_HEAD(&osd->o_linger_requests);  	INIT_LIST_HEAD(&osd->o_osd_lru);  	osd->o_incarnation = 1; -	ceph_con_init(osdc->client->msgr, &osd->o_con); -	osd->o_con.private = osd; -	osd->o_con.ops = &osd_con_ops; -	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; +	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);  	INIT_LIST_HEAD(&osd->o_keepalive_item);  	return osd; @@ -581,11 +993,10 @@ static void put_osd(struct ceph_osd *osd)  {  	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),  	     atomic_read(&osd->o_ref) - 1); -	if (atomic_dec_and_test(&osd->o_ref)) { +	if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) {  		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; -		if (osd->o_authorizer) -			ac->ops->destroy_authorizer(ac, osd->o_authorizer); +		ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);  		kfree(osd);  	} } @@ -603,6 +1014,18 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)  	put_osd(osd);  } +static void remove_all_osds(struct ceph_osd_client *osdc) +{ +	dout("%s %p\n", __func__, osdc); +	mutex_lock(&osdc->request_mutex); +	while (!RB_EMPTY_ROOT(&osdc->osds)) { +		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), +						struct ceph_osd, o_node); +		__remove_osd(osdc, osd); +	} +	mutex_unlock(&osdc->request_mutex); +} +  static void __move_osd_to_lru(struct ceph_osd_client *osdc,  			      struct ceph_osd *osd)  { @@ -619,14 +1042,14 @@ static void __remove_osd_from_lru(struct ceph_osd *osd)  		list_del_init(&osd->o_osd_lru);  } -static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all) +static void remove_old_osds(struct ceph_osd_client *osdc)  {  	struct ceph_osd *osd, *nosd;  	dout("__remove_old_osds %p\n", osdc);  	mutex_lock(&osdc->request_mutex);  	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { -		if (!remove_all && time_before(jiffies, osd->lru_ttl)) +		if (time_before(jiffies, osd->lru_ttl))  			break;  		__remove_osd(osdc, osd);  	} @@ -638,28 +1061,35 @@ static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)   */  static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)  { -	struct ceph_osd_request *req; -	int ret = 0; +	struct ceph_entity_addr *peer_addr;  	dout("__reset_osd %p osd%d\n", osd, osd->o_osd); -	if (list_empty(&osd->o_requests)) { +	if (list_empty(&osd->o_requests) && +	    list_empty(&osd->o_linger_requests)) {  		__remove_osd(osdc, osd); -	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], -			  &osd->o_con.peer_addr, -			  sizeof(osd->o_con.peer_addr)) == 0 && -		   !ceph_con_opened(&osd->o_con)) { -		dout(" osd addr hasn't changed and connection never opened," -		     " letting msgr retry"); + +		return -ENODEV; +	} + +	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; +	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && +			!ceph_con_opened(&osd->o_con)) { +		struct ceph_osd_request *req; + +		dout("osd addr hasn't changed and connection never opened, " +		     "letting msgr retry\n");  		/* touch each r_stamp for handle_timeout()'s benefit */  		
list_for_each_entry(req, &osd->o_requests, r_osd_item)  			req->r_stamp = jiffies; -		ret = -EAGAIN; -	} else { -		ceph_con_close(&osd->o_con); -		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); -		osd->o_incarnation++; + +		return -EAGAIN;  	} -	return ret; + +	ceph_con_close(&osd->o_con); +	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr); +	osd->o_incarnation++; + +	return 0;  }  static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) @@ -668,6 +1098,7 @@ static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)  	struct rb_node *parent = NULL;  	struct ceph_osd *osd = NULL; +	dout("__insert_osd %p osd%d\n", new, new->o_osd);  	while (*p) {  		parent = *p;  		osd = rb_entry(parent, struct ceph_osd, o_node); @@ -715,24 +1146,19 @@ static void __cancel_osd_timeout(struct ceph_osd_client *osdc)   * Register request, assign tid.  If this is the first request, set up   * the timeout event.   */ -static void register_request(struct ceph_osd_client *osdc, -			     struct ceph_osd_request *req) +static void __register_request(struct ceph_osd_client *osdc, +			       struct ceph_osd_request *req)  { -	mutex_lock(&osdc->request_mutex);  	req->r_tid = ++osdc->last_tid;  	req->r_request->hdr.tid = cpu_to_le64(req->r_tid); -	INIT_LIST_HEAD(&req->r_req_lru_item); - -	dout("register_request %p tid %lld\n", req, req->r_tid); +	dout("__register_request %p tid %lld\n", req, req->r_tid);  	__insert_request(osdc, req);  	ceph_osdc_get_request(req);  	osdc->num_requests++; -  	if (osdc->num_requests == 1) {  		dout(" first request, scheduling timeout\n");  		__schedule_osd_timeout(osdc);  	} -	mutex_unlock(&osdc->request_mutex);  }  /* @@ -741,23 +1167,33 @@ static void register_request(struct ceph_osd_client *osdc,  static void __unregister_request(struct ceph_osd_client *osdc,  				 struct ceph_osd_request *req)  { +	if (RB_EMPTY_NODE(&req->r_node)) { +		dout("__unregister_request %p tid %lld not registered\n", +			req, req->r_tid); +		return; +	} +  	dout("__unregister_request %p tid %lld\n", req, req->r_tid);  	rb_erase(&req->r_node, &osdc->requests);  	osdc->num_requests--;  	if (req->r_osd) {  		/* make sure the original request isn't in flight. 
*/ -		ceph_con_revoke(&req->r_osd->o_con, req->r_request); +		ceph_msg_revoke(req->r_request);  		list_del_init(&req->r_osd_item); -		if (list_empty(&req->r_osd->o_requests)) +		if (list_empty(&req->r_osd->o_requests) && +		    list_empty(&req->r_osd->o_linger_requests)) { +			dout("moving osd to %p lru\n", req->r_osd);  			__move_osd_to_lru(osdc, req->r_osd); -		req->r_osd = NULL; +		} +		if (list_empty(&req->r_linger_item)) +			req->r_osd = NULL;  	} +	list_del_init(&req->r_req_lru_item);  	ceph_osdc_put_request(req); -	list_del_init(&req->r_req_lru_item);  	if (osdc->num_requests == 0) {  		dout(" no requests, canceling timeout\n");  		__cancel_osd_timeout(osdc); @@ -770,53 +1206,166 @@ static void __unregister_request(struct ceph_osd_client *osdc,  static void __cancel_request(struct ceph_osd_request *req)  {  	if (req->r_sent && req->r_osd) { -		ceph_con_revoke(&req->r_osd->o_con, req->r_request); +		ceph_msg_revoke(req->r_request);  		req->r_sent = 0;  	} -	list_del_init(&req->r_req_lru_item); +} + +static void __register_linger_request(struct ceph_osd_client *osdc, +				    struct ceph_osd_request *req) +{ +	dout("__register_linger_request %p\n", req); +	ceph_osdc_get_request(req); +	list_add_tail(&req->r_linger_item, &osdc->req_linger); +	if (req->r_osd) +		list_add_tail(&req->r_linger_osd, +			      &req->r_osd->o_linger_requests); +} + +static void __unregister_linger_request(struct ceph_osd_client *osdc, +					struct ceph_osd_request *req) +{ +	dout("__unregister_linger_request %p\n", req); +	list_del_init(&req->r_linger_item); +	if (req->r_osd) { +		list_del_init(&req->r_linger_osd); + +		if (list_empty(&req->r_osd->o_requests) && +		    list_empty(&req->r_osd->o_linger_requests)) { +			dout("moving osd to %p lru\n", req->r_osd); +			__move_osd_to_lru(osdc, req->r_osd); +		} +		if (list_empty(&req->r_osd_item)) +			req->r_osd = NULL; +	} +	ceph_osdc_put_request(req); +} + +void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, +					 struct ceph_osd_request *req) +{ +	mutex_lock(&osdc->request_mutex); +	if (req->r_linger) { +		req->r_linger = 0; +		__unregister_linger_request(osdc, req); +	} +	mutex_unlock(&osdc->request_mutex); +} +EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); + +void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, +				  struct ceph_osd_request *req) +{ +	if (!req->r_linger) { +		dout("set_request_linger %p\n", req); +		req->r_linger = 1; +	} +} +EXPORT_SYMBOL(ceph_osdc_set_request_linger); + +/* + * Returns whether a request should be blocked from being sent + * based on the current osdmap and osd_client settings. + * + * Caller should hold map_sem for read. + */ +static bool __req_should_be_paused(struct ceph_osd_client *osdc, +				   struct ceph_osd_request *req) +{ +	bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); +	bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || +		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); +	return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || +		(req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + +/* + * Calculate mapping of a request to a PG.  Takes tiering into account. 
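+ * + * If the target pool has a cache tier, reads are redirected to the + * pool's read_tier and writes to its write_tier, unless the request + * carries CEPH_OSD_FLAG_IGNORE_OVERLAY.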
+ */ +static int __calc_request_pg(struct ceph_osdmap *osdmap, +			     struct ceph_osd_request *req, +			     struct ceph_pg *pg_out) +{ +	bool need_check_tiering; + +	need_check_tiering = false; +	if (req->r_target_oloc.pool == -1) { +		req->r_target_oloc = req->r_base_oloc; /* struct */ +		need_check_tiering = true; +	} +	if (req->r_target_oid.name_len == 0) { +		ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); +		need_check_tiering = true; +	} + +	if (need_check_tiering && +	    (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { +		struct ceph_pg_pool_info *pi; + +		pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); +		if (pi) { +			if ((req->r_flags & CEPH_OSD_FLAG_READ) && +			    pi->read_tier >= 0) +				req->r_target_oloc.pool = pi->read_tier; +			if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && +			    pi->write_tier >= 0) +				req->r_target_oloc.pool = pi->write_tier; +		} +		/* !pi is caught in ceph_oloc_oid_to_pg() */ +	} + +	return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, +				   &req->r_target_oid, pg_out);  }  /*   * Pick an osd (the first 'up' osd in the pg), allocate the osd struct   * (as needed), and set the request r_osd appropriately.  If there is - * no up osd, set r_osd to NULL. + * no up osd, set r_osd to NULL.  Move the request to the appropriate list + * (unsent, homeless) or leave on in-flight lru.   *   * Return 0 if unchanged, 1 if changed, or negative on error.   *   * Caller should hold map_sem for read and request_mutex.   */ -static int __map_osds(struct ceph_osd_client *osdc, -		      struct ceph_osd_request *req) +static int __map_request(struct ceph_osd_client *osdc, +			 struct ceph_osd_request *req, int force_resend)  { -	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;  	struct ceph_pg pgid;  	int acting[CEPH_PG_MAX_SIZE]; -	int o = -1, num = 0; +	int num, o;  	int err; +	bool was_paused; -	dout("map_osds %p tid %lld\n", req, req->r_tid); -	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, -				      &req->r_file_layout, osdc->osdmap); -	if (err) +	dout("map_request %p tid %lld\n", req, req->r_tid); + +	err = __calc_request_pg(osdc->osdmap, req, &pgid); +	if (err) { +		list_move(&req->r_req_lru_item, &osdc->req_notarget);  		return err; -	pgid = reqhead->layout.ol_pgid; +	}  	req->r_pgid = pgid; -	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); -	if (err > 0) { -		o = acting[0]; -		num = err; -	} +	num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); +	if (num < 0) +		num = 0; -	if ((req->r_osd && req->r_osd->o_osd == o && +	was_paused = req->r_paused; +	req->r_paused = __req_should_be_paused(osdc, req); +	if (was_paused && !req->r_paused) +		force_resend = 1; + +	if ((!force_resend && +	     req->r_osd && req->r_osd->o_osd == o &&  	     req->r_sent >= req->r_osd->o_incarnation &&  	     req->r_num_pg_osds == num &&  	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || -	    (req->r_osd == NULL && o == -1)) +	    (req->r_osd == NULL && o == -1) || +	    req->r_paused)  		return 0;  /* no change */ -	dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", -	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, +	dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", +	     req->r_tid, pgid.pool, pgid.seed, o,  	     req->r_osd ? 
req->r_osd->o_osd : -1);  	/* record full pg acting set */ @@ -832,21 +1381,26 @@ static int __map_osds(struct ceph_osd_client *osdc,  	req->r_osd = __lookup_osd(osdc, o);  	if (!req->r_osd && o >= 0) {  		err = -ENOMEM; -		req->r_osd = create_osd(osdc); -		if (!req->r_osd) +		req->r_osd = create_osd(osdc, o); +		if (!req->r_osd) { +			list_move(&req->r_req_lru_item, &osdc->req_notarget);  			goto out; +		} -		dout("map_osds osd %p is osd%d\n", req->r_osd, o); -		req->r_osd->o_osd = o; -		req->r_osd->o_con.peer_name.num = cpu_to_le64(o); +		dout("map_request osd %p is osd%d\n", req->r_osd, o);  		__insert_osd(osdc, req->r_osd); -		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); +		ceph_con_open(&req->r_osd->o_con, +			      CEPH_ENTITY_TYPE_OSD, o, +			      &osdc->osdmap->osd_addr[o]);  	}  	if (req->r_osd) {  		__remove_osd_from_lru(req->r_osd); -		list_add(&req->r_osd_item, &req->r_osd->o_requests); +		list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); +		list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); +	} else { +		list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);  	}  	err = 1;   /* osd or pg changed */ @@ -857,35 +1411,79 @@ out:  /*   * caller should hold map_sem (for read) and request_mutex   */ -static int __send_request(struct ceph_osd_client *osdc, -			  struct ceph_osd_request *req) +static void __send_request(struct ceph_osd_client *osdc, +			   struct ceph_osd_request *req)  { -	struct ceph_osd_request_head *reqhead; -	int err; - -	err = __map_osds(osdc, req); -	if (err < 0) -		return err; -	if (req->r_osd == NULL) { -		dout("send_request %p no up osds in pg\n", req); -		ceph_monc_request_next_osdmap(&osdc->client->monc); -		return 0; -	} - -	dout("send_request %p tid %llu to osd%d flags %d\n", -	     req, req->r_tid, req->r_osd->o_osd, req->r_flags); +	void *p; -	reqhead = req->r_request->front.iov_base; -	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); -	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */ -	reqhead->reassert_version = req->r_reassert_version; +	dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", +	     req, req->r_tid, req->r_osd->o_osd, req->r_flags, +	     (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); + +	/* fill in message content that changes each time we send it */ +	put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); +	put_unaligned_le32(req->r_flags, req->r_request_flags); +	put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); +	p = req->r_request_pgid; +	ceph_encode_64(&p, req->r_pgid.pool); +	ceph_encode_32(&p, req->r_pgid.seed); +	put_unaligned_le64(1, req->r_request_attempts);  /* FIXME */ +	memcpy(req->r_request_reassert_version, &req->r_reassert_version, +	       sizeof(req->r_reassert_version));  	req->r_stamp = jiffies;  	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);  	ceph_msg_get(req->r_request); /* send consumes a ref */ -	ceph_con_send(&req->r_osd->o_con, req->r_request); +  	req->r_sent = req->r_osd->o_incarnation; + +	ceph_con_send(&req->r_osd->o_con, req->r_request); +} + +/* + * Send any requests in the queue (req_unsent). + */ +static void __send_queued(struct ceph_osd_client *osdc) +{ +	struct ceph_osd_request *req, *tmp; + +	dout("__send_queued\n"); +	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) +		__send_request(osdc, req); +} + +/* + * Caller should hold map_sem for read and request_mutex. 
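+ * + * Registers the request, maps it to an osd and queues it for sending; + * if no osd in its PG is up, a newer osdmap is requested from the + * monitor instead.  With nofail set, a mapping failure leaves the + * request registered (on the notarget list) so it is retried when a + * new map arrives.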
+ */ +static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, +				     struct ceph_osd_request *req, +				     bool nofail) +{ +	int rc; + +	__register_request(osdc, req); +	req->r_sent = 0; +	req->r_got_reply = 0; +	rc = __map_request(osdc, req, 0); +	if (rc < 0) { +		if (nofail) { +			dout("osdc_start_request failed map, " +				" will retry %lld\n", req->r_tid); +			rc = 0; +		} else { +			__unregister_request(osdc, req); +		} +		return rc; +	} + +	if (req->r_osd == NULL) { +		dout("send_request %p no up osds in pg\n", req); +		ceph_monc_request_next_osdmap(&osdc->client->monc); +	} else { +		__send_queued(osdc); +	} +  	return 0;  } @@ -902,61 +1500,17 @@ static void handle_timeout(struct work_struct *work)  {  	struct ceph_osd_client *osdc =  		container_of(work, struct ceph_osd_client, timeout_work.work); -	struct ceph_osd_request *req, *last_req = NULL; +	struct ceph_osd_request *req;  	struct ceph_osd *osd; -	unsigned long timeout = osdc->client->options->osd_timeout * HZ;  	unsigned long keepalive =  		osdc->client->options->osd_keepalive_timeout * HZ; -	unsigned long last_stamp = 0; -	struct rb_node *p;  	struct list_head slow_osds; -  	dout("timeout\n");  	down_read(&osdc->map_sem);  	ceph_monc_request_next_osdmap(&osdc->client->monc);  	mutex_lock(&osdc->request_mutex); -	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { -		req = rb_entry(p, struct ceph_osd_request, r_node); - -		if (req->r_resend) { -			int err; - -			dout("osdc resending prev failed %lld\n", req->r_tid); -			err = __send_request(osdc, req); -			if (err) -				dout("osdc failed again on %lld\n", req->r_tid); -			else -				req->r_resend = false; -			continue; -		} -	} - -	/* -	 * reset osds that appear to be _really_ unresponsive.  this -	 * is a failsafe measure.. we really shouldn't be getting to -	 * this point if the system is working properly.  the monitors -	 * should mark the osd as failed and we should find out about -	 * it from an updated osd map. -	 */ -	while (timeout && !list_empty(&osdc->req_lru)) { -		req = list_entry(osdc->req_lru.next, struct ceph_osd_request, -				 r_req_lru_item); - -		if (time_before(jiffies, req->r_stamp + timeout)) -			break; - -		BUG_ON(req == last_req && req->r_stamp == last_stamp); -		last_req = req; -		last_stamp = req->r_stamp; - -		osd = req->r_osd; -		BUG_ON(!osd); -		pr_warning(" tid %llu timed out on osd%d, will reset osd\n", -			   req->r_tid, osd->o_osd); -		__kick_requests(osdc, osd); -	}  	/*  	 * ping osds that are a bit slow.  
this ensures that if there @@ -982,8 +1536,8 @@ static void handle_timeout(struct work_struct *work)  	}  	__schedule_osd_timeout(osdc); +	__send_queued(osdc);  	mutex_unlock(&osdc->request_mutex); -  	up_read(&osdc->map_sem);  } @@ -997,13 +1551,121 @@ static void handle_osds_timeout(struct work_struct *work)  	dout("osds timeout\n");  	down_read(&osdc->map_sem); -	remove_old_osds(osdc, 0); +	remove_old_osds(osdc);  	up_read(&osdc->map_sem);  	schedule_delayed_work(&osdc->osds_timeout_work,  			      round_jiffies_relative(delay));  } +static int ceph_oloc_decode(void **p, void *end, +			    struct ceph_object_locator *oloc) +{ +	u8 struct_v, struct_cv; +	u32 len; +	void *struct_end; +	int ret = 0; + +	ceph_decode_need(p, end, 1 + 1 + 4, e_inval); +	struct_v = ceph_decode_8(p); +	struct_cv = ceph_decode_8(p); +	if (struct_v < 3) { +		pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", +			struct_v, struct_cv); +		goto e_inval; +	} +	if (struct_cv > 6) { +		pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", +			struct_v, struct_cv); +		goto e_inval; +	} +	len = ceph_decode_32(p); +	ceph_decode_need(p, end, len, e_inval); +	struct_end = *p + len; + +	oloc->pool = ceph_decode_64(p); +	*p += 4; /* skip preferred */ + +	len = ceph_decode_32(p); +	if (len > 0) { +		pr_warn("ceph_object_locator::key is set\n"); +		goto e_inval; +	} + +	if (struct_v >= 5) { +		len = ceph_decode_32(p); +		if (len > 0) { +			pr_warn("ceph_object_locator::nspace is set\n"); +			goto e_inval; +		} +	} + +	if (struct_v >= 6) { +		s64 hash = ceph_decode_64(p); +		if (hash != -1) { +			pr_warn("ceph_object_locator::hash is set\n"); +			goto e_inval; +		} +	} + +	/* skip the rest */ +	*p = struct_end; +out: +	return ret; + +e_inval: +	ret = -EINVAL; +	goto out; +} + +static int ceph_redirect_decode(void **p, void *end, +				struct ceph_request_redirect *redir) +{ +	u8 struct_v, struct_cv; +	u32 len; +	void *struct_end; +	int ret; + +	ceph_decode_need(p, end, 1 + 1 + 4, e_inval); +	struct_v = ceph_decode_8(p); +	struct_cv = ceph_decode_8(p); +	if (struct_cv > 1) { +		pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", +			struct_v, struct_cv); +		goto e_inval; +	} +	len = ceph_decode_32(p); +	ceph_decode_need(p, end, len, e_inval); +	struct_end = *p + len; + +	ret = ceph_oloc_decode(p, end, &redir->oloc); +	if (ret) +		goto out; + +	len = ceph_decode_32(p); +	if (len > 0) { +		pr_warn("ceph_request_redirect::object_name is set\n"); +		goto e_inval; +	} + +	len = ceph_decode_32(p); +	*p += len; /* skip osd_instructions */ + +	/* skip the rest */ +	*p = struct_end; +out: +	return ret; + +e_inval: +	ret = -EINVAL; +	goto out; +} + +static void complete_request(struct ceph_osd_request *req) +{ +	complete_all(&req->r_safe_completion);  /* fsync waiter */ +} +  /*   * handle osd op reply.  either call the callback if it is specified,   * or do the completion to wake up the waiting thread. 
@@ -1011,66 +1673,144 @@ static void handle_osds_timeout(struct work_struct *work)  static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,  			 struct ceph_connection *con)  { -	struct ceph_osd_reply_head *rhead = msg->front.iov_base; +	void *p, *end;  	struct ceph_osd_request *req; +	struct ceph_request_redirect redir;  	u64 tid; -	int numops, object_len, flags; +	int object_len; +	unsigned int numops; +	int payload_len, flags;  	s32 result; +	s32 retry_attempt; +	struct ceph_pg pg; +	int err; +	u32 reassert_epoch; +	u64 reassert_version; +	u32 osdmap_epoch; +	int already_completed; +	u32 bytes; +	unsigned int i;  	tid = le64_to_cpu(msg->hdr.tid); -	if (msg->front.iov_len < sizeof(*rhead)) -		goto bad; -	numops = le32_to_cpu(rhead->num_ops); -	object_len = le32_to_cpu(rhead->object_len); -	result = le32_to_cpu(rhead->result); -	if (msg->front.iov_len != sizeof(*rhead) + object_len + -	    numops * sizeof(struct ceph_osd_op)) +	dout("handle_reply %p tid %llu\n", msg, tid); + +	p = msg->front.iov_base; +	end = p + msg->front.iov_len; + +	ceph_decode_need(&p, end, 4, bad); +	object_len = ceph_decode_32(&p); +	ceph_decode_need(&p, end, object_len, bad); +	p += object_len; + +	err = ceph_decode_pgid(&p, end, &pg); +	if (err)  		goto bad; -	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); + +	ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); +	flags = ceph_decode_64(&p); +	result = ceph_decode_32(&p); +	reassert_epoch = ceph_decode_32(&p); +	reassert_version = ceph_decode_64(&p); +	osdmap_epoch = ceph_decode_32(&p);  	/* lookup */ +	down_read(&osdc->map_sem);  	mutex_lock(&osdc->request_mutex);  	req = __lookup_request(osdc, tid);  	if (req == NULL) {  		dout("handle_reply tid %llu dne\n", tid); -		mutex_unlock(&osdc->request_mutex); -		return; +		goto bad_mutex;  	}  	ceph_osdc_get_request(req); -	flags = le32_to_cpu(rhead->flags); -	/* -	 * if this connection filled our message, drop our reference now, to -	 * avoid a (safe but slower) revoke later. 
-	 */ -	if (req->r_con_filling_msg == con && req->r_reply == msg) { -		dout(" dropping con_filling_msg ref %p\n", con); -		req->r_con_filling_msg = NULL; -		ceph_con_put(con); +	dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, +	     req, result); + +	ceph_decode_need(&p, end, 4, bad_put); +	numops = ceph_decode_32(&p); +	if (numops > CEPH_OSD_MAX_OP) +		goto bad_put; +	if (numops != req->r_num_ops) +		goto bad_put; +	payload_len = 0; +	ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put); +	for (i = 0; i < numops; i++) { +		struct ceph_osd_op *op = p; +		int len; + +		len = le32_to_cpu(op->payload_len); +		req->r_reply_op_len[i] = len; +		dout(" op %d has %d bytes\n", i, len); +		payload_len += len; +		p += sizeof(*op); +	} +	bytes = le32_to_cpu(msg->hdr.data_len); +	if (payload_len != bytes) { +		pr_warning("sum of op payload lens %d != data_len %d", +			   payload_len, bytes); +		goto bad_put;  	} -	if (!req->r_got_reply) { -		unsigned bytes; +	ceph_decode_need(&p, end, 4 + numops * 4, bad_put); +	retry_attempt = ceph_decode_32(&p); +	for (i = 0; i < numops; i++) +		req->r_reply_op_result[i] = ceph_decode_32(&p); + +	if (le16_to_cpu(msg->hdr.version) >= 6) { +		p += 8 + 4; /* skip replay_version */ +		p += 8; /* skip user_version */ + +		err = ceph_redirect_decode(&p, end, &redir); +		if (err) +			goto bad_put; +	} else { +		redir.oloc.pool = -1; +	} + +	if (redir.oloc.pool != -1) { +		dout("redirect pool %lld\n", redir.oloc.pool); + +		__unregister_request(osdc, req); + +		req->r_target_oloc = redir.oloc; /* struct */ + +		/* +		 * Start redirect requests with nofail=true.  If +		 * mapping fails, request will end up on the notarget +		 * list, waiting for the new osdmap (which can take +		 * a while), even though the original request mapped +		 * successfully.  In the future we might want to follow +		 * original request's nofail setting here. 
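+		 * +		 * (A redirect is the osd telling us to resend the op to +		 * another pool, e.g. when a cache tier overlays the +		 * base pool.)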
+		 */ +		err = __ceph_osdc_start_request(osdc, req, true); +		BUG_ON(err); -		req->r_result = le32_to_cpu(rhead->result); -		bytes = le32_to_cpu(msg->hdr.data_len); +		goto out_unlock; +	} + +	already_completed = req->r_got_reply; +	if (!req->r_got_reply) { +		req->r_result = result;  		dout("handle_reply result %d bytes %d\n", req->r_result,  		     bytes);  		if (req->r_result == 0)  			req->r_result = bytes;  		/* in case this is a write and we need to replay, */ -		req->r_reassert_version = rhead->reassert_version; +		req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); +		req->r_reassert_version.version = cpu_to_le64(reassert_version);  		req->r_got_reply = 1;  	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {  		dout("handle_reply tid %llu dup ack\n", tid); -		mutex_unlock(&osdc->request_mutex); -		goto done; +		goto out_unlock;  	}  	dout("handle_reply tid %llu flags %d\n", tid, flags); +	if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) +		__register_linger_request(osdc, req); +  	/* either this is a read, or we got the safe response */  	if (result < 0 ||  	    (flags & CEPH_OSD_FLAG_ONDISK) || @@ -1078,131 +1818,160 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,  		__unregister_request(osdc, req);  	mutex_unlock(&osdc->request_mutex); +	up_read(&osdc->map_sem); -	if (req->r_callback) -		req->r_callback(req, msg); -	else -		complete_all(&req->r_completion); +	if (!already_completed) { +		if (req->r_unsafe_callback && +		    result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) +			req->r_unsafe_callback(req, true); +		if (req->r_callback) +			req->r_callback(req, msg); +		else +			complete_all(&req->r_completion); +	}  	if (flags & CEPH_OSD_FLAG_ONDISK) { -		if (req->r_safe_callback) -			req->r_safe_callback(req, msg); -		complete_all(&req->r_safe_completion);  /* fsync waiter */ +		if (req->r_unsafe_callback && already_completed) +			req->r_unsafe_callback(req, false); +		complete_request(req);  	} -done: +out: +	dout("req=%p req->r_linger=%d\n", req, req->r_linger);  	ceph_osdc_put_request(req);  	return; +out_unlock: +	mutex_unlock(&osdc->request_mutex); +	up_read(&osdc->map_sem); +	goto out; +bad_put: +	req->r_result = -EIO; +	__unregister_request(osdc, req); +	if (req->r_callback) +		req->r_callback(req, msg); +	else +		complete_all(&req->r_completion); +	complete_request(req); +	ceph_osdc_put_request(req); +bad_mutex: +	mutex_unlock(&osdc->request_mutex); +	up_read(&osdc->map_sem);  bad: -	pr_err("corrupt osd_op_reply got %d %d expected %d\n", -	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), -	       (int)sizeof(*rhead)); +	pr_err("corrupt osd_op_reply got %d %d\n", +	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));  	ceph_msg_dump(msg);  } - -static int __kick_requests(struct ceph_osd_client *osdc, -			  struct ceph_osd *kickosd) +static void reset_changed_osds(struct ceph_osd_client *osdc)  { -	struct ceph_osd_request *req;  	struct rb_node *p, *n; -	int needmap = 0; -	int err; -	dout("kick_requests osd%d\n", kickosd ? 
kickosd->o_osd : -1); -	if (kickosd) { -		err = __reset_osd(osdc, kickosd); -		if (err == -EAGAIN) -			return 1; -	} else { -		for (p = rb_first(&osdc->osds); p; p = n) { -			struct ceph_osd *osd = -				rb_entry(p, struct ceph_osd, o_node); - -			n = rb_next(p); -			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || -			    memcmp(&osd->o_con.peer_addr, -				   ceph_osd_addr(osdc->osdmap, -						 osd->o_osd), -				   sizeof(struct ceph_entity_addr)) != 0) -				__reset_osd(osdc, osd); -		} +	for (p = rb_first(&osdc->osds); p; p = n) { +		struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + +		n = rb_next(p); +		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || +		    memcmp(&osd->o_con.peer_addr, +			   ceph_osd_addr(osdc->osdmap, +					 osd->o_osd), +			   sizeof(struct ceph_entity_addr)) != 0) +			__reset_osd(osdc, osd);  	} +} -	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { -		req = rb_entry(p, struct ceph_osd_request, r_node); +/* + * Requeue requests whose mapping to an OSD has changed.  If requests map to + * no osd, request a new map. + * + * Caller should hold map_sem for read. + */ +static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, +			  bool force_resend_writes) +{ +	struct ceph_osd_request *req, *nreq; +	struct rb_node *p; +	int needmap = 0; +	int err; +	bool force_resend_req; -		if (req->r_resend) { -			dout(" r_resend set on tid %llu\n", req->r_tid); -			__cancel_request(req); -			goto kick; +	dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", +		force_resend_writes ? " (force resend writes)" : ""); +	mutex_lock(&osdc->request_mutex); +	for (p = rb_first(&osdc->requests); p; ) { +		req = rb_entry(p, struct ceph_osd_request, r_node); +		p = rb_next(p); + +		/* +		 * For linger requests that have not yet been +		 * registered, move them to the linger list; they'll +		 * be sent to the osd in the loop below.  Unregister +		 * the request before re-registering it as a linger +		 * request to ensure the __map_request() below +		 * will decide it needs to be sent. +		 */ +		if (req->r_linger && list_empty(&req->r_linger_item)) { +			dout("%p tid %llu restart on osd%d\n", +			     req, req->r_tid, +			     req->r_osd ? req->r_osd->o_osd : -1); +			ceph_osdc_get_request(req); +			__unregister_request(osdc, req); +			__register_linger_request(osdc, req); +			ceph_osdc_put_request(req); +			continue;  		} -		if (req->r_osd && kickosd == req->r_osd) { -			__cancel_request(req); -			goto kick; + +		force_resend_req = force_resend || +			(force_resend_writes && +				req->r_flags & CEPH_OSD_FLAG_WRITE); +		err = __map_request(osdc, req, force_resend_req); +		if (err < 0) +			continue;  /* error */ +		if (req->r_osd == NULL) { +			dout("%p tid %llu maps to no osd\n", req, req->r_tid); +			needmap++;  /* request a newer map */ +		} else if (err > 0) { +			if (!req->r_linger) { +				dout("%p tid %llu requeued on osd%d\n", req, +				     req->r_tid, +				     req->r_osd ? 
req->r_osd->o_osd : -1); +				req->r_flags |= CEPH_OSD_FLAG_RETRY; +			}  		} +	} -		err = __map_osds(osdc, req); +	list_for_each_entry_safe(req, nreq, &osdc->req_linger, +				 r_linger_item) { +		dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); + +		err = __map_request(osdc, req, +				    force_resend || force_resend_writes); +		dout("__map_request returned %d\n", err);  		if (err == 0) -			continue;  /* no change */ -		if (err < 0) { -			/* -			 * FIXME: really, we should set the request -			 * error and fail if this isn't a 'nofail' -			 * request, but that's a fair bit more -			 * complicated to do.  So retry! -			 */ -			dout(" setting r_resend on %llu\n", req->r_tid); -			req->r_resend = true; -			continue; -		} +			continue;  /* no change and no osd was specified */ +		if (err < 0) +			continue;  /* hrm! */  		if (req->r_osd == NULL) {  			dout("tid %llu maps to no valid osd\n", req->r_tid);  			needmap++;  /* request a newer map */  			continue;  		} -kick: -		dout("kicking %p tid %llu osd%d\n", req, req->r_tid, +		dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,  		     req->r_osd ? req->r_osd->o_osd : -1); -		req->r_flags |= CEPH_OSD_FLAG_RETRY; -		err = __send_request(osdc, req); -		if (err) { -			dout(" setting r_resend on %llu\n", req->r_tid); -			req->r_resend = true; -		} +		__register_request(osdc, req); +		__unregister_linger_request(osdc, req);  	} - -	return needmap; -} - -/* - * Resubmit osd requests whose osd or osd address has changed.  Request - * a new osd map if osds are down, or we are otherwise unable to determine - * how to direct a request. - * - * Close connections to down osds. - * - * If @who is specified, resubmit requests for that specific osd. - * - * Caller should hold map_sem for read and request_mutex. - */ -static void kick_requests(struct ceph_osd_client *osdc, -			  struct ceph_osd *kickosd) -{ -	int needmap; - -	mutex_lock(&osdc->request_mutex); -	needmap = __kick_requests(osdc, kickosd); +	reset_changed_osds(osdc);  	mutex_unlock(&osdc->request_mutex);  	if (needmap) {  		dout("%d requests for down osds, need new map\n", needmap);  		ceph_monc_request_next_osdmap(&osdc->client->monc);  	} -  } + +  /*   * Process updated osd map.   * @@ -1218,6 +1987,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)  	struct ceph_osdmap *newmap = NULL, *oldmap;  	int err;  	struct ceph_fsid fsid; +	bool was_full;  	dout("handle_map have %u\n", osdc->osdmap ? 
osdc->osdmap->epoch : 0);  	p = msg->front.iov_base; @@ -1231,6 +2001,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)  	down_write(&osdc->map_sem); +	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); +  	/* incremental maps */  	ceph_decode_32_safe(&p, end, nr_maps, bad);  	dout(" %d inc maps\n", nr_maps); @@ -1245,7 +2017,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)  			     epoch, maplen);  			newmap = osdmap_apply_incremental(&p, next,  							  osdc->osdmap, -							  osdc->client->msgr); +							  &osdc->client->msgr);  			if (IS_ERR(newmap)) {  				err = PTR_ERR(newmap);  				goto bad; @@ -1255,6 +2027,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)  				ceph_osdmap_destroy(osdc->osdmap);  				osdc->osdmap = newmap;  			} +			was_full = was_full || +				ceph_osdmap_flag(osdc->osdmap, +						 CEPH_OSDMAP_FULL); +			kick_requests(osdc, 0, was_full);  		} else {  			dout("ignoring incremental map %u len %d\n",  			     epoch, maplen); @@ -1281,8 +2057,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)  			     "older than our %u\n", epoch, maplen,  			     osdc->osdmap->epoch);  		} else { +			int skipped_map = 0; +  			dout("taking full map %u len %d\n", epoch, maplen); -			newmap = osdmap_decode(&p, p+maplen); +			newmap = ceph_osdmap_decode(&p, p+maplen);  			if (IS_ERR(newmap)) {  				err = PTR_ERR(newmap);  				goto bad; @@ -1290,18 +2068,39 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)  			BUG_ON(!newmap);  			oldmap = osdc->osdmap;  			osdc->osdmap = newmap; -			if (oldmap) +			if (oldmap) { +				if (oldmap->epoch + 1 < newmap->epoch) +					skipped_map = 1;  				ceph_osdmap_destroy(oldmap); +			} +			was_full = was_full || +				ceph_osdmap_flag(osdc->osdmap, +						 CEPH_OSDMAP_FULL); +			kick_requests(osdc, skipped_map, was_full);  		}  		p += maplen;  		nr_maps--;  	} +	if (!osdc->osdmap) +		goto bad;  done:  	downgrade_write(&osdc->map_sem);  	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); -	if (newmap) -		kick_requests(osdc, NULL); + +	/* +	 * subscribe to subsequent osdmap updates if full to ensure +	 * we find out when we are no longer full and stop returning +	 * ENOSPC. +	 */ +	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || +		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || +		ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) +		ceph_monc_request_next_osdmap(&osdc->client->monc); + +	mutex_lock(&osdc->request_mutex); +	__send_queued(osdc); +	mutex_unlock(&osdc->request_mutex);  	up_read(&osdc->map_sem);  	wake_up_all(&osdc->client->auth_wq);  	return; @@ -1310,49 +2109,322 @@ bad:  	pr_err("osdc handle_map corrupt msg\n");  	ceph_msg_dump(msg);  	up_write(&osdc->map_sem); +} + +/* + * watch/notify callback event infrastructure + * + * These callbacks are used both for watch and notify operations. 
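+ * + * A minimal usage sketch (hypothetical caller; error handling and the + * watch osd op itself are elided): + * + *	void my_cb(u64 ver, u64 notify_id, u8 opcode, void *data); + * + *	ret = ceph_osdc_create_event(osdc, my_cb, my_data, &event); + *	... + *	ceph_osdc_cancel_event(event);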
+ */ +static void __release_event(struct kref *kref) +{ +	struct ceph_osd_event *event = +		container_of(kref, struct ceph_osd_event, kref); + +	dout("__release_event %p\n", event); +	kfree(event); +} + +static void get_event(struct ceph_osd_event *event) +{ +	kref_get(&event->kref); +} + +void ceph_osdc_put_event(struct ceph_osd_event *event) +{ +	kref_put(&event->kref, __release_event); +} +EXPORT_SYMBOL(ceph_osdc_put_event); + +static void __insert_event(struct ceph_osd_client *osdc, +			     struct ceph_osd_event *new) +{ +	struct rb_node **p = &osdc->event_tree.rb_node; +	struct rb_node *parent = NULL; +	struct ceph_osd_event *event = NULL; + +	while (*p) { +		parent = *p; +		event = rb_entry(parent, struct ceph_osd_event, node); +		if (new->cookie < event->cookie) +			p = &(*p)->rb_left; +		else if (new->cookie > event->cookie) +			p = &(*p)->rb_right; +		else +			BUG(); +	} + +	rb_link_node(&new->node, parent, p); +	rb_insert_color(&new->node, &osdc->event_tree); +} + +static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, +					        u64 cookie) +{ +	struct rb_node **p = &osdc->event_tree.rb_node; +	struct rb_node *parent = NULL; +	struct ceph_osd_event *event = NULL; + +	while (*p) { +		parent = *p; +		event = rb_entry(parent, struct ceph_osd_event, node); +		if (cookie < event->cookie) +			p = &(*p)->rb_left; +		else if (cookie > event->cookie) +			p = &(*p)->rb_right; +		else +			return event; +	} +	return NULL; +} + +static void __remove_event(struct ceph_osd_event *event) +{ +	struct ceph_osd_client *osdc = event->osdc; + +	if (!RB_EMPTY_NODE(&event->node)) { +		dout("__remove_event removed %p\n", event); +		rb_erase(&event->node, &osdc->event_tree); +		ceph_osdc_put_event(event); +	} else { +		dout("__remove_event didn't remove %p\n", event); +	} +} + +int ceph_osdc_create_event(struct ceph_osd_client *osdc, +			   void (*event_cb)(u64, u64, u8, void *), +			   void *data, struct ceph_osd_event **pevent) +{ +	struct ceph_osd_event *event; + +	event = kmalloc(sizeof(*event), GFP_NOIO); +	if (!event) +		return -ENOMEM; + +	dout("create_event %p\n", event); +	event->cb = event_cb; +	event->one_shot = 0; +	event->data = data; +	event->osdc = osdc; +	INIT_LIST_HEAD(&event->osd_node); +	RB_CLEAR_NODE(&event->node); +	kref_init(&event->kref);   /* one ref for us */ +	kref_get(&event->kref);    /* one ref for the caller */ + +	spin_lock(&osdc->event_lock); +	event->cookie = ++osdc->event_count; +	__insert_event(osdc, event); +	spin_unlock(&osdc->event_lock); + +	*pevent = event; +	return 0; +} +EXPORT_SYMBOL(ceph_osdc_create_event); + +void ceph_osdc_cancel_event(struct ceph_osd_event *event) +{ +	struct ceph_osd_client *osdc = event->osdc; + +	dout("cancel_event %p\n", event); +	spin_lock(&osdc->event_lock); +	__remove_event(event); +	spin_unlock(&osdc->event_lock); +	ceph_osdc_put_event(event); /* caller's */ +} +EXPORT_SYMBOL(ceph_osdc_cancel_event); + + +static void do_event_work(struct work_struct *work) +{ +	struct ceph_osd_event_work *event_work = +		container_of(work, struct ceph_osd_event_work, work); +	struct ceph_osd_event *event = event_work->event; +	u64 ver = event_work->ver; +	u64 notify_id = event_work->notify_id; +	u8 opcode = event_work->opcode; + +	dout("do_event_work completing %p\n", event); +	event->cb(ver, notify_id, opcode, event->data); +	dout("do_event_work completed %p\n", event); +	ceph_osdc_put_event(event); +	kfree(event_work); +} + + +/* + * Process osd watch notifications + */ +static void handle_watch_notify(struct ceph_osd_client 
*osdc, +				struct ceph_msg *msg) +{ +	void *p, *end; +	u8 proto_ver; +	u64 cookie, ver, notify_id; +	u8 opcode; +	struct ceph_osd_event *event; +	struct ceph_osd_event_work *event_work; + +	p = msg->front.iov_base; +	end = p + msg->front.iov_len; + +	ceph_decode_8_safe(&p, end, proto_ver, bad); +	ceph_decode_8_safe(&p, end, opcode, bad); +	ceph_decode_64_safe(&p, end, cookie, bad); +	ceph_decode_64_safe(&p, end, ver, bad); +	ceph_decode_64_safe(&p, end, notify_id, bad); + +	spin_lock(&osdc->event_lock); +	event = __find_event(osdc, cookie); +	if (event) { +		BUG_ON(event->one_shot); +		get_event(event); +	} +	spin_unlock(&osdc->event_lock); +	dout("handle_watch_notify cookie %lld ver %lld event %p\n", +	     cookie, ver, event); +	if (event) { +		event_work = kmalloc(sizeof(*event_work), GFP_NOIO); +		if (!event_work) { +			dout("ERROR: could not allocate event_work\n"); +			goto done_err; +		} +		INIT_WORK(&event_work->work, do_event_work); +		event_work->event = event; +		event_work->ver = ver; +		event_work->notify_id = notify_id; +		event_work->opcode = opcode; +		if (!queue_work(osdc->notify_wq, &event_work->work)) { +			dout("WARNING: failed to queue notify event work\n"); +			goto done_err; +		} +	} + +	return; + +done_err: +	ceph_osdc_put_event(event);  	return; + +bad: +	pr_err("osdc handle_watch_notify corrupt msg\n");  }  /* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, +				struct ceph_snap_context *snapc, u64 snap_id, +				struct timespec *mtime) +{ +	struct ceph_msg *msg = req->r_request; +	void *p; +	size_t msg_size; +	int flags = req->r_flags; +	u64 data_len; +	unsigned int i; + +	req->r_snapid = snap_id; +	req->r_snapc = ceph_get_snap_context(snapc); + +	/* encode request */ +	msg->hdr.version = cpu_to_le16(4); + +	p = msg->front.iov_base; +	ceph_encode_32(&p, 1);   /* client_inc  is always 1 */ +	req->r_request_osdmap_epoch = p; +	p += 4; +	req->r_request_flags = p; +	p += 4; +	if (req->r_flags & CEPH_OSD_FLAG_WRITE) +		ceph_encode_timespec(p, mtime); +	p += sizeof(struct ceph_timespec); +	req->r_request_reassert_version = p; +	p += sizeof(struct ceph_eversion); /* will get filled in */ + +	/* oloc */ +	ceph_encode_8(&p, 4); +	ceph_encode_8(&p, 4); +	ceph_encode_32(&p, 8 + 4 + 4); +	req->r_request_pool = p; +	p += 8; +	ceph_encode_32(&p, -1);  /* preferred */ +	ceph_encode_32(&p, 0);   /* key len */ + +	ceph_encode_8(&p, 1); +	req->r_request_pgid = p; +	p += 8 + 4; +	ceph_encode_32(&p, -1);  /* preferred */ + +	/* oid */ +	ceph_encode_32(&p, req->r_base_oid.name_len); +	memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); +	dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, +	     req->r_base_oid.name, req->r_base_oid.name_len); +	p += req->r_base_oid.name_len; + +	/* ops--can imply data */ +	ceph_encode_16(&p, (u16)req->r_num_ops); +	data_len = 0; +	for (i = 0; i < req->r_num_ops; i++) { +		data_len += osd_req_encode_op(req, p, i); +		p += sizeof(struct ceph_osd_op); +	} + +	/* snaps */ +	ceph_encode_64(&p, req->r_snapid); +	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); +	ceph_encode_32(&p, req->r_snapc ? 
req->r_snapc->num_snaps : 0); +	if (req->r_snapc) { +		for (i = 0; i < snapc->num_snaps; i++) { +			ceph_encode_64(&p, req->r_snapc->snaps[i]); +		} +	} + +	req->r_request_attempts = p; +	p += 4; + +	/* data */ +	if (flags & CEPH_OSD_FLAG_WRITE) { +		u16 data_off; + +		/* +		 * The header "data_off" is a hint to the receiver +		 * allowing it to align received data into its +		 * buffers such that there's no need to re-copy +		 * it before writing it to disk (direct I/O). +		 */ +		data_off = (u16) (off & 0xffff); +		req->r_request->hdr.data_off = cpu_to_le16(data_off); +	} +	req->r_request->hdr.data_len = cpu_to_le32(data_len); + +	BUG_ON(p > msg->front.iov_base + msg->front.iov_len); +	msg_size = p - msg->front.iov_base; +	msg->front.iov_len = msg_size; +	msg->hdr.front_len = cpu_to_le32(msg_size); + +	dout("build_request msg_size was %d\n", (int)msg_size); +} +EXPORT_SYMBOL(ceph_osdc_build_request); + +/*   * Register request, send initial attempt.   */  int ceph_osdc_start_request(struct ceph_osd_client *osdc,  			    struct ceph_osd_request *req,  			    bool nofail)  { -	int rc = 0; - -	req->r_request->pages = req->r_pages; -	req->r_request->nr_pages = req->r_num_pages; -#ifdef CONFIG_BLOCK -	req->r_request->bio = req->r_bio; -#endif -	req->r_request->trail = req->r_trail; - -	register_request(osdc, req); +	int rc;  	down_read(&osdc->map_sem);  	mutex_lock(&osdc->request_mutex); -	/* -	 * a racing kick_requests() may have sent the message for us -	 * while we dropped request_mutex above, so only send now if -	 * the request still han't been touched yet. -	 */ -	if (req->r_sent == 0) { -		rc = __send_request(osdc, req); -		if (rc) { -			if (nofail) { -				dout("osdc_start_request failed send, " -				     " marking %lld\n", req->r_tid); -				req->r_resend = true; -				rc = 0; -			} else { -				__unregister_request(osdc, req); -			} -		} -	} + +	rc = __ceph_osdc_start_request(osdc, req, nofail); +  	mutex_unlock(&osdc->request_mutex);  	up_read(&osdc->map_sem); +  	return rc;  }  EXPORT_SYMBOL(ceph_osdc_start_request); @@ -1371,6 +2443,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,  		__cancel_request(req);  		__unregister_request(osdc, req);  		mutex_unlock(&osdc->request_mutex); +		complete_request(req);  		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);  		return rc;  	} @@ -1415,6 +2488,17 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)  EXPORT_SYMBOL(ceph_osdc_sync);  /* + * Call all pending notify callbacks - for use after a watch is + * unregistered, to make sure no more callbacks for it will be invoked + */ +void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) +{ +	flush_workqueue(osdc->notify_wq); +} +EXPORT_SYMBOL(ceph_osdc_flush_notifies); + + +/*   * init, shutdown   */  int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) @@ -1433,9 +2517,15 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)  	INIT_LIST_HEAD(&osdc->osd_lru);  	osdc->requests = RB_ROOT;  	INIT_LIST_HEAD(&osdc->req_lru); +	INIT_LIST_HEAD(&osdc->req_unsent); +	INIT_LIST_HEAD(&osdc->req_notarget); +	INIT_LIST_HEAD(&osdc->req_linger);  	osdc->num_requests = 0;  	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);  	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); +	spin_lock_init(&osdc->event_lock); +	osdc->event_tree = RB_ROOT; +	osdc->event_count = 0;  	schedule_delayed_work(&osdc->osds_timeout_work,  	   round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); @@ -1446,17 +2536,26 @@ 
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)  	if (!osdc->req_mempool)  		goto out; -	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, +	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, +				OSD_OP_FRONT_LEN, 10, true,  				"osd_op");  	if (err < 0)  		goto out_mempool; -	err = ceph_msgpool_init(&osdc->msgpool_op_reply, +	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,  				OSD_OPREPLY_FRONT_LEN, 10, true,  				"osd_op_reply");  	if (err < 0)  		goto out_msgpool; + +	err = -ENOMEM; +	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); +	if (!osdc->notify_wq) +		goto out_msgpool_reply; +  	return 0; +out_msgpool_reply: +	ceph_msgpool_destroy(&osdc->msgpool_op_reply);  out_msgpool:  	ceph_msgpool_destroy(&osdc->msgpool_op);  out_mempool: @@ -1464,22 +2563,22 @@ out_mempool:  out:  	return err;  } -EXPORT_SYMBOL(ceph_osdc_init);  void ceph_osdc_stop(struct ceph_osd_client *osdc)  { +	flush_workqueue(osdc->notify_wq); +	destroy_workqueue(osdc->notify_wq);  	cancel_delayed_work_sync(&osdc->timeout_work);  	cancel_delayed_work_sync(&osdc->osds_timeout_work);  	if (osdc->osdmap) {  		ceph_osdmap_destroy(osdc->osdmap);  		osdc->osdmap = NULL;  	} -	remove_old_osds(osdc, 1); +	remove_all_osds(osdc);  	mempool_destroy(osdc->req_mempool);  	ceph_msgpool_destroy(&osdc->msgpool_op);  	ceph_msgpool_destroy(&osdc->msgpool_op_reply);  } -EXPORT_SYMBOL(ceph_osdc_stop);  /*   * Read some contiguous pages.  If we cross a stripe boundary, shorten @@ -1489,25 +2588,29 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,  			struct ceph_vino vino, struct ceph_file_layout *layout,  			u64 off, u64 *plen,  			u32 truncate_seq, u64 truncate_size, -			struct page **pages, int num_pages) +			struct page **pages, int num_pages, int page_align)  {  	struct ceph_osd_request *req;  	int rc = 0;  	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,  	     vino.snap, off, *plen); -	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, +	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1,  				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, -				    NULL, 0, truncate_seq, truncate_size, NULL, -				    false, 1); -	if (!req) -		return -ENOMEM; +				    NULL, truncate_seq, truncate_size, +				    false); +	if (IS_ERR(req)) +		return PTR_ERR(req);  	/* it may be a short read due to an object boundary */ -	req->r_pages = pages; -	dout("readpages  final extent is %llu~%llu (%d pages)\n", -	     off, *plen, req->r_num_pages); +	osd_req_op_extent_osd_data_pages(req, 0, +				pages, *plen, page_align, false, false); + +	dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n", +	     off, *plen, *plen, page_align); + +	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);  	rc = ceph_osdc_start_request(osdc, req, false);  	if (!rc) @@ -1528,29 +2631,29 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,  			 u64 off, u64 len,  			 u32 truncate_seq, u64 truncate_size,  			 struct timespec *mtime, -			 struct page **pages, int num_pages, -			 int flags, int do_sync, bool nofail) +			 struct page **pages, int num_pages)  {  	struct ceph_osd_request *req;  	int rc = 0; +	int page_align = off & ~PAGE_MASK; -	BUG_ON(vino.snap != CEPH_NOSNAP); -	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, +	BUG_ON(vino.snap != CEPH_NOSNAP);	/* snapshots aren't writeable */ +	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1,  				    CEPH_OSD_OP_WRITE, -				    
flags | CEPH_OSD_FLAG_ONDISK | -					    CEPH_OSD_FLAG_WRITE, -				    snapc, do_sync, -				    truncate_seq, truncate_size, mtime, -				    nofail, 1); -	if (!req) -		return -ENOMEM; +				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, +				    snapc, truncate_seq, truncate_size, +				    true); +	if (IS_ERR(req)) +		return PTR_ERR(req);  	/* it may be a short write due to an object boundary */ -	req->r_pages = pages; -	dout("writepages %llu~%llu (%d pages)\n", off, len, -	     req->r_num_pages); +	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, +				false, false); +	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); + +	ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); -	rc = ceph_osdc_start_request(osdc, req, nofail); +	rc = ceph_osdc_start_request(osdc, req, true);  	if (!rc)  		rc = ceph_osdc_wait_request(osdc, req); @@ -1562,6 +2665,26 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,  }  EXPORT_SYMBOL(ceph_osdc_writepages); +int ceph_osdc_setup(void) +{ +	BUG_ON(ceph_osd_request_cache); +	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", +					sizeof (struct ceph_osd_request), +					__alignof__(struct ceph_osd_request), +					0, NULL); + +	return ceph_osd_request_cache ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(ceph_osdc_setup); + +void ceph_osdc_cleanup(void) +{ +	BUG_ON(!ceph_osd_request_cache); +	kmem_cache_destroy(ceph_osd_request_cache); +	ceph_osd_request_cache = NULL; +} +EXPORT_SYMBOL(ceph_osdc_cleanup); +  /*   * handle incoming message   */ @@ -1582,6 +2705,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)  	case CEPH_MSG_OSD_OPREPLY:  		handle_reply(osdc, msg, con);  		break; +	case CEPH_MSG_WATCH_NOTIFY: +		handle_watch_notify(osdc, msg); +		break;  	default:  		pr_err("received unknown message type %d %s\n", type, @@ -1603,7 +2729,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,  	struct ceph_osd_client *osdc = osd->o_osdc;  	struct ceph_msg *m;  	struct ceph_osd_request *req; -	int front = le32_to_cpu(hdr->front_len); +	int front_len = le32_to_cpu(hdr->front_len);  	int data_len = le32_to_cpu(hdr->data_len);  	u64 tid; @@ -1613,23 +2739,23 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,  	if (!req) {  		*skip = 1;  		m = NULL; -		pr_info("get_reply unknown tid %llu from osd%d\n", tid, -			osd->o_osd); +		dout("get_reply unknown tid %llu from osd%d\n", tid, +		     osd->o_osd);  		goto out;  	} -	if (req->r_con_filling_msg) { -		dout("get_reply revoking msg %p from old con %p\n", -		     req->r_reply, req->r_con_filling_msg); -		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); -		ceph_con_put(req->r_con_filling_msg); -		req->r_con_filling_msg = NULL; -	} - -	if (front > req->r_reply->front.iov_len) { -		pr_warning("get_reply front %d > preallocated %d\n", -			   front, (int)req->r_reply->front.iov_len); -		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); +	if (req->r_reply->con) +		dout("%s revoking msg %p from old con %p\n", __func__, +		     req->r_reply, req->r_reply->con); +	ceph_msg_revoke_incoming(req->r_reply); + +	if (front_len > req->r_reply->front_alloc_len) { +		pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n", +			   front_len, req->r_reply->front_alloc_len, +			   (unsigned int)con->peer_name.type, +			   le64_to_cpu(con->peer_name.num)); +		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, +				 false);  		if (!m)  			goto out;  		ceph_msg_put(req->r_reply); @@ -1638,25 +2764,29 
@@ static struct ceph_msg *get_reply(struct ceph_connection *con,  	m = ceph_msg_get(req->r_reply);  	if (data_len > 0) { -		unsigned data_off = le16_to_cpu(hdr->data_off); -		int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); - -		if (unlikely(req->r_num_pages < want)) { -			pr_warning("tid %lld reply %d > expected %d pages\n", -				   tid, want, m->nr_pages); -			*skip = 1; -			ceph_msg_put(m); -			m = NULL; -			goto out; +		struct ceph_osd_data *osd_data; + +		/* +		 * XXX This is assuming there is only one op containing +		 * XXX page data.  Probably OK for reads, but this +		 * XXX ought to be done more generally. +		 */ +		osd_data = osd_req_op_extent_osd_data(req, 0); +		if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { +			if (osd_data->pages && +				unlikely(osd_data->length < data_len)) { + +				pr_warning("tid %lld reply has %d bytes " +					"we had only %llu bytes ready\n", +					tid, data_len, osd_data->length); +				*skip = 1; +				ceph_msg_put(m); +				m = NULL; +				goto out; +			}  		} -		m->pages = req->r_pages; -		m->nr_pages = req->r_num_pages; -#ifdef CONFIG_BLOCK -		m->bio = req->r_bio; -#endif  	}  	*skip = 0; -	req->r_con_filling_msg = ceph_con_get(con);  	dout("get_reply tid %lld %p\n", tid, m);  out: @@ -1673,9 +2803,11 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,  	int type = le16_to_cpu(hdr->type);  	int front = le32_to_cpu(hdr->front_len); +	*skip = 0;  	switch (type) {  	case CEPH_MSG_OSD_MAP: -		return ceph_msg_new(type, front, GFP_NOFS); +	case CEPH_MSG_WATCH_NOTIFY: +		return ceph_msg_new(type, front, GFP_NOFS, false);  	case CEPH_MSG_OSD_OPREPLY:  		return get_reply(con, hdr, skip);  	default: @@ -1706,37 +2838,36 @@ static void put_osd_con(struct ceph_connection *con)  /*   * authentication   */ -static int get_authorizer(struct ceph_connection *con, -			  void **buf, int *len, int *proto, -			  void **reply_buf, int *reply_len, int force_new) +/* + * Note: returned pointer is the address of a structure that's + * managed separately.  Caller must *not* attempt to free it. 
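+ * + * (The handshake lives in the ceph_osd as o_auth; its authorizer is + * destroyed via ceph_auth_destroy_authorizer() in put_osd().)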
+ */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, +					int *proto, int force_new)  {  	struct ceph_osd *o = con->private;  	struct ceph_osd_client *osdc = o->o_osdc;  	struct ceph_auth_client *ac = osdc->client->monc.auth; -	int ret = 0; +	struct ceph_auth_handshake *auth = &o->o_auth; -	if (force_new && o->o_authorizer) { -		ac->ops->destroy_authorizer(ac, o->o_authorizer); -		o->o_authorizer = NULL; -	} -	if (o->o_authorizer == NULL) { -		ret = ac->ops->create_authorizer( -			ac, CEPH_ENTITY_TYPE_OSD, -			&o->o_authorizer, -			&o->o_authorizer_buf, -			&o->o_authorizer_buf_len, -			&o->o_authorizer_reply_buf, -			&o->o_authorizer_reply_buf_len); +	if (force_new && auth->authorizer) { +		ceph_auth_destroy_authorizer(ac, auth->authorizer); +		auth->authorizer = NULL; +	} +	if (!auth->authorizer) { +		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, +						      auth);  		if (ret) -			return ret; +			return ERR_PTR(ret); +	} else { +		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, +						     auth); +		if (ret) +			return ERR_PTR(ret);  	} -  	*proto = ac->protocol; -	*buf = o->o_authorizer_buf; -	*len = o->o_authorizer_buf_len; -	*reply_buf = o->o_authorizer_reply_buf; -	*reply_len = o->o_authorizer_reply_buf_len; -	return 0; + +	return auth;  } @@ -1746,7 +2877,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)  	struct ceph_osd_client *osdc = o->o_osdc;  	struct ceph_auth_client *ac = osdc->client->monc.auth; -	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); +	return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);  }  static int invalidate_authorizer(struct ceph_connection *con) @@ -1755,9 +2886,7 @@ static int invalidate_authorizer(struct ceph_connection *con)  	struct ceph_osd_client *osdc = o->o_osdc;  	struct ceph_auth_client *ac = osdc->client->monc.auth; -	if (ac->ops->invalidate_authorizer) -		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); - +	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);  	return ceph_monc_validate_auth(&osdc->client->monc);  } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d73f3f6efa3..c547e46084d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -13,32 +13,24 @@  char *ceph_osdmap_state_str(char *str, int len, int state)  { -	int flag = 0; -  	if (!len) -		goto done; - -	*str = '\0'; -	if (state) { -		if (state & CEPH_OSD_EXISTS) { -			snprintf(str, len, "exists"); -			flag = 1; -		} -		if (state & CEPH_OSD_UP) { -			snprintf(str, len, "%s%s%s", str, (flag ? 
", " : ""), -				 "up"); -			flag = 1; -		} -	} else { +		return str; + +	if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) +		snprintf(str, len, "exists, up"); +	else if (state & CEPH_OSD_EXISTS) +		snprintf(str, len, "exists"); +	else if (state & CEPH_OSD_UP) +		snprintf(str, len, "up"); +	else  		snprintf(str, len, "doesn't exist"); -	} -done: +  	return str;  }  /* maps */ -static int calc_bits_of(unsigned t) +static int calc_bits_of(unsigned int t)  {  	int b = 0;  	while (t) { @@ -53,13 +45,8 @@ static int calc_bits_of(unsigned t)   */  static void calc_pg_masks(struct ceph_pg_pool_info *pi)  { -	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; -	pi->pgp_num_mask = -		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; -	pi->lpg_num_mask = -		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; -	pi->lpgp_num_mask = -		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; +	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; +	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;  }  /* @@ -135,6 +122,21 @@ bad:  	return -EINVAL;  } +static int skip_name_map(void **p, void *end) +{ +        int len; +        ceph_decode_32_safe(p, end, len ,bad); +        while (len--) { +                int strlen; +                *p += sizeof(u32); +                ceph_decode_32_safe(p, end, strlen, bad); +                *p += strlen; +} +        return 0; +bad: +        return -EINVAL; +} +  static struct crush_map *crush_decode(void *pbyval, void *end)  {  	struct crush_map *c; @@ -143,6 +145,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)  	void **p = &pbyval;  	void *start = pbyval;  	u32 magic; +	u32 num_name_maps;  	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); @@ -150,24 +153,23 @@ static struct crush_map *crush_decode(void *pbyval, void *end)  	if (c == NULL)  		return ERR_PTR(-ENOMEM); +        /* set tunables to default values */ +        c->choose_local_tries = 2; +        c->choose_local_fallback_tries = 5; +        c->choose_total_tries = 19; +	c->chooseleaf_descend_once = 0; +  	ceph_decode_need(p, end, 4*sizeof(u32), bad);  	magic = ceph_decode_32(p);  	if (magic != CRUSH_MAGIC) {  		pr_err("crush_decode magic %x != current %x\n", -		       (unsigned)magic, (unsigned)CRUSH_MAGIC); +		       (unsigned int)magic, (unsigned int)CRUSH_MAGIC);  		goto bad;  	}  	c->max_buckets = ceph_decode_32(p);  	c->max_rules = ceph_decode_32(p);  	c->max_devices = ceph_decode_32(p); -	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); -	if (c->device_parents == NULL) -		goto badmem; -	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); -	if (c->bucket_parents == NULL) -		goto badmem; -  	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);  	if (c->buckets == NULL)  		goto badmem; @@ -283,7 +285,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)  		ceph_decode_32_safe(p, end, yes, bad);  #if BITS_PER_LONG == 32  		err = -EINVAL; -		if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) +		if (yes > (ULONG_MAX - sizeof(*r)) +			  / sizeof(struct crush_rule_step))  			goto bad;  #endif  		r = c->rules[i] = kmalloc(sizeof(*r) + @@ -303,7 +306,35 @@ static struct crush_map *crush_decode(void *pbyval, void *end)  	}  	/* ignore trailing name maps. 
*/ +        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { +                err = skip_name_map(p, end); +                if (err < 0) +                        goto done; +        } + +        /* tunables */ +        ceph_decode_need(p, end, 3*sizeof(u32), done); +        c->choose_local_tries = ceph_decode_32(p); +        c->choose_local_fallback_tries =  ceph_decode_32(p); +        c->choose_total_tries = ceph_decode_32(p); +        dout("crush decode tunable choose_local_tries = %d", +             c->choose_local_tries); +        dout("crush decode tunable choose_local_fallback_tries = %d", +             c->choose_local_fallback_tries); +        dout("crush decode tunable choose_total_tries = %d", +             c->choose_total_tries); + +	ceph_decode_need(p, end, sizeof(u32), done); +	c->chooseleaf_descend_once = ceph_decode_32(p); +	dout("crush decode tunable chooseleaf_descend_once = %d", +	     c->chooseleaf_descend_once); + +	ceph_decode_need(p, end, sizeof(u8), done); +	c->chooseleaf_vary_r = ceph_decode_8(p); +	dout("crush decode tunable chooseleaf_vary_r = %d", +	     c->chooseleaf_vary_r); +done:  	dout("crush_decode success\n");  	return c; @@ -317,16 +348,17 @@ bad:  /*   * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) + * to a set of osds) and primary_temp (explicit primary setting)   */  static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)  { -	u64 a = *(u64 *)&l; -	u64 b = *(u64 *)&r; - -	if (a < b) +	if (l.pool < r.pool)  		return -1; -	if (a > b) +	if (l.pool > r.pool) +		return 1; +	if (l.seed < r.seed) +		return -1; +	if (l.seed > r.seed)  		return 1;  	return 0;  } @@ -339,6 +371,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,  	struct ceph_pg_mapping *pg = NULL;  	int c; +	dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);  	while (*p) {  		parent = *p;  		pg = rb_entry(parent, struct ceph_pg_mapping, node); @@ -366,16 +399,34 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,  	while (n) {  		pg = rb_entry(n, struct ceph_pg_mapping, node);  		c = pgid_cmp(pgid, pg->pgid); -		if (c < 0) +		if (c < 0) {  			n = n->rb_left; -		else if (c > 0) +		} else if (c > 0) {  			n = n->rb_right; -		else +		} else { +			dout("__lookup_pg_mapping %lld.%x got %p\n", +			     pgid.pool, pgid.seed, pg);  			return pg; +		}  	}  	return NULL;  } +static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) +{ +	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); + +	if (pg) { +		dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, +		     pg); +		rb_erase(&pg->node, root); +		kfree(pg); +		return 0; +	} +	dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); +	return -ENOENT; +} +  /*   * rbtree of pg pool info   */ @@ -401,7 +452,7 @@ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)  	return 0;  } -static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)  {  	struct ceph_pg_pool_info *pi;  	struct rb_node *n = root->rb_node; @@ -418,6 +469,27 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)  	return NULL;  } +struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) +{ +	return __lookup_pg_pool(&map->pg_pools, id); +} + +const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) +{ +	struct ceph_pg_pool_info *pi; + +	if (id == 
CEPH_NOPOOL) +		return NULL; + +	if (WARN_ON_ONCE(id > (u64) INT_MAX)) +		return NULL; + +	pi = __lookup_pg_pool(&map->pg_pools, (int) id); + +	return pi ? pi->name : NULL; +} +EXPORT_SYMBOL(ceph_pg_pool_name_by_id); +  int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)  {  	struct rb_node *rbp; @@ -439,52 +511,109 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)  	kfree(pi);  } -static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) +static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)  { -	unsigned n, m; +	u8 ev, cv; +	unsigned len, num; +	void *pool_end; + +	ceph_decode_need(p, end, 2 + 4, bad); +	ev = ceph_decode_8(p);  /* encoding version */ +	cv = ceph_decode_8(p); /* compat version */ +	if (ev < 5) { +		pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); +		return -EINVAL; +	} +	if (cv > 9) { +		pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); +		return -EINVAL; +	} +	len = ceph_decode_32(p); +	ceph_decode_need(p, end, len, bad); +	pool_end = *p + len; -	ceph_decode_copy(p, &pi->v, sizeof(pi->v)); -	calc_pg_masks(pi); +	pi->type = ceph_decode_8(p); +	pi->size = ceph_decode_8(p); +	pi->crush_ruleset = ceph_decode_8(p); +	pi->object_hash = ceph_decode_8(p); -	/* num_snaps * snap_info_t */ -	n = le32_to_cpu(pi->v.num_snaps); -	while (n--) { -		ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) + -				 sizeof(struct ceph_timespec), bad); -		*p += sizeof(u64) +       /* key */ -			1 + sizeof(u64) + /* u8, snapid */ -			sizeof(struct ceph_timespec); -		m = ceph_decode_32(p);    /* snap name */ -		*p += m; +	pi->pg_num = ceph_decode_32(p); +	pi->pgp_num = ceph_decode_32(p); + +	*p += 4 + 4;  /* skip lpg* */ +	*p += 4;      /* skip last_change */ +	*p += 8 + 4;  /* skip snap_seq, snap_epoch */ + +	/* skip snaps */ +	num = ceph_decode_32(p); +	while (num--) { +		*p += 8;  /* snapid key */ +		*p += 1 + 1; /* versions */ +		len = ceph_decode_32(p); +		*p += len; +	} + +	/* skip removed_snaps */ +	num = ceph_decode_32(p); +	*p += num * (8 + 8); + +	*p += 8;  /* skip auid */ +	pi->flags = ceph_decode_64(p); +	*p += 4;  /* skip crash_replay_interval */ + +	if (ev >= 7) +		*p += 1;  /* skip min_size */ + +	if (ev >= 8) +		*p += 8 + 8;  /* skip quota_max_* */ + +	if (ev >= 9) { +		/* skip tiers */ +		num = ceph_decode_32(p); +		*p += num * 8; + +		*p += 8;  /* skip tier_of */ +		*p += 1;  /* skip cache_mode */ + +		pi->read_tier = ceph_decode_64(p); +		pi->write_tier = ceph_decode_64(p); +	} else { +		pi->read_tier = -1; +		pi->write_tier = -1;  	} -	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; +	/* ignore the rest */ + +	*p = pool_end; +	calc_pg_masks(pi);  	return 0;  bad:  	return -EINVAL;  } -static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)  {  	struct ceph_pg_pool_info *pi; -	u32 num, len, pool; +	u32 num, len; +	u64 pool;  	ceph_decode_32_safe(p, end, num, bad);  	dout(" %d pool names\n", num);  	while (num--) { -		ceph_decode_32_safe(p, end, pool, bad); +		ceph_decode_64_safe(p, end, pool, bad);  		ceph_decode_32_safe(p, end, len, bad); -		dout("  pool %d len %d\n", pool, len); +		dout("  pool %llu len %d\n", pool, len); +		ceph_decode_need(p, end, len, bad);  		pi = __lookup_pg_pool(&map->pg_pools, pool);  		if (pi) { +			char *name = kstrndup(*p, len, GFP_NOFS); + +			if (!name) +				return -ENOMEM;  			kfree(pi->name); -			pi->name = 
kmalloc(len + 1, GFP_NOFS); -			if (pi->name) { -				memcpy(pi->name, *p, len); -				pi->name[len] = '\0'; -				dout("  name is %s\n", pi->name); -			} +			pi->name = name; +			dout("  name is %s\n", pi->name);  		}  		*p += len;  	} @@ -509,6 +638,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)  		rb_erase(&pg->node, &map->pg_temp);  		kfree(pg);  	} +	while (!RB_EMPTY_ROOT(&map->primary_temp)) { +		struct ceph_pg_mapping *pg = +			rb_entry(rb_first(&map->primary_temp), +				 struct ceph_pg_mapping, node); +		rb_erase(&pg->node, &map->primary_temp); +		kfree(pg); +	}  	while (!RB_EMPTY_ROOT(&map->pg_pools)) {  		struct ceph_pg_pool_info *pi =  			rb_entry(rb_first(&map->pg_pools), @@ -518,179 +654,516 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)  	kfree(map->osd_state);  	kfree(map->osd_weight);  	kfree(map->osd_addr); +	kfree(map->osd_primary_affinity);  	kfree(map);  }  /* - * adjust max osd value.  reallocate arrays. + * Adjust max_osd value, (re)allocate arrays. + * + * The new elements are properly initialized.   */  static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)  {  	u8 *state; -	struct ceph_entity_addr *addr;  	u32 *weight; +	struct ceph_entity_addr *addr; +	int i; -	state = kcalloc(max, sizeof(*state), GFP_NOFS); -	addr = kcalloc(max, sizeof(*addr), GFP_NOFS); -	weight = kcalloc(max, sizeof(*weight), GFP_NOFS); -	if (state == NULL || addr == NULL || weight == NULL) { +	state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); +	weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); +	addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); +	if (!state || !weight || !addr) {  		kfree(state); -		kfree(addr);  		kfree(weight); +		kfree(addr); +  		return -ENOMEM;  	} -	/* copy old? */ -	if (map->osd_state) { -		memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); -		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); -		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); -		kfree(map->osd_state); -		kfree(map->osd_addr); -		kfree(map->osd_weight); +	for (i = map->max_osd; i < max; i++) { +		state[i] = 0; +		weight[i] = CEPH_OSD_OUT; +		memset(addr + i, 0, sizeof(*addr));  	}  	map->osd_state = state;  	map->osd_weight = weight;  	map->osd_addr = addr; + +	if (map->osd_primary_affinity) { +		u32 *affinity; + +		affinity = krealloc(map->osd_primary_affinity, +				    max*sizeof(*affinity), GFP_NOFS); +		if (!affinity) +			return -ENOMEM; + +		for (i = map->max_osd; i < max; i++) +			affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + +		map->osd_primary_affinity = affinity; +	} +  	map->max_osd = max; +  	return 0;  } +#define OSDMAP_WRAPPER_COMPAT_VER	7 +#define OSDMAP_CLIENT_DATA_COMPAT_VER	1 + +/* + * Return 0 or error.  On success, *v is set to 0 for old (v6) osdmaps, + * to struct_v of the client_data section for new (v7 and above) + * osdmaps. 
+ */ +static int get_osdmap_client_data_v(void **p, void *end, +				    const char *prefix, u8 *v) +{ +	u8 struct_v; + +	ceph_decode_8_safe(p, end, struct_v, e_inval); +	if (struct_v >= 7) { +		u8 struct_compat; + +		ceph_decode_8_safe(p, end, struct_compat, e_inval); +		if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { +			pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n", +				   struct_v, struct_compat, +				   OSDMAP_WRAPPER_COMPAT_VER, prefix); +			return -EINVAL; +		} +		*p += 4; /* ignore wrapper struct_len */ + +		ceph_decode_8_safe(p, end, struct_v, e_inval); +		ceph_decode_8_safe(p, end, struct_compat, e_inval); +		if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { +			pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n", +				   struct_v, struct_compat, +				   OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); +			return -EINVAL; +		} +		*p += 4; /* ignore client data struct_len */ +	} else { +		u16 version; + +		*p -= 1; +		ceph_decode_16_safe(p, end, version, e_inval); +		if (version < 6) { +			pr_warning("got v %d < 6 of %s ceph_osdmap\n", version, +				   prefix); +			return -EINVAL; +		} + +		/* old osdmap enconding */ +		struct_v = 0; +	} + +	*v = struct_v; +	return 0; + +e_inval: +	return -EINVAL; +} + +static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, +			  bool incremental) +{ +	u32 n; + +	ceph_decode_32_safe(p, end, n, e_inval); +	while (n--) { +		struct ceph_pg_pool_info *pi; +		u64 pool; +		int ret; + +		ceph_decode_64_safe(p, end, pool, e_inval); + +		pi = __lookup_pg_pool(&map->pg_pools, pool); +		if (!incremental || !pi) { +			pi = kzalloc(sizeof(*pi), GFP_NOFS); +			if (!pi) +				return -ENOMEM; + +			pi->id = pool; + +			ret = __insert_pg_pool(&map->pg_pools, pi); +			if (ret) { +				kfree(pi); +				return ret; +			} +		} + +		ret = decode_pool(p, end, pi); +		if (ret) +			return ret; +	} + +	return 0; + +e_inval: +	return -EINVAL; +} + +static int decode_pools(void **p, void *end, struct ceph_osdmap *map) +{ +	return __decode_pools(p, end, map, false); +} + +static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) +{ +	return __decode_pools(p, end, map, true); +} + +static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, +			    bool incremental) +{ +	u32 n; + +	ceph_decode_32_safe(p, end, n, e_inval); +	while (n--) { +		struct ceph_pg pgid; +		u32 len, i; +		int ret; + +		ret = ceph_decode_pgid(p, end, &pgid); +		if (ret) +			return ret; + +		ceph_decode_32_safe(p, end, len, e_inval); + +		ret = __remove_pg_mapping(&map->pg_temp, pgid); +		BUG_ON(!incremental && ret != -ENOENT); + +		if (!incremental || len > 0) { +			struct ceph_pg_mapping *pg; + +			ceph_decode_need(p, end, len*sizeof(u32), e_inval); + +			if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) +				return -EINVAL; + +			pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); +			if (!pg) +				return -ENOMEM; + +			pg->pgid = pgid; +			pg->pg_temp.len = len; +			for (i = 0; i < len; i++) +				pg->pg_temp.osds[i] = ceph_decode_32(p); + +			ret = __insert_pg_mapping(pg, &map->pg_temp); +			if (ret) { +				kfree(pg); +				return ret; +			} +		} +	} + +	return 0; + +e_inval: +	return -EINVAL; +} + +static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ +	return __decode_pg_temp(p, end, map, false); +} + +static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ +	return __decode_pg_temp(p, end, map, true); +} + +static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, +				 bool 
incremental) +{ +	u32 n; + +	ceph_decode_32_safe(p, end, n, e_inval); +	while (n--) { +		struct ceph_pg pgid; +		u32 osd; +		int ret; + +		ret = ceph_decode_pgid(p, end, &pgid); +		if (ret) +			return ret; + +		ceph_decode_32_safe(p, end, osd, e_inval); + +		ret = __remove_pg_mapping(&map->primary_temp, pgid); +		BUG_ON(!incremental && ret != -ENOENT); + +		if (!incremental || osd != (u32)-1) { +			struct ceph_pg_mapping *pg; + +			pg = kzalloc(sizeof(*pg), GFP_NOFS); +			if (!pg) +				return -ENOMEM; + +			pg->pgid = pgid; +			pg->primary_temp.osd = osd; + +			ret = __insert_pg_mapping(pg, &map->primary_temp); +			if (ret) { +				kfree(pg); +				return ret; +			} +		} +	} + +	return 0; + +e_inval: +	return -EINVAL; +} + +static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) +{ +	return __decode_primary_temp(p, end, map, false); +} + +static int decode_new_primary_temp(void **p, void *end, +				   struct ceph_osdmap *map) +{ +	return __decode_primary_temp(p, end, map, true); +} + +u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) +{ +	BUG_ON(osd >= map->max_osd); + +	if (!map->osd_primary_affinity) +		return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + +	return map->osd_primary_affinity[osd]; +} + +static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) +{ +	BUG_ON(osd >= map->max_osd); + +	if (!map->osd_primary_affinity) { +		int i; + +		map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), +						    GFP_NOFS); +		if (!map->osd_primary_affinity) +			return -ENOMEM; + +		for (i = 0; i < map->max_osd; i++) +			map->osd_primary_affinity[i] = +			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; +	} + +	map->osd_primary_affinity[osd] = aff; + +	return 0; +} + +static int decode_primary_affinity(void **p, void *end, +				   struct ceph_osdmap *map) +{ +	u32 len, i; + +	ceph_decode_32_safe(p, end, len, e_inval); +	if (len == 0) { +		kfree(map->osd_primary_affinity); +		map->osd_primary_affinity = NULL; +		return 0; +	} +	if (len != map->max_osd) +		goto e_inval; + +	ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); + +	for (i = 0; i < map->max_osd; i++) { +		int ret; + +		ret = set_primary_affinity(map, i, ceph_decode_32(p)); +		if (ret) +			return ret; +	} + +	return 0; + +e_inval: +	return -EINVAL; +} + +static int decode_new_primary_affinity(void **p, void *end, +				       struct ceph_osdmap *map) +{ +	u32 n; + +	ceph_decode_32_safe(p, end, n, e_inval); +	while (n--) { +		u32 osd, aff; +		int ret; + +		ceph_decode_32_safe(p, end, osd, e_inval); +		ceph_decode_32_safe(p, end, aff, e_inval); + +		ret = set_primary_affinity(map, osd, aff); +		if (ret) +			return ret; + +		pr_info("osd%d primary-affinity 0x%x\n", osd, aff); +	} + +	return 0; + +e_inval: +	return -EINVAL; +} +  /*   * decode a full map.   
*/ -struct ceph_osdmap *osdmap_decode(void **p, void *end) +static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)  { -	struct ceph_osdmap *map; -	u16 version; -	u32 len, max, i; -	u8 ev; -	int err = -EINVAL; +	u8 struct_v; +	u32 epoch = 0;  	void *start = *p; -	struct ceph_pg_pool_info *pi; - -	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); +	u32 max; +	u32 len, i; +	int err; -	map = kzalloc(sizeof(*map), GFP_NOFS); -	if (map == NULL) -		return ERR_PTR(-ENOMEM); -	map->pg_temp = RB_ROOT; +	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); -	ceph_decode_16_safe(p, end, version, bad); -	if (version > CEPH_OSDMAP_VERSION) { -		pr_warning("got unknown v %d > %d of osdmap\n", version, -			   CEPH_OSDMAP_VERSION); +	err = get_osdmap_client_data_v(p, end, "full", &struct_v); +	if (err)  		goto bad; -	} -	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); +	/* fsid, epoch, created, modified */ +	ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + +			 sizeof(map->created) + sizeof(map->modified), e_inval);  	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); -	map->epoch = ceph_decode_32(p); +	epoch = map->epoch = ceph_decode_32(p);  	ceph_decode_copy(p, &map->created, sizeof(map->created));  	ceph_decode_copy(p, &map->modified, sizeof(map->modified)); -	ceph_decode_32_safe(p, end, max, bad); -	while (max--) { -		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); -		pi = kzalloc(sizeof(*pi), GFP_NOFS); -		if (!pi) -			goto bad; -		pi->id = ceph_decode_32(p); -		ev = ceph_decode_8(p); /* encoding version */ -		if (ev > CEPH_PG_POOL_VERSION) { -			pr_warning("got unknown v %d > %d of ceph_pg_pool\n", -				   ev, CEPH_PG_POOL_VERSION); -			kfree(pi); -			goto bad; -		} -		err = __decode_pool(p, end, pi); -		if (err < 0) -			goto bad; -		__insert_pg_pool(&map->pg_pools, pi); -	} +	/* pools */ +	err = decode_pools(p, end, map); +	if (err) +		goto bad; -	if (version >= 5 && __decode_pool_names(p, end, map) < 0) +	/* pool_name */ +	err = decode_pool_names(p, end, map); +	if (err)  		goto bad; -	ceph_decode_32_safe(p, end, map->pool_max, bad); +	ceph_decode_32_safe(p, end, map->pool_max, e_inval); -	ceph_decode_32_safe(p, end, map->flags, bad); +	ceph_decode_32_safe(p, end, map->flags, e_inval); -	max = ceph_decode_32(p); +	/* max_osd */ +	ceph_decode_32_safe(p, end, max, e_inval);  	/* (re)alloc osd arrays */  	err = osdmap_set_max_osd(map, max); -	if (err < 0) +	if (err)  		goto bad; -	dout("osdmap_decode max_osd = %d\n", map->max_osd); -	/* osds */ -	err = -EINVAL; +	/* osd_state, osd_weight, osd_addrs->client_addr */  	ceph_decode_need(p, end, 3*sizeof(u32) +  			 map->max_osd*(1 + sizeof(*map->osd_weight) + -				       sizeof(*map->osd_addr)), bad); -	*p += 4; /* skip length field (should match max) */ +				       sizeof(*map->osd_addr)), e_inval); + +	if (ceph_decode_32(p) != map->max_osd) +		goto e_inval; +  	ceph_decode_copy(p, map->osd_state, map->max_osd); -	*p += 4; /* skip length field (should match max) */ +	if (ceph_decode_32(p) != map->max_osd) +		goto e_inval; +  	for (i = 0; i < map->max_osd; i++)  		map->osd_weight[i] = ceph_decode_32(p); -	*p += 4; /* skip length field (should match max) */ +	if (ceph_decode_32(p) != map->max_osd) +		goto e_inval; +  	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));  	for (i = 0; i < map->max_osd; i++)  		ceph_decode_addr(&map->osd_addr[i]);  	/* pg_temp */ -	ceph_decode_32_safe(p, end, len, bad); -	for (i = 0; i < len; i++) { -		int n, j; -		struct ceph_pg 
pgid; -		struct ceph_pg_mapping *pg; - -		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); -		ceph_decode_copy(p, &pgid, sizeof(pgid)); -		n = ceph_decode_32(p); -		ceph_decode_need(p, end, n * sizeof(u32), bad); -		err = -ENOMEM; -		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); -		if (!pg) +	err = decode_pg_temp(p, end, map); +	if (err) +		goto bad; + +	/* primary_temp */ +	if (struct_v >= 1) { +		err = decode_primary_temp(p, end, map); +		if (err)  			goto bad; -		pg->pgid = pgid; -		pg->len = n; -		for (j = 0; j < n; j++) -			pg->osds[j] = ceph_decode_32(p); +	} -		err = __insert_pg_mapping(pg, &map->pg_temp); +	/* primary_affinity */ +	if (struct_v >= 2) { +		err = decode_primary_affinity(p, end, map);  		if (err)  			goto bad; -		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); +	} else { +		/* XXX can this happen? */ +		kfree(map->osd_primary_affinity); +		map->osd_primary_affinity = NULL;  	}  	/* crush */ -	ceph_decode_32_safe(p, end, len, bad); -	dout("osdmap_decode crush len %d from off 0x%x\n", len, -	     (int)(*p - start)); -	ceph_decode_need(p, end, len, bad); -	map->crush = crush_decode(*p, end); -	*p += len; +	ceph_decode_32_safe(p, end, len, e_inval); +	map->crush = crush_decode(*p, min(*p + len, end));  	if (IS_ERR(map->crush)) {  		err = PTR_ERR(map->crush);  		map->crush = NULL;  		goto bad;  	} +	*p += len; -	/* ignore the rest of the map */ +	/* ignore the rest */  	*p = end; -	dout("osdmap_decode done %p %p\n", *p, end); -	return map; +	dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); +	return 0; +e_inval: +	err = -EINVAL;  bad: -	dout("osdmap_decode fail\n"); -	ceph_osdmap_destroy(map); -	return ERR_PTR(err); +	pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", +	       err, epoch, (int)(*p - start), *p, start, end); +	print_hex_dump(KERN_DEBUG, "osdmap: ", +		       DUMP_PREFIX_OFFSET, 16, 1, +		       start, end - start, true); +	return err; +} + +/* + * Allocate and decode a full map. 
+ */ +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +{ +	struct ceph_osdmap *map; +	int ret; + +	map = kzalloc(sizeof(*map), GFP_NOFS); +	if (!map) +		return ERR_PTR(-ENOMEM); + +	map->pg_temp = RB_ROOT; +	map->primary_temp = RB_ROOT; +	mutex_init(&map->crush_scratch_mutex); + +	ret = osdmap_decode(p, end, map); +	if (ret) { +		ceph_osdmap_destroy(map); +		return ERR_PTR(ret); +	} + +	return map;  }  /* @@ -704,45 +1177,47 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,  	struct ceph_fsid fsid;  	u32 epoch = 0;  	struct ceph_timespec modified; -	u32 len, pool; -	__s32 new_pool_max, new_flags, max; +	s32 len; +	u64 pool; +	__s64 new_pool_max; +	__s32 new_flags, max;  	void *start = *p; -	int err = -EINVAL; -	u16 version; -	struct rb_node *rbp; +	int err; +	u8 struct_v; + +	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); -	ceph_decode_16_safe(p, end, version, bad); -	if (version > CEPH_OSDMAP_INC_VERSION) { -		pr_warning("got unknown v %d > %d of inc osdmap\n", version, -			   CEPH_OSDMAP_INC_VERSION); +	err = get_osdmap_client_data_v(p, end, "inc", &struct_v); +	if (err)  		goto bad; -	} -	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), -			 bad); +	/* fsid, epoch, modified, new_pool_max, new_flags */ +	ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + +			 sizeof(u64) + sizeof(u32), e_inval);  	ceph_decode_copy(p, &fsid, sizeof(fsid));  	epoch = ceph_decode_32(p);  	BUG_ON(epoch != map->epoch+1);  	ceph_decode_copy(p, &modified, sizeof(modified)); -	new_pool_max = ceph_decode_32(p); +	new_pool_max = ceph_decode_64(p);  	new_flags = ceph_decode_32(p);  	/* full map? */ -	ceph_decode_32_safe(p, end, len, bad); +	ceph_decode_32_safe(p, end, len, e_inval);  	if (len > 0) {  		dout("apply_incremental full map len %d, %p to %p\n",  		     len, *p, end); -		return osdmap_decode(p, min(*p+len, end)); +		return ceph_osdmap_decode(p, min(*p+len, end));  	}  	/* new crush? */ -	ceph_decode_32_safe(p, end, len, bad); +	ceph_decode_32_safe(p, end, len, e_inval);  	if (len > 0) { -		dout("apply_incremental new crush map len %d, %p to %p\n", -		     len, *p, end);  		newcrush = crush_decode(*p, min(*p+len, end)); -		if (IS_ERR(newcrush)) -			return ERR_CAST(newcrush); +		if (IS_ERR(newcrush)) { +			err = PTR_ERR(newcrush); +			newcrush = NULL; +			goto bad; +		}  		*p += len;  	} @@ -752,18 +1227,16 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,  	if (new_pool_max >= 0)  		map->pool_max = new_pool_max; -	ceph_decode_need(p, end, 5*sizeof(u32), bad); -  	/* new max? 
*/ -	max = ceph_decode_32(p); +	ceph_decode_32_safe(p, end, max, e_inval);  	if (max >= 0) {  		err = osdmap_set_max_osd(map, max); -		if (err < 0) +		if (err)  			goto bad;  	}  	map->epoch++; -	map->modified = map->modified; +	map->modified = modified;  	if (newcrush) {  		if (map->crush)  			crush_destroy(map->crush); @@ -771,56 +1244,34 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,  		newcrush = NULL;  	} -	/* new_pool */ -	ceph_decode_32_safe(p, end, len, bad); -	while (len--) { -		__u8 ev; -		struct ceph_pg_pool_info *pi; +	/* new_pools */ +	err = decode_new_pools(p, end, map); +	if (err) +		goto bad; -		ceph_decode_32_safe(p, end, pool, bad); -		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); -		ev = ceph_decode_8(p);  /* encoding version */ -		if (ev > CEPH_PG_POOL_VERSION) { -			pr_warning("got unknown v %d > %d of ceph_pg_pool\n", -				   ev, CEPH_PG_POOL_VERSION); -			goto bad; -		} -		pi = __lookup_pg_pool(&map->pg_pools, pool); -		if (!pi) { -			pi = kzalloc(sizeof(*pi), GFP_NOFS); -			if (!pi) { -				err = -ENOMEM; -				goto bad; -			} -			pi->id = pool; -			__insert_pg_pool(&map->pg_pools, pi); -		} -		err = __decode_pool(p, end, pi); -		if (err < 0) -			goto bad; -	} -	if (version >= 5 && __decode_pool_names(p, end, map) < 0) +	/* new_pool_names */ +	err = decode_pool_names(p, end, map); +	if (err)  		goto bad;  	/* old_pool */ -	ceph_decode_32_safe(p, end, len, bad); +	ceph_decode_32_safe(p, end, len, e_inval);  	while (len--) {  		struct ceph_pg_pool_info *pi; -		ceph_decode_32_safe(p, end, pool, bad); +		ceph_decode_64_safe(p, end, pool, e_inval);  		pi = __lookup_pg_pool(&map->pg_pools, pool);  		if (pi)  			__remove_pg_pool(&map->pg_pools, pi);  	}  	/* new_up */ -	err = -EINVAL; -	ceph_decode_32_safe(p, end, len, bad); +	ceph_decode_32_safe(p, end, len, e_inval);  	while (len--) {  		u32 osd;  		struct ceph_entity_addr addr; -		ceph_decode_32_safe(p, end, osd, bad); -		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); +		ceph_decode_32_safe(p, end, osd, e_inval); +		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);  		ceph_decode_addr(&addr);  		pr_info("osd%d up\n", osd);  		BUG_ON(osd >= map->max_osd); @@ -828,22 +1279,27 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,  		map->osd_addr[osd] = addr;  	} -	/* new_down */ -	ceph_decode_32_safe(p, end, len, bad); +	/* new_state */ +	ceph_decode_32_safe(p, end, len, e_inval);  	while (len--) {  		u32 osd; -		ceph_decode_32_safe(p, end, osd, bad); +		u8 xorstate; +		ceph_decode_32_safe(p, end, osd, e_inval); +		xorstate = **(u8 **)p;  		(*p)++;  /* clean flag */ -		pr_info("osd%d down\n", osd); +		if (xorstate == 0) +			xorstate = CEPH_OSD_UP; +		if (xorstate & CEPH_OSD_UP) +			pr_info("osd%d down\n", osd);  		if (osd < map->max_osd) -			map->osd_state[osd] &= ~CEPH_OSD_UP; +			map->osd_state[osd] ^= xorstate;  	}  	/* new_weight */ -	ceph_decode_32_safe(p, end, len, bad); +	ceph_decode_32_safe(p, end, len, e_inval);  	while (len--) {  		u32 osd, off; -		ceph_decode_need(p, end, sizeof(u32)*2, bad); +		ceph_decode_need(p, end, sizeof(u32)*2, e_inval);  		osd = ceph_decode_32(p);  		off = ceph_decode_32(p);  		pr_info("osd%d weight 0x%x %s\n", osd, off, @@ -854,67 +1310,35 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,  	}  	/* new_pg_temp */ -	rbp = rb_first(&map->pg_temp); -	ceph_decode_32_safe(p, end, len, bad); -	while (len--) { -		struct ceph_pg_mapping *pg; -		int j; -		struct ceph_pg pgid; -		u32 pglen; -		ceph_decode_need(p, end, 
sizeof(u64) + sizeof(u32), bad); -		ceph_decode_copy(p, &pgid, sizeof(pgid)); -		pglen = ceph_decode_32(p); - -		/* remove any? */ -		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, -						node)->pgid, pgid) <= 0) { -			struct ceph_pg_mapping *cur = -				rb_entry(rbp, struct ceph_pg_mapping, node); - -			rbp = rb_next(rbp); -			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); -			rb_erase(&cur->node, &map->pg_temp); -			kfree(cur); -		} +	err = decode_new_pg_temp(p, end, map); +	if (err) +		goto bad; -		if (pglen) { -			/* insert */ -			ceph_decode_need(p, end, pglen*sizeof(u32), bad); -			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); -			if (!pg) { -				err = -ENOMEM; -				goto bad; -			} -			pg->pgid = pgid; -			pg->len = pglen; -			for (j = 0; j < pglen; j++) -				pg->osds[j] = ceph_decode_32(p); -			err = __insert_pg_mapping(pg, &map->pg_temp); -			if (err) { -				kfree(pg); -				goto bad; -			} -			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, -			     pglen); -		} +	/* new_primary_temp */ +	if (struct_v >= 1) { +		err = decode_new_primary_temp(p, end, map); +		if (err) +			goto bad;  	} -	while (rbp) { -		struct ceph_pg_mapping *cur = -			rb_entry(rbp, struct ceph_pg_mapping, node); - -		rbp = rb_next(rbp); -		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); -		rb_erase(&cur->node, &map->pg_temp); -		kfree(cur); + +	/* new_primary_affinity */ +	if (struct_v >= 2) { +		err = decode_new_primary_affinity(p, end, map); +		if (err) +			goto bad;  	}  	/* ignore the rest */  	*p = end; + +	dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);  	return map; +e_inval: +	err = -EINVAL;  bad: -	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", -	       epoch, (int)(*p - start), *p, start, end); +	pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", +	       err, epoch, (int)(*p - start), *p, start, end);  	print_hex_dump(KERN_DEBUG, "osdmap: ",  		       DUMP_PREFIX_OFFSET, 16, 1,  		       start, end - start, true); @@ -934,8 +1358,8 @@ bad:   * for now, we write only a single su, until we can   * pass a stride back to the caller.   
*/ -void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, -				   u64 off, u64 *plen, +int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, +				   u64 off, u64 len,  				   u64 *ono,  				   u64 *oxoff, u64 *oxlen)  { @@ -946,13 +1370,19 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,  	u32 su_per_object;  	u64 t, su_offset; -	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen, +	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, len,  	     osize, su); +	if (su == 0 || sc == 0) +		goto invalid;  	su_per_object = osize / su; +	if (su_per_object == 0) +		goto invalid;  	dout("osize %u / su %u = su_per_object %u\n", osize, su,  	     su_per_object); -	BUG_ON((su & ~PAGE_MASK) != 0); +	if ((su & ~PAGE_MASK) != 0) +		goto invalid; +  	/* bl = *off / su; */  	t = off;  	do_div(t, su); @@ -964,7 +1394,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,  	objsetno = stripeno / su_per_object;  	*ono = objsetno * sc + stripepos; -	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono); +	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);  	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */  	t = off; @@ -973,138 +1403,315 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,  	/*  	 * Calculate the length of the extent being written to the selected -	 * object. This is the minimum of the full length requested (plen) or +	 * object. This is the minimum of the full length requested (len) or  	 * the remainder of the current stripe being written to.  	 */ -	*oxlen = min_t(u64, *plen, su - su_offset); -	*plen = *oxlen; +	*oxlen = min_t(u64, len, su - su_offset);  	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); +	return 0; + +invalid: +	dout(" invalid layout\n"); +	*ono = 0; +	*oxoff = 0; +	*oxlen = 0; +	return -EINVAL;  }  EXPORT_SYMBOL(ceph_calc_file_object_mapping);  /* - * calculate an object layout (i.e. pgid) from an oid, - * file_layout, and osdmap + * Calculate mapping of a (oloc, oid) pair to a PG.  Should only be + * called with target's (oloc, oid), since tiering isn't taken into + * account.   
*/ -int ceph_calc_object_layout(struct ceph_object_layout *ol, -			    const char *oid, -			    struct ceph_file_layout *fl, -			    struct ceph_osdmap *osdmap) +int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, +			struct ceph_object_locator *oloc, +			struct ceph_object_id *oid, +			struct ceph_pg *pg_out)  { -	unsigned num, num_mask; -	struct ceph_pg pgid; -	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); -	int poolid = le32_to_cpu(fl->fl_pg_pool); -	struct ceph_pg_pool_info *pool; -	unsigned ps; - -	BUG_ON(!osdmap); +	struct ceph_pg_pool_info *pi; -	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); -	if (!pool) +	pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); +	if (!pi)  		return -EIO; -	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); -	if (preferred >= 0) { -		ps += preferred; -		num = le32_to_cpu(pool->v.lpg_num); -		num_mask = pool->lpg_num_mask; + +	pg_out->pool = oloc->pool; +	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, +				     oid->name_len); + +	dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, +	     pg_out->pool, pg_out->seed); +	return 0; +} +EXPORT_SYMBOL(ceph_oloc_oid_to_pg); + +static int do_crush(struct ceph_osdmap *map, int ruleno, int x, +		    int *result, int result_max, +		    const __u32 *weight, int weight_max) +{ +	int r; + +	BUG_ON(result_max > CEPH_PG_MAX_SIZE); + +	mutex_lock(&map->crush_scratch_mutex); +	r = crush_do_rule(map->crush, ruleno, x, result, result_max, +			  weight, weight_max, map->crush_scratch_ary); +	mutex_unlock(&map->crush_scratch_mutex); + +	return r; +} + +/* + * Calculate raw (crush) set for given pgid. + * + * Return raw set length, or error. + */ +static int pg_to_raw_osds(struct ceph_osdmap *osdmap, +			  struct ceph_pg_pool_info *pool, +			  struct ceph_pg pgid, u32 pps, int *osds) +{ +	int ruleno; +	int len; + +	/* crush */ +	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, +				 pool->type, pool->size); +	if (ruleno < 0) { +		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", +		       pgid.pool, pool->crush_ruleset, pool->type, +		       pool->size); +		return -ENOENT; +	} + +	len = do_crush(osdmap, ruleno, pps, osds, +		       min_t(int, pool->size, CEPH_PG_MAX_SIZE), +		       osdmap->osd_weight, osdmap->max_osd); +	if (len < 0) { +		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", +		       len, ruleno, pgid.pool, pool->crush_ruleset, +		       pool->type, pool->size); +		return len; +	} + +	return len; +} + +/* + * Given raw set, calculate up set and up primary. + * + * Return up set length.  *primary is set to up primary osd id, or -1 + * if up set is empty. 
+ */ +static int raw_to_up_osds(struct ceph_osdmap *osdmap, +			  struct ceph_pg_pool_info *pool, +			  int *osds, int len, int *primary) +{ +	int up_primary = -1; +	int i; + +	if (ceph_can_shift_osds(pool)) { +		int removed = 0; + +		for (i = 0; i < len; i++) { +			if (ceph_osd_is_down(osdmap, osds[i])) { +				removed++; +				continue; +			} +			if (removed) +				osds[i - removed] = osds[i]; +		} + +		len -= removed; +		if (len > 0) +			up_primary = osds[0];  	} else { -		num = le32_to_cpu(pool->v.pg_num); -		num_mask = pool->pg_num_mask; +		for (i = len - 1; i >= 0; i--) { +			if (ceph_osd_is_down(osdmap, osds[i])) +				osds[i] = CRUSH_ITEM_NONE; +			else +				up_primary = osds[i]; +		}  	} -	pgid.ps = cpu_to_le16(ps); -	pgid.preferred = cpu_to_le16(preferred); -	pgid.pool = fl->fl_pg_pool; -	if (preferred >= 0) -		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, -		     (int)preferred); -	else -		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); +	*primary = up_primary; +	return len; +} -	ol->ol_pgid = pgid; -	ol->ol_stripe_unit = fl->fl_object_stripe_unit; -	return 0; +static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, +				   struct ceph_pg_pool_info *pool, +				   int *osds, int len, int *primary) +{ +	int i; +	int pos = -1; + +	/* +	 * Do we have any non-default primary_affinity values for these +	 * osds? +	 */ +	if (!osdmap->osd_primary_affinity) +		return; + +	for (i = 0; i < len; i++) { +		int osd = osds[i]; + +		if (osd != CRUSH_ITEM_NONE && +		    osdmap->osd_primary_affinity[osd] != +					CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { +			break; +		} +	} +	if (i == len) +		return; + +	/* +	 * Pick the primary.  Feed both the seed (for the pg) and the +	 * osd into the hash/rng so that a proportional fraction of an +	 * osd's pgs get rejected as primary. +	 */ +	for (i = 0; i < len; i++) { +		int osd = osds[i]; +		u32 aff; + +		if (osd == CRUSH_ITEM_NONE) +			continue; + +		aff = osdmap->osd_primary_affinity[osd]; +		if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && +		    (crush_hash32_2(CRUSH_HASH_RJENKINS1, +				    pps, osd) >> 16) >= aff) { +			/* +			 * We chose not to use this primary.  Note it +			 * anyway as a fallback in case we don't pick +			 * anyone else, but keep looking. +			 */ +			if (pos < 0) +				pos = i; +		} else { +			pos = i; +			break; +		} +	} +	if (pos < 0) +		return; + +	*primary = osds[pos]; + +	if (ceph_can_shift_osds(pool) && pos > 0) { +		/* move the new primary to the front */ +		for (i = pos; i > 0; i--) +			osds[i] = osds[i - 1]; +		osds[0] = *primary; +	}  } -EXPORT_SYMBOL(ceph_calc_object_layout);  /* - * Calculate raw osd vector for the given pgid.  Return pointer to osd - * array, or NULL on failure. + * Given up set, apply pg_temp and primary_temp mappings. + * + * Return acting set length.  *primary is set to acting primary osd id, + * or -1 if acting set is empty.   */ -static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, -			int *osds, int *num) +static int apply_temps(struct ceph_osdmap *osdmap, +		       struct ceph_pg_pool_info *pool, struct ceph_pg pgid, +		       int *osds, int len, int *primary)  {  	struct ceph_pg_mapping *pg; -	struct ceph_pg_pool_info *pool; -	int ruleno; -	unsigned poolid, ps, pps; -	int preferred; +	int temp_len; +	int temp_primary; +	int i; + +	/* raw_pg -> pg */ +	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, +				    pool->pg_num_mask);  	/* pg_temp? 
*/  	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);  	if (pg) { -		*num = pg->len; -		return pg->osds; -	} - -	/* crush */ -	poolid = le32_to_cpu(pgid.pool); -	ps = le16_to_cpu(pgid.ps); -	preferred = (s16)le16_to_cpu(pgid.preferred); - -	/* don't forcefeed bad device ids to crush */ -	if (preferred >= osdmap->max_osd || -	    preferred >= osdmap->crush->max_devices) -		preferred = -1; +		temp_len = 0; +		temp_primary = -1; + +		for (i = 0; i < pg->pg_temp.len; i++) { +			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { +				if (ceph_can_shift_osds(pool)) +					continue; +				else +					osds[temp_len++] = CRUSH_ITEM_NONE; +			} else { +				osds[temp_len++] = pg->pg_temp.osds[i]; +			} +		} -	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); -	if (!pool) -		return NULL; -	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, -				 pool->v.type, pool->v.size); -	if (ruleno < 0) { -		pr_err("no crush rule pool %d ruleset %d type %d size %d\n", -		       poolid, pool->v.crush_ruleset, pool->v.type, -		       pool->v.size); -		return NULL; +		/* apply pg_temp's primary */ +		for (i = 0; i < temp_len; i++) { +			if (osds[i] != CRUSH_ITEM_NONE) { +				temp_primary = osds[i]; +				break; +			} +		} +	} else { +		temp_len = len; +		temp_primary = *primary;  	} -	if (preferred >= 0) -		pps = ceph_stable_mod(ps, -				      le32_to_cpu(pool->v.lpgp_num), -				      pool->lpgp_num_mask); -	else -		pps = ceph_stable_mod(ps, -				      le32_to_cpu(pool->v.pgp_num), -				      pool->pgp_num_mask); -	pps += poolid; -	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds, -			     min_t(int, pool->v.size, *num), -			     preferred, osdmap->osd_weight); -	return osds; +	/* primary_temp? */ +	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); +	if (pg) +		temp_primary = pg->primary_temp.osd; + +	*primary = temp_primary; +	return temp_len;  }  /* - * Return acting set for given pgid. + * Calculate acting set for given pgid. + * + * Return acting set length, or error.  *primary is set to acting + * primary osd id, or -1 if acting set is empty or on error.   */  int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, -			int *acting) +			int *osds, int *primary)  { -	int rawosds[CEPH_PG_MAX_SIZE], *osds; -	int i, o, num = CEPH_PG_MAX_SIZE; +	struct ceph_pg_pool_info *pool; +	u32 pps; +	int len; -	osds = calc_pg_raw(osdmap, pgid, rawosds, &num); -	if (!osds) -		return -1; +	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); +	if (!pool) { +		*primary = -1; +		return -ENOENT; +	} + +	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { +		/* hash pool id and seed so that pool PGs do not overlap */ +		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, +				     ceph_stable_mod(pgid.seed, pool->pgp_num, +						     pool->pgp_num_mask), +				     pgid.pool); +	} else { +		/* +		 * legacy behavior: add ps and pool together.  this is +		 * not a great approach because the PGs from each pool +		 * will overlap on top of each other: 0.5 == 1.4 == +		 * 2.3 == ... 
+		 */ +		pps = ceph_stable_mod(pgid.seed, pool->pgp_num, +				      pool->pgp_num_mask) + +			(unsigned)pgid.pool; +	} -	/* primary is first up osd */ -	o = 0; -	for (i = 0; i < num; i++) -		if (ceph_osd_is_up(osdmap, osds[i])) -			acting[o++] = osds[i]; -	return o; +	len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); +	if (len < 0) { +		*primary = -1; +		return len; +	} + +	len = raw_to_up_osds(osdmap, pool, osds, len, primary); + +	apply_primary_affinity(osdmap, pps, pool, osds, len, primary); + +	len = apply_temps(osdmap, pool, pgid, osds, len, primary); + +	return len;  }  /* @@ -1112,17 +1719,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,   */  int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)  { -	int rawosds[CEPH_PG_MAX_SIZE], *osds; -	int i, num = CEPH_PG_MAX_SIZE; +	int osds[CEPH_PG_MAX_SIZE]; +	int primary; -	osds = calc_pg_raw(osdmap, pgid, rawosds, &num); -	if (!osds) -		return -1; +	ceph_calc_pg_acting(osdmap, pgid, osds, &primary); -	/* primary is first up osd */ -	for (i = 0; i < num; i++) -		if (ceph_osd_is_up(osdmap, osds[i])) -			return osds[i]; -	return -1; +	return primary;  }  EXPORT_SYMBOL(ceph_calc_pg_primary); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 13cb409a7bb..92866bebb65 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -1,4 +1,3 @@ -  #include <linux/module.h>  #include <linux/gfp.h>  #include <linux/pagemap.h> @@ -72,8 +71,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)  }  EXPORT_SYMBOL(ceph_pagelist_append); -/** - * Allocate enough pages for a pagelist to append the given amount +/* Allocate enough pages for a pagelist to append the given amount   * of data without without allocating.   * Returns: 0 on success, -ENOMEM on error.   */ @@ -95,9 +93,7 @@ int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)  }  EXPORT_SYMBOL(ceph_pagelist_reserve); -/** - * Free any pages that have been preallocated. - */ +/* Free any pages that have been preallocated. */  int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)  {  	while (!list_empty(&pl->free_list)) { @@ -112,9 +108,7 @@ int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)  }  EXPORT_SYMBOL(ceph_pagelist_free_reserve); -/** - * Create a truncation point. - */ +/* Create a truncation point. */  void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,  			      struct ceph_pagelist_cursor *c)  { @@ -124,8 +118,7 @@ void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,  }  EXPORT_SYMBOL(ceph_pagelist_set_cursor); -/** - * Truncate a pagelist to the given point. Move extra pages to reserve. +/* Truncate a pagelist to the given point. Move extra pages to reserve.   * This won't sleep.   
* Returns: 0 on success,   *          -EINVAL if the pagelist doesn't match the trunc point pagelist @@ -140,8 +133,8 @@ int ceph_pagelist_truncate(struct ceph_pagelist *pl,  	ceph_pagelist_unmap_tail(pl);  	while (pl->head.prev != c->page_lru) {  		page = list_entry(pl->head.prev, struct page, lru); -		list_del(&page->lru);                /* remove from pagelist */ -		list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ +		/* move from pagelist to reserve */ +		list_move_tail(&page->lru, &pl->free_list);  		++pl->num_pages_free;  	}  	pl->room = c->room; diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 54caf068715..555013034f7 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -12,38 +12,51 @@  /*   * build a vector of user pages   */ -struct page **ceph_get_direct_page_vector(const char __user *data, -						 int num_pages, -						 loff_t off, size_t len) +struct page **ceph_get_direct_page_vector(const void __user *data, +					  int num_pages, bool write_page)  {  	struct page **pages; -	int rc; +	int got = 0; +	int rc = 0;  	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);  	if (!pages)  		return ERR_PTR(-ENOMEM);  	down_read(¤t->mm->mmap_sem); -	rc = get_user_pages(current, current->mm, (unsigned long)data, -			    num_pages, 0, 0, pages, NULL); +	while (got < num_pages) { +		rc = get_user_pages(current, current->mm, +		    (unsigned long)data + ((unsigned long)got * PAGE_SIZE), +		    num_pages - got, write_page, 0, pages + got, NULL); +		if (rc < 0) +			break; +		BUG_ON(rc == 0); +		got += rc; +	}  	up_read(¤t->mm->mmap_sem);  	if (rc < 0)  		goto fail;  	return pages;  fail: -	kfree(pages); +	ceph_put_page_vector(pages, got, false);  	return ERR_PTR(rc);  }  EXPORT_SYMBOL(ceph_get_direct_page_vector); -void ceph_put_page_vector(struct page **pages, int num_pages) +void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)  {  	int i; -	for (i = 0; i < num_pages; i++) +	for (i = 0; i < num_pages; i++) { +		if (dirty) +			set_page_dirty_lock(pages[i]);  		put_page(pages[i]); -	kfree(pages); +	} +	if (is_vmalloc_addr(pages)) +		vfree(pages); +	else +		kfree(pages);  }  EXPORT_SYMBOL(ceph_put_page_vector); @@ -83,7 +96,7 @@ EXPORT_SYMBOL(ceph_alloc_page_vector);   * copy user data into a page vector   */  int ceph_copy_user_to_page_vector(struct page **pages, -					 const char __user *data, +					 const void __user *data,  					 loff_t off, size_t len)  {  	int i = 0; @@ -108,17 +121,17 @@ int ceph_copy_user_to_page_vector(struct page **pages,  }  EXPORT_SYMBOL(ceph_copy_user_to_page_vector); -int ceph_copy_to_page_vector(struct page **pages, -				    const char *data, +void ceph_copy_to_page_vector(struct page **pages, +				    const void *data,  				    loff_t off, size_t len)  {  	int i = 0;  	size_t po = off & ~PAGE_CACHE_MASK;  	size_t left = len; -	size_t l;  	while (left > 0) { -		l = min_t(size_t, PAGE_CACHE_SIZE-po, left); +		size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); +  		memcpy(page_address(pages[i]) + po, data, l);  		data += l;  		left -= l; @@ -128,21 +141,20 @@ int ceph_copy_to_page_vector(struct page **pages,  			i++;  		}  	} -	return len;  }  EXPORT_SYMBOL(ceph_copy_to_page_vector); -int ceph_copy_from_page_vector(struct page **pages, -				    char *data, +void ceph_copy_from_page_vector(struct page **pages, +				    void *data,  				    loff_t off, size_t len)  {  	int i = 0;  	size_t po = off & ~PAGE_CACHE_MASK;  	size_t left = len; -	size_t l;  	while (left > 0) { -		l = min_t(size_t, PAGE_CACHE_SIZE-po, 
left); +		size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); +  		memcpy(data, page_address(pages[i]) + po, l);  		data += l;  		left -= l; @@ -152,41 +164,10 @@ int ceph_copy_from_page_vector(struct page **pages,  			i++;  		}  	} -	return len;  }  EXPORT_SYMBOL(ceph_copy_from_page_vector);  /* - * copy user data from a page vector into a user pointer - */ -int ceph_copy_page_vector_to_user(struct page **pages, -					 char __user *data, -					 loff_t off, size_t len) -{ -	int i = 0; -	int po = off & ~PAGE_CACHE_MASK; -	int left = len; -	int l, bad; - -	while (left > 0) { -		l = min_t(int, left, PAGE_CACHE_SIZE-po); -		bad = copy_to_user(data, page_address(pages[i]) + po, l); -		if (bad == l) -			return -EFAULT; -		data += l - bad; -		left -= l - bad; -		if (po) { -			po += l - bad; -			if (po == PAGE_CACHE_SIZE) -				po = 0; -		} -		i++; -	} -	return len; -} -EXPORT_SYMBOL(ceph_copy_page_vector_to_user); - -/*   * Zero an extent within a page vector.  Offset is relative to the   * start of the first page.   */ diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c new file mode 100644 index 00000000000..154683f5f14 --- /dev/null +++ b/net/ceph/snapshot.c @@ -0,0 +1,78 @@ +/* + * snapshot.c    Ceph snapshot context utility routines (part of libceph) + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <stddef.h> + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/ceph/libceph.h> + +/* + * Ceph snapshot contexts are reference counted objects, and the + * returned structure holds a single reference.  Acquire additional + * references with ceph_get_snap_context(), and release them with + * ceph_put_snap_context().  When the reference count reaches zero + * the entire structure is freed. + */ + +/* + * Create a new ceph snapshot context large enough to hold the + * indicated number of snapshot ids (which can be 0).  Caller has + * to fill in snapc->seq and snapc->snaps[0..snap_count-1]. + * + * Returns a null pointer if an error occurs. + */ +struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, +						gfp_t gfp_flags) +{ +	struct ceph_snap_context *snapc; +	size_t size; + +	size = sizeof (struct ceph_snap_context); +	size += snap_count * sizeof (snapc->snaps[0]); +	snapc = kzalloc(size, gfp_flags); +	if (!snapc) +		return NULL; + +	atomic_set(&snapc->nref, 1); +	snapc->num_snaps = snap_count; + +	return snapc; +} +EXPORT_SYMBOL(ceph_create_snap_context); + +struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) +{ +	if (sc) +		atomic_inc(&sc->nref); +	return sc; +} +EXPORT_SYMBOL(ceph_get_snap_context); + +void ceph_put_snap_context(struct ceph_snap_context *sc) +{ +	if (!sc) +		return; +	if (atomic_dec_and_test(&sc->nref)) { +		/*printk(" deleting snap_context %p\n", sc);*/ +		kfree(sc); +	} +} +EXPORT_SYMBOL(ceph_put_snap_context);  | 
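A note on the placement math in the osdmap.c hunks above: calc_pg_masks() builds a power-of-two mask from pg_num, and ceph_calc_pg_acting() folds the 32-bit PG seed into [0, pg_num) with ceph_stable_mod() before hashing it or feeding it to CRUSH. The following is a minimal self-contained userspace sketch of that step. calc_bits_of() is copied from the hunk; the body of ceph_stable_mod() lives in a header outside this diff and is quoted from memory, so treat it, and the sample pg_num, as assumptions.

#include <stdio.h>

/* From the osdmap.c hunk above: number of bits needed to represent t. */
static int calc_bits_of(unsigned int t)
{
	int b = 0;

	while (t) {
		b++;
		t >>= 1;
	}
	return b;
}

/*
 * Assumed definition (from the osdmap header, not this diff): a mod that
 * moves as few inputs as possible while b grows toward the next power
 * of two.
 */
static int ceph_stable_mod(int x, int b, int bmask)
{
	if ((x & bmask) < b)
		return x & bmask;
	return x & (bmask >> 1);
}

int main(void)
{
	unsigned int pg_num = 12;	/* deliberately not a power of two */
	int pg_num_mask = (1 << calc_bits_of(pg_num - 1)) - 1;	/* == 15 */
	int seed;

	/*
	 * Seeds whose masked value is below pg_num stay put; the rest
	 * fold into the lower half of the mask.  Only those PGs move
	 * as pg_num is raised toward 16.
	 */
	for (seed = 0; seed < 16; seed++)
		printf("seed %2d -> pg %2d\n", seed,
		       ceph_stable_mod(seed, pg_num, pg_num_mask));
	return 0;
}

With CEPH_POOL_FLAG_HASHPSPOOL set, the folded seed is further mixed with the pool id via crush_hash32_2(), as the ceph_calc_pg_acting() hunk shows, so PGs from different pools stop overlapping the way 0.5 == 1.4 == 2.3 did under the legacy pps = seed + pool scheme.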
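The apply_primary_affinity() hunk above is deterministic rejection sampling: each candidate is kept as primary with probability aff/0x10000, using a hash of (pps, osd) as the randomness, so every client picks the same primary for the same PG. Below is a hedged userspace sketch of just that selection loop. The mixer is a stand-in (the kernel uses crush_hash32_2() with CRUSH_HASH_RJENKINS1), and the 0x10000 max/default affinity value is an assumption; the loop structure itself follows the hunk.

#include <stdio.h>
#include <stdint.h>

#define ITEM_NONE	-1	/* stand-in for CRUSH_ITEM_NONE */
#define MAX_AFFINITY	0x10000	/* assumed max/default primary affinity */

/* Stand-in mixer; the kernel uses crush_hash32_2(CRUSH_HASH_RJENKINS1, ...) */
static uint32_t mix32(uint32_t a, uint32_t b)
{
	uint32_t h = a * 2654435761u ^ b * 2246822519u;

	h ^= h >> 15;
	h *= 2654435761u;
	return h ^ (h >> 13);
}

/* Pick the acting primary from osds[0..len), honoring per-osd affinity. */
static int choose_primary(uint32_t pps, const int *osds, int len,
			  const uint32_t *affinity)
{
	int pos = -1;
	int i;

	for (i = 0; i < len; i++) {
		int osd = osds[i];
		uint32_t aff;

		if (osd == ITEM_NONE)
			continue;

		aff = affinity[osd];
		if (aff < MAX_AFFINITY && (mix32(pps, osd) >> 16) >= aff) {
			/* rejected; remember the first one as a fallback */
			if (pos < 0)
				pos = i;
		} else {
			pos = i;
			break;
		}
	}

	return pos < 0 ? ITEM_NONE : osds[pos];
}

int main(void)
{
	int osds[] = { 3, 7, 1 };
	uint32_t affinity[8];
	uint32_t pps;
	int i;

	for (i = 0; i < 8; i++)
		affinity[i] = MAX_AFFINITY;
	affinity[3] = 0x4000;	/* osd.3 keeps roughly 25% of its primaries */

	for (pps = 0; pps < 8; pps++)
		printf("pps %u -> primary osd%d\n", pps,
		       choose_primary(pps, osds, 3, affinity));
	return 0;
}

Note the behavior the hunk preserves: if every candidate is rejected, the first rejected one still becomes primary, so lowering an osd's affinity sheds primary duty proportionally without ever leaving a PG primaryless.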
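One more sketch, for the ceph_calc_file_object_mapping() change above (BUG_ON() replaced by -EINVAL, and the in/out *plen replaced by a by-value len). This is a self-contained userspace version of the arithmetic for one file offset; the stripe number/position steps (bl / sc, bl % sc) appear only as context lines in the hunk, so they are reconstructed here from the standard Ceph striping layout and should be read as an assumption.

#include <stdio.h>
#include <stdint.h>

/*
 * Map a file extent (off, len) onto the first object it touches, for a
 * layout with stripe unit su, stripe count sc and object size osize.
 * Returns -1 for the degenerate layouts the kernel now rejects.
 */
static int file_to_object_extent(uint32_t su, uint32_t sc, uint32_t osize,
				 uint64_t off, uint64_t len, uint64_t *ono,
				 uint64_t *oxoff, uint64_t *oxlen)
{
	uint32_t su_per_object;
	uint64_t bl, stripeno, stripepos, objsetno, su_offset;

	if (su == 0 || sc == 0)
		return -1;
	su_per_object = osize / su;
	if (su_per_object == 0)
		return -1;

	bl = off / su;		/* which stripe unit of the file */
	stripeno = bl / sc;	/* which stripe (reconstructed, see note) */
	stripepos = bl % sc;	/* object within the stripe set */
	objsetno = stripeno / su_per_object;

	*ono = objsetno * sc + stripepos;
	su_offset = off % su;
	*oxoff = su_offset + (stripeno % su_per_object) * su;
	/* clamp to the end of the current stripe unit, as in the hunk */
	*oxlen = len < su - su_offset ? len : su - su_offset;

	return 0;
}

int main(void)
{
	uint32_t su = 65536, sc = 2, osize = 4 * 65536;
	uint64_t ono, oxoff, oxlen;

	/* 200000 bytes starting 100 bytes into the sixth stripe unit;
	 * prints: ono 1 oxoff 131172 oxlen 65436 */
	if (file_to_object_extent(su, sc, osize, 5 * 65536 + 100, 200000,
				  &ono, &oxoff, &oxlen) == 0)
		printf("ono %llu oxoff %llu oxlen %llu\n",
		       (unsigned long long)ono, (unsigned long long)oxoff,
		       (unsigned long long)oxlen);

	return 0;
}

Since only the first object's extent is reported, a caller covering the whole request loops, advancing off and shrinking len by oxlen on each pass; that is why the new signature could drop the in/out *plen parameter.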