From 182fac2689b769a96e7fc9defcd560c5cca92b1e Mon Sep 17 00:00:00 2001
From: Jim Schutt <jaschut@sandia.gov>
Date: Wed, 29 Feb 2012 08:30:58 -0700
Subject: net/ceph: Only clear SOCK_NOSPACE when there is sufficient space in
 the socket buffer

The Ceph messenger would sometimes queue multiple work items to write
data to a socket when the socket buffer was full.

Fix this problem by making ceph_write_space() use SOCK_NOSPACE in the
same way that net/core/stream.c:sk_stream_write_space() does, i.e.,
clearing it only when sufficient space is available in the socket buffer.

Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Alex Elder <elder@dreamhost.com>
---
 net/ceph/messenger.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ad5b70801f3..d11f91b0545 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -143,16 +143,22 @@ static void ceph_write_space(struct sock *sk)
 	struct ceph_connection *con =
 		(struct ceph_connection *)sk->sk_user_data;
 
-	/* only queue to workqueue if there is data we want to write. */
+	/* only queue to workqueue if there is data we want to write,
+	 * and there is sufficient space in the socket buffer to accept
+	 * more data.  clear SOCK_NOSPACE so that ceph_write_space()
+	 * doesn't get called again until try_write() fills the socket
+	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
+	 * and net/core/stream.c:sk_stream_write_space().
+	 */
 	if (test_bit(WRITE_PENDING, &con->state)) {
-		dout("ceph_write_space %p queueing write work\n", con);
-		queue_con(con);
+		if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+			dout("ceph_write_space %p queueing write work\n", con);
+			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			queue_con(con);
+		}
 	} else {
 		dout("ceph_write_space %p nothing to write\n", con);
 	}
-
-	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
-	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 }
 
 /* socket's state has changed */
-- 
cgit v1.2.3-18-g5258


From 1ce208a6ce030ea6ccd4b13c8cec0a84c0c7a1e9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 12 Jan 2012 17:48:11 -0800
Subject: ceph: don't reset s_cap_ttl to zero

Avoid the need to check for a special zero s_cap_ttl value by just
using (jiffies - 1) as the value assigned to indicate "sometime in
the past."

Signed-off-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 866e8d7ca37..89971e137aa 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -402,7 +402,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 
 	spin_lock_init(&s->s_gen_ttl_lock);
 	s->s_cap_gen = 0;
-	s->s_cap_ttl = 0;
+	s->s_cap_ttl = jiffies - 1;
 
 	spin_lock_init(&s->s_cap_lock);
 	s->s_renew_requested = 0;
@@ -1083,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
 	int wake = 0;
 
 	spin_lock(&session->s_cap_lock);
-	was_stale = is_renew && (session->s_cap_ttl == 0 ||
-				 time_after_eq(jiffies, session->s_cap_ttl));
+	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
 
 	session->s_cap_ttl = session->s_renew_requested +
 		mdsc->mdsmap->m_session_timeout*HZ;
@@ -2332,7 +2331,7 @@ static void handle_session(struct ceph_mds_session *session,
 			session->s_mds);
 		spin_lock(&session->s_gen_ttl_lock);
 		session->s_cap_gen++;
-		session->s_cap_ttl = 0;
+		session->s_cap_ttl = jiffies - 1;
 		spin_unlock(&session->s_gen_ttl_lock);
 		send_renew_caps(mdsc, session);
 		break;
-- 
cgit v1.2.3-18-g5258


From a661fc561190c0ee2d7cfabcfa92204e2b3aa349 Mon Sep 17 00:00:00 2001
From: Amon Ott <ao@m-privacy.de>
Date: Mon, 23 Jan 2012 09:25:23 -0800
Subject: ceph: use 2 instead of 1 as fallback for 32-bit inode number

The root directory of the Ceph mount has inode number 1, so falling back
to 1 always creates a collision. 2 is unused on my test systems and seems
less likely to collide.

Signed-off-by: Amon Ott <ao@m-privacy.de>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1421f3d875a..18d8a866a07 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -367,7 +367,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
 	u32 ino = vino & 0xffffffff;
 	ino ^= vino >> 32;
 	if (!ino)
-		ino = 1;
+		ino = 2;
 	return ino;
 }
 
-- 
cgit v1.2.3-18-g5258


From 810339ec2fae5cbd0164b8acde7fb65652755864 Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Fri, 3 Feb 2012 09:55:36 -0500
Subject: ceph: avoid panic with mismatched symlink sizes in fill_inode()

Return -EINVAL rather than panic if iinfo->symlink_len and inode->i_size
do not match.

Also use kstrndup rather than kmalloc/memcpy.

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
---
 fs/ceph/inode.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2c489378b4c..9fff9f3b17e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode,
 	case S_IFLNK:
 		inode->i_op = &ceph_symlink_iops;
 		if (!ci->i_symlink) {
-			int symlen = iinfo->symlink_len;
+			u32 symlen = iinfo->symlink_len;
 			char *sym;
 
-			BUG_ON(symlen != inode->i_size);
 			spin_unlock(&ci->i_ceph_lock);
 
+			err = -EINVAL;
+			if (WARN_ON(symlen != inode->i_size))
+				goto out;
+
 			err = -ENOMEM;
-			sym = kmalloc(symlen+1, GFP_NOFS);
+			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
 			if (!sym)
 				goto out;
-			memcpy(sym, iinfo->symlink, symlen);
-			sym[symlen] = 0;
 
 			spin_lock(&ci->i_ceph_lock);
 			if (!ci->i_symlink)
-- 
cgit v1.2.3-18-g5258


From 64486697771cbe219fffcb5c8e2ed9ca4fdf086c Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Thu, 16 Feb 2012 11:55:48 -0500
Subject: libceph: fix overflow check in crush_decode()

The existing overflow check (n > ULONG_MAX / b) didn't work, because
n = ULONG_MAX / b would both bypass the check and still overflow the
allocation size a + n * b.

The correct check should be (n > (ULONG_MAX - a) / b).

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/osdmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index fd863fe7693..29ad46ec9dc 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -283,7 +283,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		ceph_decode_32_safe(p, end, yes, bad);
 #if BITS_PER_LONG == 32
 		err = -EINVAL;
-		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+		if (yes > (ULONG_MAX - sizeof(*r))
+			  / sizeof(struct crush_rule_step))
 			goto bad;
 #endif
 		r = c->rules[i] = kmalloc(sizeof(*r) +
-- 
cgit v1.2.3-18-g5258


From 80834312a4da1405a9bc788313c67643de6fcb4c Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Thu, 16 Feb 2012 11:56:29 -0500
Subject: ceph: fix overflow check in build_snap_context()

The overflow check for a + n * b should be (n > (ULONG_MAX - a) / b),
rather than (n > ULONG_MAX / b - a).

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index a559c80f127..f04c0961f99 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 
 	/* alloc new snap context */
 	err = -ENOMEM;
-	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+	if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))
 		goto fail;
 	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
 	if (!snapc)
-- 
cgit v1.2.3-18-g5258


From 5766651971e81298732466c9aa462ff47898ba37 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:27 -0600
Subject: ceph: use a shared zero page rather than one per messenger

Each messenger allocates a page to be used when writing zeroes
out in the event of error or other abnormal condition.  Instead,
use the kernel ZERO_PAGE() for that purpose.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 include/linux/ceph/messenger.h |  1 -
 net/ceph/messenger.c           | 43 ++++++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index ffbeb2c217b..6b5af5f976d 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -54,7 +54,6 @@ struct ceph_connection_operations {
 struct ceph_messenger {
 	struct ceph_entity_inst inst;    /* my name+address */
 	struct ceph_entity_addr my_enc_addr;
-	struct page *zero_page;          /* used in certain error cases */
 
 	bool nocrc;
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d11f91b0545..738356255e0 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -52,6 +52,9 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
 static DEFINE_SPINLOCK(addr_str_lock);
 static int last_addr_str;
 
+static struct page *zero_page;		/* used in certain error cases */
+static void *zero_page_address;		/* kernel virtual addr of zero_page */
+
 const char *ceph_pr_addr(const struct sockaddr_storage *ss)
 {
 	int i;
@@ -99,18 +102,41 @@ struct workqueue_struct *ceph_msgr_wq;
 
 int ceph_msgr_init(void)
 {
+	BUG_ON(zero_page != NULL);
+	zero_page = ZERO_PAGE(0);
+	page_cache_get(zero_page);
+
+	BUG_ON(zero_page_address != NULL);
+	zero_page_address = kmap(zero_page);
+
 	ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
 	if (!ceph_msgr_wq) {
 		pr_err("msgr_init failed to create workqueue\n");
+
+		zero_page_address = NULL;
+		kunmap(zero_page);
+		page_cache_release(zero_page);
+		zero_page = NULL;
+
 		return -ENOMEM;
 	}
+
 	return 0;
 }
 EXPORT_SYMBOL(ceph_msgr_init);
 
 void ceph_msgr_exit(void)
 {
+	BUG_ON(ceph_msgr_wq == NULL);
 	destroy_workqueue(ceph_msgr_wq);
+
+	BUG_ON(zero_page_address == NULL);
+	zero_page_address = NULL;
+
+	BUG_ON(zero_page == NULL);
+	kunmap(zero_page);
+	page_cache_release(zero_page);
+	zero_page = NULL;
 }
 EXPORT_SYMBOL(ceph_msgr_exit);
 
@@ -841,9 +867,9 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 			max_write = bv->bv_len;
 #endif
 		} else {
-			page = con->msgr->zero_page;
+			page = zero_page;
 			if (crc)
-				kaddr = page_address(con->msgr->zero_page);
+				kaddr = zero_page_address;
 		}
 		len = min_t(int, max_write - con->out_msg_pos.page_pos,
 			    total_max_write);
@@ -914,7 +940,7 @@ static int write_partial_skip(struct ceph_connection *con)
 
 	while (con->out_skip > 0) {
 		struct kvec iov = {
-			.iov_base = page_address(con->msgr->zero_page),
+			.iov_base = zero_page_address,
 			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
 		};
 
@@ -2222,15 +2248,6 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
 
 	spin_lock_init(&msgr->global_seq_lock);
 
-	/* the zero page is needed if a request is "canceled" while the message
-	 * is being written over the socket */
-	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
-	if (!msgr->zero_page) {
-		kfree(msgr);
-		return ERR_PTR(-ENOMEM);
-	}
-	kmap(msgr->zero_page);
-
 	if (myaddr)
 		msgr->inst.addr = *myaddr;
 
@@ -2247,8 +2264,6 @@ EXPORT_SYMBOL(ceph_messenger_create);
 void ceph_messenger_destroy(struct ceph_messenger *msgr)
 {
 	dout("destroy %p\n", msgr);
-	kunmap(msgr->zero_page);
-	__free_page(msgr->zero_page);
 	kfree(msgr);
 	dout("destroyed messenger %p\n", msgr);
 }
-- 
cgit v1.2.3-18-g5258


From a5bc3129a296fd4663c3ef0be5575e82453739dd Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:27 -0600
Subject: ceph: make use of "else" where appropriate

Rearrange ceph_tcp_connect() a bit, making use of "else" rather than
re-testing a value with consecutive "if" statements.  Don't record a
connection's socket pointer unless the connect operation is
successful.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 738356255e0..b5536e4e39a 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -251,7 +251,6 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 			       IPPROTO_TCP, &sock);
 	if (ret)
 		return ERR_PTR(ret);
-	con->sock = sock;
 	sock->sk->sk_allocation = GFP_NOFS;
 
 #ifdef CONFIG_LOCKDEP
@@ -268,18 +267,16 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 		dout("connect %s EINPROGRESS sk_state = %u\n",
 		     ceph_pr_addr(&con->peer_addr.in_addr),
 		     sock->sk->sk_state);
-		ret = 0;
-	}
-	if (ret < 0) {
+	} else if (ret < 0) {
 		pr_err("connect %s error %d\n",
 		       ceph_pr_addr(&con->peer_addr.in_addr), ret);
 		sock_release(sock);
-		con->sock = NULL;
 		con->error_msg = "connect error";
-	}
 
-	if (ret < 0)
 		return ERR_PTR(ret);
+	}
+	con->sock = sock;
+
 	return sock;
 }
 
-- 
cgit v1.2.3-18-g5258


From f64a93172b97dcfcfa68f595652220653562f605 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:27 -0600
Subject: ceph: kill addr_str_lock spinlock; use atomic instead

A spinlock is used to protect a value used for selecting an array
index for a string used for formatting a socket address for human
consumption.  The index is reset to 0 if it ever reaches the maximum
index value.

Instead, use an ever-increasing atomic variable as a sequence
number, and compute the array index by masking off all but the
sequence number's lowest bits.  Make the number of entries in the
array a power of two to allow the use of such a mask (to avoid jumps
in the index value when the sequence number wraps).

The length of these strings is somewhat arbitrarily set at 60 bytes.
The worst-case length of a string produced is 54 bytes, for an IPv6
address that can't be shortened, e.g.:
    [1234:5678:9abc:def0:1111:2222:123.234.210.100]:32767
Change it so we arbitrarily use 64 bytes instead; if nothing else
it will make the array of these line up better in hex dumps.

Rename a few things to reinforce the distinction between the number
of strings in the array and the length of individual strings.

Signed-off-by: Alex Elder <elder@newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b5536e4e39a..e86bb3f1485 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -44,13 +44,16 @@ static void con_work(struct work_struct *);
 static void ceph_fault(struct ceph_connection *con);
 
 /*
- * nicely render a sockaddr as a string.
+ * Nicely render a sockaddr as a string.  An array of formatted
+ * strings is used, to approximate reentrancy.
  */
-#define MAX_ADDR_STR 20
-#define MAX_ADDR_STR_LEN 60
-static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
-static DEFINE_SPINLOCK(addr_str_lock);
-static int last_addr_str;
+#define ADDR_STR_COUNT_LOG	5	/* log2(# address strings in array) */
+#define ADDR_STR_COUNT		(1 << ADDR_STR_COUNT_LOG)
+#define ADDR_STR_COUNT_MASK	(ADDR_STR_COUNT - 1)
+#define MAX_ADDR_STR_LEN	64	/* 54 is enough */
+
+static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
+static atomic_t addr_str_seq = ATOMIC_INIT(0);
 
 static struct page *zero_page;		/* used in certain error cases */
 static void *zero_page_address;		/* kernel virtual addr of zero_page */
@@ -62,11 +65,7 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss)
 	struct sockaddr_in *in4 = (void *)ss;
 	struct sockaddr_in6 *in6 = (void *)ss;
 
-	spin_lock(&addr_str_lock);
-	i = last_addr_str++;
-	if (last_addr_str == MAX_ADDR_STR)
-		last_addr_str = 0;
-	spin_unlock(&addr_str_lock);
+	i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
 	s = addr_str[i];
 
 	switch (ss->ss_family) {
-- 
cgit v1.2.3-18-g5258


From bd406145129e8724cc71b65ff2a788dbd4d60c50 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:27 -0600
Subject: ceph: eliminate some needless casts

This eliminates type casts in some places where they are not
required.

Signed-off-by: Alex Elder <elder@newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e86bb3f1485..09a412ba4b7 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -70,13 +70,13 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss)
 
 	switch (ss->ss_family) {
 	case AF_INET:
-		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
-			 (unsigned int)ntohs(in4->sin_port));
+		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
+			 ntohs(in4->sin_port));
 		break;
 
 	case AF_INET6:
-		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
-			 (unsigned int)ntohs(in6->sin6_port));
+		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
+			 ntohs(in6->sin6_port));
 		break;
 
 	default:
@@ -153,8 +153,8 @@ EXPORT_SYMBOL(ceph_msgr_flush);
 /* data available on socket, or listen socket received a connect */
 static void ceph_data_ready(struct sock *sk, int count_unused)
 {
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
+	struct ceph_connection *con = sk->sk_user_data;
+
 	if (sk->sk_state != TCP_CLOSE_WAIT) {
 		dout("ceph_data_ready on %p state = %lu, queueing work\n",
 		     con, con->state);
@@ -189,8 +189,7 @@ static void ceph_write_space(struct sock *sk)
 /* socket's state has changed */
 static void ceph_state_change(struct sock *sk)
 {
-	struct ceph_connection *con =
-		(struct ceph_connection *)sk->sk_user_data;
+	struct ceph_connection *con = sk->sk_user_data;
 
 	dout("ceph_state_change %p state = %lu sk_state = %u\n",
 	     con, con->state, sk->sk_state);
@@ -225,7 +224,7 @@ static void set_sock_callbacks(struct socket *sock,
 			       struct ceph_connection *con)
 {
 	struct sock *sk = sock->sk;
-	sk->sk_user_data = (void *)con;
+	sk->sk_user_data = con;
 	sk->sk_data_ready = ceph_data_ready;
 	sk->sk_write_space = ceph_write_space;
 	sk->sk_state_change = ceph_state_change;
@@ -552,7 +551,7 @@ static void prepare_write_message(struct ceph_connection *con)
 
 	/* fill in crc (except data pages), footer */
 	con->out_msg->hdr.crc =
-		cpu_to_le32(crc32c(0, (void *)&m->hdr,
+		cpu_to_le32(crc32c(0, &m->hdr,
 				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
 	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
 	con->out_msg->footer.front_crc =
@@ -1647,7 +1646,7 @@ static int read_partial_message(struct ceph_connection *con)
 			return ret;
 		con->in_base_pos += ret;
 		if (con->in_base_pos == sizeof(con->in_hdr)) {
-			u32 crc = crc32c(0, (void *)&con->in_hdr,
+			u32 crc = crc32c(0, &con->in_hdr,
 				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
 			if (crc != le32_to_cpu(con->in_hdr.crc)) {
 				pr_err("read_partial_message bad hdr "
-- 
cgit v1.2.3-18-g5258


From 99f0f3b2c4be15784bb4ede33b5f2c3f7861dba7 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:27 -0600
Subject: ceph: eliminate some abusive casts

This fixes some spots where a type cast to (void *) was used as
a universal type hiding mechanism.  Instead, properly cast the
type to the intended target type.

Signed-off-by: Alex Elder <elder@newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/messenger.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 09a412ba4b7..3917847ad8e 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -62,8 +62,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss)
 {
 	int i;
 	char *s;
-	struct sockaddr_in *in4 = (void *)ss;
-	struct sockaddr_in6 *in6 = (void *)ss;
+	struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
+	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
 
 	i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
 	s = addr_str[i];
@@ -1112,8 +1112,8 @@ static void addr_set_port(struct sockaddr_storage *ss, int p)
 static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
 		char delim, const char **ipend)
 {
-	struct sockaddr_in *in4 = (void *)ss;
-	struct sockaddr_in6 *in6 = (void *)ss;
+	struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
+	struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
 
 	memset(ss, 0, sizeof(*ss));
 
-- 
cgit v1.2.3-18-g5258


From b829c1954dbeb42a1277a8cb05943050ee70be94 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:27 -0600
Subject: ceph: don't null-terminate xattr values

For some reason, ceph_setxattr() allocates an extra byte in which a
'\0' is stored past the end of an extended attribute value.  This is
not needed, and is potentially misleading, so get rid of it.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a76f697303d..bfff735091f 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -730,11 +730,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		goto out;
 
 	if (val_len) {
-		newval = kmalloc(val_len + 1, GFP_NOFS);
+		newval = kmemdup(value, val_len, GFP_NOFS);
 		if (!newval)
 			goto out;
-		memcpy(newval, value, val_len);
-		newval[val_len] = '\0';
 	}
 
 	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
-- 
cgit v1.2.3-18-g5258


From 06476a69d8954f36a15ff5ddbfd47bdfcff22791 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:27 -0600
Subject: ceph: pass inode rather than table to ceph_match_vxattr()

All callers of ceph_match_vxattr() determine what to pass as the
first argument by calling ceph_inode_vxattrs(inode).  Just do that
inside ceph_match_vxattr() itself, changing it to take an inode
rather than the vxattr pointer as its first argument.

Also ensure the function works correctly for an empty table (i.e.,
containing only a terminating null entry).

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index bfff735091f..3418615c53e 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -126,14 +126,19 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
 	return NULL;
 }
 
-static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+static struct ceph_vxattr_cb *ceph_match_vxattr(struct inode *inode,
 						const char *name)
 {
-	do {
-		if (strcmp(vxattr->name, name) == 0)
-			return vxattr;
-		vxattr++;
-	} while (vxattr->name);
+	struct ceph_vxattr_cb *vxattr = ceph_inode_vxattrs(inode);
+
+	if (vxattr) {
+		while (vxattr->name) {
+			if (!strcmp(vxattr->name, name))
+				return vxattr;
+			vxattr++;
+		}
+	}
+
 	return NULL;
 }
 
@@ -502,7 +507,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int err;
 	struct ceph_inode_xattr *xattr;
 	struct ceph_vxattr_cb *vxattr = NULL;
@@ -511,8 +515,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 		return -ENODATA;
 
 	/* let's see if a virtual xattr was requested */
-	if (vxattrs)
-		vxattr = ceph_match_vxattr(vxattrs, name);
+	vxattr = ceph_match_vxattr(inode, name);
 
 	spin_lock(&ci->i_ceph_lock);
 	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -698,8 +701,8 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_vxattr_cb *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int err;
 	int name_len = strlen(name);
 	int val_len = size;
@@ -716,12 +719,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
 
-	if (vxattrs) {
-		struct ceph_vxattr_cb *vxattr =
-			ceph_match_vxattr(vxattrs, name);
-		if (vxattr && vxattr->readonly)
-			return -EOPNOTSUPP;
-	}
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;
 
 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
@@ -814,8 +814,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 int ceph_removexattr(struct dentry *dentry, const char *name)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_vxattr_cb *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
 	int issued;
 	int err;
 	int required_blob_size;
@@ -827,12 +827,9 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
 
-	if (vxattrs) {
-		struct ceph_vxattr_cb *vxattr =
-			ceph_match_vxattr(vxattrs, name);
-		if (vxattr && vxattr->readonly)
-			return -EOPNOTSUPP;
-	}
+	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && vxattr->readonly)
+		return -EOPNOTSUPP;
 
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
-- 
cgit v1.2.3-18-g5258


From 22891907193e005923a14384d82d702f6af4f0cf Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:28 -0600
Subject: ceph: use a symbolic name for "ceph." extended attribute namespace

Use symbolic constants to define the top-level prefix for "ceph."
extended attribute names.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 3418615c53e..05bb56f402a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -8,9 +8,12 @@
 #include <linux/xattr.h>
 #include <linux/slab.h>
 
+#define XATTR_CEPH_PREFIX "ceph."
+#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+
 static bool ceph_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, "ceph.", 5) ||
+	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_SECURITY_PREFIX,
 			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
@@ -80,14 +83,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-	{ true, "ceph.dir.entries", ceph_vxattrcb_entries},
-	{ true, "ceph.dir.files", ceph_vxattrcb_files},
-	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
-	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
-	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
-	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
-	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
+	{ true, XATTR_CEPH_PREFIX "dir.entries", ceph_vxattrcb_entries},
+	{ true, XATTR_CEPH_PREFIX "dir.files", ceph_vxattrcb_files},
+	{ true, XATTR_CEPH_PREFIX "dir.subdirs", ceph_vxattrcb_subdirs},
+	{ true, XATTR_CEPH_PREFIX "dir.rentries", ceph_vxattrcb_rentries},
+	{ true, XATTR_CEPH_PREFIX "dir.rfiles", ceph_vxattrcb_rfiles},
+	{ true, XATTR_CEPH_PREFIX "dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+	{ true, XATTR_CEPH_PREFIX "dir.rbytes", ceph_vxattrcb_rbytes},
+	{ true, XATTR_CEPH_PREFIX "dir.rctime", ceph_vxattrcb_rctime},
 	{ true, NULL, NULL }
 };
 
@@ -111,9 +114,9 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-	{ true, "ceph.file.layout", ceph_vxattrcb_layout},
+	{ true, XATTR_CEPH_PREFIX "file.layout", ceph_vxattrcb_layout},
 	/* The following extended attribute name is deprecated */
-	{ true, "ceph.layout", ceph_vxattrcb_layout},
+	{ true, XATTR_CEPH_PREFIX "layout", ceph_vxattrcb_layout},
 	{ true, NULL, NULL }
 };
 
-- 
cgit v1.2.3-18-g5258


From eb78808446aeed8e25b080c66bf823c1f188236d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:28 -0600
Subject: ceph: use macros to normalize vxattr table definitions

Entries in the ceph virtual extended attribute tables all follow a
distinct pattern in their definition.  Enforce this pattern through
the use of a macro.

Also, a null name field signals the end of the table, so make that
be the first field in the ceph_vxattr_cb structure.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 05bb56f402a..38aef476f78 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -25,10 +25,10 @@ static bool ceph_is_valid_xattr(const char *name)
  * statistics and layout metadata.
  */
 struct ceph_vxattr_cb {
-	bool readonly;
 	char *name;
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
+	bool readonly;
 };
 
 /* directories */
@@ -82,16 +82,25 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
 			(long)ci->i_rctime.tv_nsec);
 }
 
+#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
+
+#define XATTR_NAME_CEPH(_type, _name) \
+		{ \
+			.name = CEPH_XATTR_NAME(_type, _name), \
+			.getxattr_cb = ceph_vxattrcb_ ## _name, \
+			.readonly = true, \
+		}
+
 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-	{ true, XATTR_CEPH_PREFIX "dir.entries", ceph_vxattrcb_entries},
-	{ true, XATTR_CEPH_PREFIX "dir.files", ceph_vxattrcb_files},
-	{ true, XATTR_CEPH_PREFIX "dir.subdirs", ceph_vxattrcb_subdirs},
-	{ true, XATTR_CEPH_PREFIX "dir.rentries", ceph_vxattrcb_rentries},
-	{ true, XATTR_CEPH_PREFIX "dir.rfiles", ceph_vxattrcb_rfiles},
-	{ true, XATTR_CEPH_PREFIX "dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-	{ true, XATTR_CEPH_PREFIX "dir.rbytes", ceph_vxattrcb_rbytes},
-	{ true, XATTR_CEPH_PREFIX "dir.rctime", ceph_vxattrcb_rctime},
-	{ true, NULL, NULL }
+	XATTR_NAME_CEPH(dir, entries),
+	XATTR_NAME_CEPH(dir, files),
+	XATTR_NAME_CEPH(dir, subdirs),
+	XATTR_NAME_CEPH(dir, rentries),
+	XATTR_NAME_CEPH(dir, rfiles),
+	XATTR_NAME_CEPH(dir, rsubdirs),
+	XATTR_NAME_CEPH(dir, rbytes),
+	XATTR_NAME_CEPH(dir, rctime),
+	{ 0 }	/* Required table terminator */
 };
 
 /* files */
@@ -114,10 +123,14 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-	{ true, XATTR_CEPH_PREFIX "file.layout", ceph_vxattrcb_layout},
+	XATTR_NAME_CEPH(file, layout),
 	/* The following extended attribute name is deprecated */
-	{ true, XATTR_CEPH_PREFIX "layout", ceph_vxattrcb_layout},
-	{ true, NULL, NULL }
+	{
+		.name = XATTR_CEPH_PREFIX "layout",
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = true,
+	},
+	{ 0 }	/* Required table terminator */
 };
 
 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
-- 
cgit v1.2.3-18-g5258


From 881a5fa20092d221a7c4b365742c959ef4b297ec Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:28 -0600
Subject: ceph: drop "_cb" from name of struct ceph_vxattr_cb

A struct ceph_vxattr_cb does not represent a callback at all, but
rather a virtual extended attribute itself.  Drop the "_cb" suffix
from its name to reflect that.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 38aef476f78..e29c7d3fa40 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -24,7 +24,7 @@ static bool ceph_is_valid_xattr(const char *name)
  * These define virtual xattrs exposing the recursive directory
  * statistics and layout metadata.
  */
-struct ceph_vxattr_cb {
+struct ceph_vxattr {
 	char *name;
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
@@ -91,7 +91,7 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
 			.readonly = true, \
 		}
 
-static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
+static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_NAME_CEPH(dir, entries),
 	XATTR_NAME_CEPH(dir, files),
 	XATTR_NAME_CEPH(dir, subdirs),
@@ -122,7 +122,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 	return ret;
 }
 
-static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+static struct ceph_vxattr ceph_file_vxattrs[] = {
 	XATTR_NAME_CEPH(file, layout),
 	/* The following extended attribute name is deprecated */
 	{
@@ -133,7 +133,7 @@ static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
 	{ 0 }	/* Required table terminator */
 };
 
-static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode))
 		return ceph_dir_vxattrs;
@@ -142,10 +142,10 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
 	return NULL;
 }
 
-static struct ceph_vxattr_cb *ceph_match_vxattr(struct inode *inode,
+static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
 						const char *name)
 {
-	struct ceph_vxattr_cb *vxattr = ceph_inode_vxattrs(inode);
+	struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode);
 
 	if (vxattr) {
 		while (vxattr->name) {
@@ -525,7 +525,7 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int err;
 	struct ceph_inode_xattr *xattr;
-	struct ceph_vxattr_cb *vxattr = NULL;
+	struct ceph_vxattr *vxattr = NULL;
 
 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;
@@ -587,7 +587,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
 	u32 vir_namelen = 0;
 	u32 namelen;
 	int err;
@@ -717,7 +717,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 		  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	struct ceph_vxattr_cb *vxattr;
+	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int err;
 	int name_len = strlen(name);
@@ -830,7 +830,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 int ceph_removexattr(struct dentry *dentry, const char *name)
 {
 	struct inode *inode = dentry->d_inode;
-	struct ceph_vxattr_cb *vxattr;
+	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int issued;
 	int err;
-- 
cgit v1.2.3-18-g5258


From aa4066ed7ba60421423c35f66b789bb3dd21d89e Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:28 -0600
Subject: ceph: encode type in vxattr callback routines

The names of the callback functions used for virtual extended
attributes are based only on the last component of the attribute
name.  Because of the way these are defined, this precludes allowing
a single (lowest) attribute name for different callbacks, dependent
on the type of file being operated on.  (For example, it might be
nice to support both "ceph.dir.layout" and "ceph.file.layout".)

Just change the callback names to avoid this problem.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index e29c7d3fa40..46be30d6d12 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -33,49 +33,49 @@ struct ceph_vxattr {
 
 /* directories */
 
-static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
 					size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
 }
 
-static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
 				      size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_files);
 }
 
-static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
 					size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_subdirs);
 }
 
-static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
 					 size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
 }
 
-static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
 				       size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rfiles);
 }
 
-static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
 					 size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rsubdirs);
 }
 
-static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
 				       size_t size)
 {
 	return snprintf(val, size, "%lld", ci->i_rbytes);
 }
 
-static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 				       size_t size)
 {
 	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
@@ -87,7 +87,7 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
 #define XATTR_NAME_CEPH(_type, _name) \
 		{ \
 			.name = CEPH_XATTR_NAME(_type, _name), \
-			.getxattr_cb = ceph_vxattrcb_ ## _name, \
+			.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
 			.readonly = true, \
 		}
 
@@ -105,7 +105,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 
 /* files */
 
-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
 				   size_t size)
 {
 	int ret;
@@ -127,7 +127,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 	/* The following extended attribute name is deprecated */
 	{
 		.name = XATTR_CEPH_PREFIX "layout",
-		.getxattr_cb = ceph_vxattrcb_layout,
+		.getxattr_cb = ceph_vxattrcb_file_layout,
 		.readonly = true,
 	},
 	{ 0 }	/* Required table terminator */
-- 
cgit v1.2.3-18-g5258


From 3ce6cd1233046eb97d6d2bd5d80c1cd40528ea2f Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:28 -0600
Subject: ceph: avoid repeatedly computing the size of constant vxattr names

All names defined in the directory and file virtual extended
attribute tables are constant, and the size of each is known at
compile time.  So there's no need to compute their length every
time any file's attribute is listed.

Record the length of each string and use it when needed to determine
the space needed to represent them.  In addition, compute the
aggregate size of strings in each table just once at initialization
time.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c |  3 +++
 fs/ceph/super.h |  2 ++
 fs/ceph/xattr.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 00de2c9568c..c3da3b32bdd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -927,6 +927,7 @@ static int __init init_ceph(void)
 	if (ret)
 		goto out;
 
+	ceph_xattr_init();
 	ret = register_filesystem(&ceph_fs_type);
 	if (ret)
 		goto out_icache;
@@ -936,6 +937,7 @@ static int __init init_ceph(void)
 	return 0;
 
 out_icache:
+	ceph_xattr_exit();
 	destroy_caches();
 out:
 	return ret;
@@ -945,6 +947,7 @@ static void __exit exit_ceph(void)
 {
 	dout("exit_ceph\n");
 	unregister_filesystem(&ceph_fs_type);
+	ceph_xattr_exit();
 	destroy_caches();
 }
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 18d8a866a07..fc35036d258 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -733,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
 extern int ceph_removexattr(struct dentry *, const char *);
 extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
 extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+extern void __init ceph_xattr_init(void);
+extern void ceph_xattr_exit(void);
 
 /* caps.c */
 extern const char *ceph_cap_string(int c);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 46be30d6d12..88eaedf78fa 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -26,6 +26,7 @@ static bool ceph_is_valid_xattr(const char *name)
  */
 struct ceph_vxattr {
 	char *name;
+	size_t name_size;	/* strlen(name) + 1 (for '\0') */
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
 	bool readonly;
@@ -87,6 +88,7 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 #define XATTR_NAME_CEPH(_type, _name) \
 		{ \
 			.name = CEPH_XATTR_NAME(_type, _name), \
+			.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
 			.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
 			.readonly = true, \
 		}
@@ -102,6 +104,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_NAME_CEPH(dir, rctime),
 	{ 0 }	/* Required table terminator */
 };
+static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
 
 /* files */
 
@@ -127,11 +130,13 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 	/* The following extended attribute name is deprecated */
 	{
 		.name = XATTR_CEPH_PREFIX "layout",
+		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
 		.getxattr_cb = ceph_vxattrcb_file_layout,
 		.readonly = true,
 	},
 	{ 0 }	/* Required table terminator */
 };
+static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
 
 static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
 {
@@ -142,6 +147,46 @@ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
 	return NULL;
 }
 
+/* Precomputed total name size for a vxattr table; 0 if vxattrs is NULL */
+static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	if (vxattrs == ceph_dir_vxattrs)
+		return ceph_dir_vxattrs_name_size;
+	if (vxattrs == ceph_file_vxattrs)
+		return ceph_file_vxattrs_name_size;
+	BUG_ON(vxattrs);	/* NULL (inode has no vxattr table) is not a bug */
+	return 0;
+}
+
+/*
+ * Compute the aggregate size (including terminating '\0') of all
+ * virtual extended attribute names in the given vxattr table.
+ */
+static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
+{
+	struct ceph_vxattr *vxattr;
+	size_t size = 0;
+
+	for (vxattr = vxattrs; vxattr->name; vxattr++)
+		size += vxattr->name_size;
+
+	return size;
+}
+
+/* Routines called at initialization and exit time */
+
+void __init ceph_xattr_init(void)
+{
+	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
+	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
+}
+
+void ceph_xattr_exit(void)
+{
+	ceph_dir_vxattrs_name_size = 0;
+	ceph_file_vxattrs_name_size = 0;
+}
+
 static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
 						const char *name)
 {
@@ -615,11 +660,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 		goto out;
 
 list_xattr:
-	vir_namelen = 0;
-	/* include virtual dir xattrs */
-	if (vxattrs)
-		for (i = 0; vxattrs[i].name; i++)
-			vir_namelen += strlen(vxattrs[i].name) + 1;
+	/*
+	 * Start with virtual dir xattr names (if any) (including
+	 * terminating '\0' characters for each).
+	 */
+	vir_namelen = ceph_vxattrs_name_size(vxattrs);
+
 	/* adding 1 byte per each variable due to the null termination */
 	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
 	err = -ERANGE;
-- 
cgit v1.2.3-18-g5258


From 18fa8b3feaac772925263b04b1429d80e2dfd779 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Mon, 23 Jan 2012 15:49:28 -0600
Subject: ceph: make ceph_setxattr() and ceph_removexattr() more alike

This patch just rearranges a few bits of code to make more
portions of ceph_setxattr() and ceph_removexattr() identical.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 88eaedf78fa..8294f461ecd 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -765,15 +765,15 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	struct inode *inode = dentry->d_inode;
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	int issued;
 	int err;
+	int dirty;
 	int name_len = strlen(name);
 	int val_len = size;
 	char *newname = NULL;
 	char *newval = NULL;
 	struct ceph_inode_xattr *xattr = NULL;
-	int issued;
 	int required_blob_size;
-	int dirty;
 
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
@@ -804,6 +804,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
+	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	if (!(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
 	__build_xattrs(inode);
@@ -812,7 +813,7 @@ retry:
 
 	if (!ci->i_xattrs.prealloc_blob ||
 	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
-		struct ceph_buffer *blob = NULL;
+		struct ceph_buffer *blob;
 
 		spin_unlock(&ci->i_ceph_lock);
 		dout(" preaallocating new blob size=%d\n", required_blob_size);
@@ -826,12 +827,13 @@ retry:
 		goto retry;
 	}
 
-	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
 	err = __set_xattr(ci, newname, name_len, newval,
 			  val_len, 1, 1, 1, &xattr);
+
 	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
+
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
@@ -895,13 +897,13 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
-	__build_xattrs(inode);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
 	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
 
 	if (!(issued & CEPH_CAP_XATTR_EXCL))
 		goto do_sync;
+	__build_xattrs(inode);
 
 	required_blob_size = __get_required_blob_size(ci, 0, 0);
 
@@ -922,10 +924,10 @@ retry:
 	}
 
 	err = __remove_xattr_by_name(ceph_inode(inode), name);
+
 	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
-
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
-- 
cgit v1.2.3-18-g5258


From 2107978668de13da484f7abc3f03516494c7fca9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Tue, 24 Jan 2012 10:08:36 -0600
Subject: rbd: a few small cleanups

Some minor cleanups in "drivers/block/rbd.c":
    - Use the more meaningful "RBD_MAX_OBJ_NAME_LEN" in place of "96"
      in the definition of RBD_MAX_MD_NAME_LEN.
    - Use DEFINE_SPINLOCK() to define and initialize node_lock.
    - Drop a needless (char *) cast in parse_rbd_opts_token().
    - Make a few minor formatting changes.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 drivers/block/rbd.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a6278e7e61a..b9371f0b953 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -46,7 +46,7 @@
 
 #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
 
-#define RBD_MAX_MD_NAME_LEN	(96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_MD_NAME_LEN	(RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
 #define RBD_MAX_POOL_NAME_LEN	64
 #define RBD_MAX_SNAP_NAME_LEN	32
 #define RBD_MAX_OPT_LEN		1024
@@ -175,7 +175,7 @@ static struct bus_type rbd_bus_type = {
 	.name		= "rbd",
 };
 
-static spinlock_t node_lock;      /* protects client get/put */
+static DEFINE_SPINLOCK(node_lock);      /* protects client get/put */
 
 static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
 static LIST_HEAD(rbd_dev_list);    /* devices */
@@ -324,7 +324,7 @@ static int parse_rbd_opts_token(char *c, void *private)
 	substring_t argstr[MAX_OPT_ARGS];
 	int token, intval, ret;
 
-	token = match_token((char *)c, rbdopt_tokens, argstr);
+	token = match_token(c, rbdopt_tokens, argstr);
 	if (token < 0)
 		return -EINVAL;
 
@@ -372,7 +372,8 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
 
 	ret = ceph_parse_options(&opt, options, mon_addr,
-				 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
+				mon_addr + strlen(mon_addr),
+				parse_rbd_opts_token, rbd_opts);
 	if (ret < 0)
 		goto done_err;
 
@@ -460,15 +461,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 	u32 snap_count = le32_to_cpu(ondisk->snap_count);
 	int ret = -ENOMEM;
 
-	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) {
+	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
 		return -ENXIO;
-	}
 
 	init_rwsem(&header->snap_rwsem);
 	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
-				snap_count *
-				 sizeof(struct rbd_image_snap_ondisk),
+				snap_count * sizeof (*ondisk),
 				gfp_flags);
 	if (!header->snapc)
 		return -ENOMEM;
@@ -498,8 +497,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 	header->snapc->num_snaps = snap_count;
 	header->total_snaps = snap_count;
 
-	if (snap_count &&
-	    allocated_snaps == snap_count) {
+	if (snap_count && allocated_snaps == snap_count) {
 		for (i = 0; i < snap_count; i++) {
 			header->snapc->snaps[i] =
 				le64_to_cpu(ondisk->snaps[i].id);
@@ -2423,7 +2421,7 @@ static int rbd_sysfs_init(void)
 	rbd_bus_type.bus_attrs = rbd_bus_attrs;
 
 	ret = bus_register(&rbd_bus_type);
-	 if (ret < 0)
+	if (ret < 0)
 		return ret;
 
 	ret = device_register(&rbd_root_dev);
@@ -2444,7 +2442,6 @@ int __init rbd_init(void)
 	rc = rbd_sysfs_init();
 	if (rc)
 		return rc;
-	spin_lock_init(&node_lock);
 	pr_info("loaded " DRV_NAME_LONG "\n");
 	return 0;
 }
-- 
cgit v1.2.3-18-g5258


From ee57741c5209154b8ef124bcaa2496da1b69a988 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Tue, 24 Jan 2012 10:08:36 -0600
Subject: rbd: make ceph_parse_options() return a pointer

ceph_parse_options() takes the address of a pointer as an argument
and uses it to return the address of an allocated structure if
successful.  With this interface it is not evident at call sites that
the pointer is always initialized.  Change the interface to return
the address instead (or a pointer-coded error code) to make the
validity of the returned pointer obvious.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 drivers/block/rbd.c          |  6 ++++--
 fs/ceph/super.c              |  6 ++++--
 include/linux/ceph/libceph.h |  2 +-
 net/ceph/ceph_common.c       | 16 ++++++++--------
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b9371f0b953..ed6711e3532 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -371,11 +371,13 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
 
 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
 
-	ret = ceph_parse_options(&opt, options, mon_addr,
+	opt = ceph_parse_options(options, mon_addr,
 				mon_addr + strlen(mon_addr),
 				parse_rbd_opts_token, rbd_opts);
-	if (ret < 0)
+	if (IS_ERR(opt)) {
+		ret = PTR_ERR(opt);
 		goto done_err;
+	}
 
 	spin_lock(&node_lock);
 	rbdc = __rbd_client_find(opt);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c3da3b32bdd..4fab1fdcfa6 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -334,10 +334,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 	*path += 2;
 	dout("server path '%s'\n", *path);
 
-	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+	*popt = ceph_parse_options(options, dev_name, dev_name_end,
 				 parse_fsopt_token, (void *)fsopt);
-	if (err)
+	if (IS_ERR(*popt)) {
+		err = PTR_ERR(*popt);
 		goto out;
+	}
 
 	/* success */
 	*pfsopt = fsopt;
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 95bd8502e71..92eef7c3d3c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -207,7 +207,7 @@ extern struct kmem_cache *ceph_cap_cachep;
 extern struct kmem_cache *ceph_dentry_cachep;
 extern struct kmem_cache *ceph_file_cachep;
 
-extern int ceph_parse_options(struct ceph_options **popt, char *options,
+extern struct ceph_options *ceph_parse_options(char *options,
 			      const char *dev_name, const char *dev_name_end,
 			      int (*parse_extra_token)(char *c, void *private),
 			      void *private);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 761ad9d6cc3..621c3221b39 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -277,10 +277,11 @@ out:
 	return err;
 }
 
-int ceph_parse_options(struct ceph_options **popt, char *options,
-		       const char *dev_name, const char *dev_name_end,
-		       int (*parse_extra_token)(char *c, void *private),
-		       void *private)
+struct ceph_options *
+ceph_parse_options(char *options, const char *dev_name,
+			const char *dev_name_end,
+			int (*parse_extra_token)(char *c, void *private),
+			void *private)
 {
 	struct ceph_options *opt;
 	const char *c;
@@ -289,7 +290,7 @@ int ceph_parse_options(struct ceph_options **popt, char *options,
 
 	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
 	if (!opt)
-		return err;
+		return ERR_PTR(-ENOMEM);
 	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
 				GFP_KERNEL);
 	if (!opt->mon_addr)
@@ -412,12 +413,11 @@ int ceph_parse_options(struct ceph_options **popt, char *options,
 	}
 
 	/* success */
-	*popt = opt;
-	return 0;
+	return opt;
 
 out:
 	ceph_destroy_options(opt);
-	return err;
+	return ERR_PTR(err);
 }
 EXPORT_SYMBOL(ceph_parse_options);
 
-- 
cgit v1.2.3-18-g5258


From 1dbb439913f0fc0bc30d36411a4a3b3202c0aab1 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Tue, 24 Jan 2012 10:08:37 -0600
Subject: rbd: do not duplicate ceph_client pointer in rbd_device

The rbd_device structure maintains a duplicate copy of the
ceph_client pointer maintained in its rbd_client structure.  There
appears to be no good reason for this, and its presence presents a
risk of them getting out of sync or otherwise being misused.  So kill it
off, and use the rbd_client copy only.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 drivers/block/rbd.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ed6711e3532..dcdfe8dbf4f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -140,7 +140,6 @@ struct rbd_device {
 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
 	struct request_queue	*q;
 
-	struct ceph_client	*client;
 	struct rbd_client	*rbd_client;
 
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -388,7 +387,6 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
 		/* using an existing client */
 		kref_get(&rbdc->kref);
 		rbd_dev->rbd_client = rbdc;
-		rbd_dev->client = rbdc->client;
 		spin_unlock(&node_lock);
 		return 0;
 	}
@@ -401,7 +399,6 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
 	}
 
 	rbd_dev->rbd_client = rbdc;
-	rbd_dev->client = rbdc->client;
 	return 0;
 done_err:
 	kfree(rbd_opts);
@@ -435,7 +432,6 @@ static void rbd_put_client(struct rbd_device *rbd_dev)
 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
 	spin_unlock(&node_lock);
 	rbd_dev->rbd_client = NULL;
-	rbd_dev->client = NULL;
 }
 
 /*
@@ -858,6 +854,7 @@ static int rbd_do_request(struct request *rq,
 	struct rbd_request *req_data;
 	struct ceph_osd_request_head *reqhead;
 	struct rbd_image_header *header = &dev->header;
+	struct ceph_osd_client *osdc;
 
 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
 	if (!req_data) {
@@ -876,11 +873,9 @@ static int rbd_do_request(struct request *rq,
 
 	down_read(&header->snap_rwsem);
 
-	req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
-				      snapc,
-				      ops,
-				      false,
-				      GFP_NOIO, pages, bio);
+	osdc = &dev->rbd_client->client->osdc;
+	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+					false, GFP_NOIO, pages, bio);
 	if (!req) {
 		up_read(&header->snap_rwsem);
 		ret = -ENOMEM;
@@ -909,8 +904,8 @@ static int rbd_do_request(struct request *rq,
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_pg_preferred = cpu_to_le32(-1);
 	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
-	ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
-			     ofs, &len, &bno, req, ops);
+	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
+				req, ops);
 
 	ceph_osdc_build_request(req, ofs, &len,
 				ops,
@@ -920,16 +915,16 @@ static int rbd_do_request(struct request *rq,
 	up_read(&header->snap_rwsem);
 
 	if (linger_req) {
-		ceph_osdc_set_request_linger(&dev->client->osdc, req);
+		ceph_osdc_set_request_linger(osdc, req);
 		*linger_req = req;
 	}
 
-	ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
 		goto done_err;
 
 	if (!rbd_cb) {
-		ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+		ret = ceph_osdc_wait_request(osdc, req);
 		if (ver)
 			*ver = le64_to_cpu(req->r_reassert_version.version);
 		dout("reassert_ver=%lld\n",
@@ -1227,7 +1222,7 @@ static int rbd_req_sync_watch(struct rbd_device *dev,
 			      u64 ver)
 {
 	struct ceph_osd_req_op *ops;
-	struct ceph_osd_client *osdc = &dev->client->osdc;
+	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
 
 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
 	if (ret < 0)
@@ -1314,7 +1309,7 @@ static int rbd_req_sync_notify(struct rbd_device *dev,
 		          const char *obj)
 {
 	struct ceph_osd_req_op *ops;
-	struct ceph_osd_client *osdc = &dev->client->osdc;
+	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
 	struct ceph_osd_event *event;
 	struct rbd_notify_info info;
 	int payload_len = sizeof(u32) + sizeof(u32);
@@ -1623,13 +1618,14 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	int ret;
 	void *data, *p, *e;
 	u64 ver;
+	struct ceph_mon_client *monc;
 
 	/* we should create a snapshot only if we're pointing at the head */
 	if (dev->cur_snap)
 		return -EINVAL;
 
-	ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
-				      &new_snapid);
+	monc = &dev->rbd_client->client->monc;
+	ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
 	dout("created snapid=%lld\n", new_snapid);
 	if (ret < 0)
 		return ret;
@@ -1809,7 +1805,8 @@ static ssize_t rbd_client_id_show(struct device *dev,
 {
 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
 
-	return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
+	return sprintf(buf, "client%lld\n",
+			ceph_client_id(rbd_dev->rbd_client->client));
 }
 
 static ssize_t rbd_pool_show(struct device *dev,
@@ -2233,7 +2230,7 @@ static ssize_t rbd_add(struct bus_type *bus,
 	mutex_unlock(&ctl_mutex);
 
 	/* pick the pool */
-	osdc = &rbd_dev->client->osdc;
+	osdc = &rbd_dev->rbd_client->client->osdc;
 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
 	if (rc < 0)
 		goto err_out_client;
@@ -2312,9 +2309,12 @@ static void rbd_dev_release(struct device *dev)
 	struct rbd_device *rbd_dev =
 			container_of(dev, struct rbd_device, dev);
 
-	if (rbd_dev->watch_request)
-		ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
+	if (rbd_dev->watch_request) {
+		struct ceph_client *client = rbd_dev->rbd_client->client;
+
+		ceph_osdc_unregister_linger_request(&client->osdc,
 						    rbd_dev->watch_request);
+	}
 	if (rbd_dev->watch_event)
 		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
 
-- 
cgit v1.2.3-18-g5258


From cc9d734c3d1b39c6a557673469aea26364060226 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 21 Nov 2011 18:19:13 -0800
Subject: rbd: use a single value of snap_name to mean no snap

There's already a constant for this anyway.

Since rbd_header_set_snap() is only used to set the rbd device
snap_name field, just do that within that function rather than
having it take the snap_name as an argument.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>

v2: Changed interface rbd_header_set_snap() so it explicitly updates
    the snap_name in the rbd_device.  Also added a BUILD_BUG_ON()
    to verify the size of the snap_name field is sufficient for
    SNAP_HEAD_NAME.
---
 drivers/block/rbd.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index dcdfe8dbf4f..d8d052d4225 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -553,20 +553,18 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
 	return i;
 }
 
-static int rbd_header_set_snap(struct rbd_device *dev,
-			       const char *snap_name,
-			       u64 *size)
+static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
 {
 	struct rbd_image_header *header = &dev->header;
 	struct ceph_snap_context *snapc = header->snapc;
 	int ret = -ENOENT;
 
+	BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
+
 	down_write(&header->snap_rwsem);
 
-	if (!snap_name ||
-	    !*snap_name ||
-	    strcmp(snap_name, "-") == 0 ||
-	    strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+	if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
+		    sizeof (RBD_SNAP_HEAD_NAME))) {
 		if (header->total_snaps)
 			snapc->seq = header->snap_seq;
 		else
@@ -576,7 +574,7 @@ static int rbd_header_set_snap(struct rbd_device *dev,
 		if (size)
 			*size = header->image_size;
 	} else {
-		ret = snap_by_name(header, snap_name, &snapc->seq, size);
+		ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
 		if (ret < 0)
 			goto done;
 
@@ -1729,7 +1727,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	if (rc)
 		return rc;
 
-	rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+	rc = rbd_header_set_snap(rbd_dev, &total_size);
 	if (rc)
 		return rc;
 
@@ -2215,7 +2213,8 @@ static ssize_t rbd_add(struct bus_type *bus,
 	}
 
 	if (rbd_dev->snap_name[0] == 0)
-		rbd_dev->snap_name[0] = '-';
+		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
+			sizeof (RBD_SNAP_HEAD_NAME));
 
 	rbd_dev->obj_len = strlen(rbd_dev->obj);
 	snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
-- 
cgit v1.2.3-18-g5258


From b7f23c361b65a0bdcc81acd8d38471b7810df3ff Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Sun, 29 Jan 2012 13:57:43 -0600
Subject: rbd: encapsulate new rbd id selection

Move the loop that finds a new unique rbd id to use into
its own helper function.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 drivers/block/rbd.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d8d052d4225..aaa19d8c367 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2149,6 +2149,23 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 	return ret;
 }
 
+/* caller must hold ctl_mutex */
+static int rbd_id_get(void)
+{
+	struct list_head *tmp;
+	int new_id = 0;
+
+	list_for_each(tmp, &rbd_dev_