From cd9d9f5df6098c50726200d4185e9e8da32785b3 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Wed, 4 Apr 2012 13:35:44 -0500
Subject: rbd: don't hold spinlock during messenger flush

A recent change made changes to the rbd_client_list be protected by
a spinlock.  Unfortunately in rbd_put_client(), the lock is taken
before possibly dropping the last reference to an rbd_client, and on
the last reference that eventually calls flush_workqueue() which can
sleep.

The problem was flagged by a debug spinlock warning:
    BUG: spinlock wrong CPU on CPU#3, rbd/27814

The solution is to move the spinlock acquisition and release inside
rbd_client_release(), which is the spot where it's really needed for
protecting the removal of the rbd_client from the client list.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Sage Weil <sage@newdream.net>
---
 drivers/block/rbd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 013c7a549fb..c1f77013165 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -450,7 +450,9 @@ static void rbd_client_release(struct kref *kref)
 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 
 	dout("rbd_release_client %p\n", rbdc);
+	spin_lock(&rbd_client_list_lock);
 	list_del(&rbdc->node);
+	spin_unlock(&rbd_client_list_lock);
 
 	ceph_destroy_client(rbdc->client);
 	kfree(rbdc->rbd_opts);
@@ -463,9 +465,7 @@ static void rbd_client_release(struct kref *kref)
  */
 static void rbd_put_client(struct rbd_device *rbd_dev)
 {
-	spin_lock(&rbd_client_list_lock);
 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
-	spin_unlock(&rbd_client_list_lock);
 	rbd_dev->rbd_client = NULL;
 }
 
-- 
cgit v1.2.3-18-g5258


From 3469ac1aa3a2f1e2586a412923c414779a0af854 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:33:36 -0700
Subject: ceph: drop support for preferred_osd pgs

This was an ill-conceived feature that has been removed from Ceph.  Do
this gracefully:

 - reject attempts to specify a preferred_osd via the ioctl
 - stop exposing this information via virtual xattrs
 - always fill in -1 for requests, in case we talk to an older server
 - don't calculate preferred_osd placements/pgids

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 drivers/block/rbd.c          |  1 -
 fs/ceph/file.c               |  1 -
 fs/ceph/ioctl.c              | 15 +++++---------
 fs/ceph/xattr.c              |  9 ---------
 include/linux/ceph/ceph_fs.h |  4 ++--
 include/linux/ceph/osdmap.h  |  2 --
 net/ceph/osdmap.c            | 47 ++++++++++----------------------------------
 7 files changed, 17 insertions(+), 62 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c1f77013165..a67fa63a966 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -935,7 +935,6 @@ static int rbd_do_request(struct request *rq,
 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
 	layout->fl_stripe_count = cpu_to_le32(1);
 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	layout->fl_pg_preferred = cpu_to_le32(-1);
 	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
 				req, ops);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed72428d9c7..988d4f302e4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -54,7 +54,6 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
 	req->r_fmode = ceph_flags_to_mode(flags);
 	req->r_args.open.flags = cpu_to_le32(flags);
 	req->r_args.open.mode = cpu_to_le32(create_mode);
-	req->r_args.open.preferred = cpu_to_le32(-1);
 out:
 	return req;
 }
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 790914a598d..4feab52c5bf 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -26,8 +26,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
 		l.object_size = ceph_file_layout_object_size(ci->i_layout);
 		l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
-		l.preferred_osd =
-			(s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+		l.preferred_osd = (s32)-1;
 		if (copy_to_user(arg, &l, sizeof(l)))
 			return -EFAULT;
 	}
@@ -49,6 +48,10 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	if (copy_from_user(&l, arg, sizeof(l)))
 		return -EFAULT;
 
+	/* preferred_osd is no longer supported */
+	if (l.preferred_osd != -1)
+		return -EINVAL;
+
 	/* validate changed params against current layout */
 	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
 	if (!err) {
@@ -56,8 +59,6 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 		nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
 		nl.object_size = ceph_file_layout_object_size(ci->i_layout);
 		nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
-		nl.preferred_osd =
-				(s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
 	} else
 		return err;
 
@@ -69,8 +70,6 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 		nl.object_size = l.object_size;
 	if (l.data_pool)
 		nl.data_pool = l.data_pool;
-	if (l.preferred_osd)
-		nl.preferred_osd = l.preferred_osd;
 
 	if ((nl.object_size & ~PAGE_MASK) ||
 	    (nl.stripe_unit & ~PAGE_MASK) ||
@@ -106,8 +105,6 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	req->r_args.setlayout.layout.fl_object_size =
 		cpu_to_le32(l.object_size);
 	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
-	req->r_args.setlayout.layout.fl_pg_preferred =
-		cpu_to_le32(l.preferred_osd);
 
 	parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
 	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
@@ -171,8 +168,6 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
 			cpu_to_le32(l.object_size);
 	req->r_args.setlayout.layout.fl_pg_pool =
 			cpu_to_le32(l.data_pool);
-	req->r_args.setlayout.layout.fl_pg_preferred =
-			cpu_to_le32(l.preferred_osd);
 
 	err = ceph_mdsc_do_request(mdsc, inode, req);
 	ceph_mdsc_put_request(req);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 35b86331d8a..785cb3057c9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -118,15 +118,6 @@ static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
 		(unsigned long long)ceph_file_layout_su(ci->i_layout),
 		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
 		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-
-	if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) {
-		val += ret;
-		size -= ret;
-		ret += snprintf(val, size, "preferred_osd=%lld\n",
-			    (unsigned long long)ceph_file_layout_pg_preferred(
-				    ci->i_layout));
-	}
-
 	return ret;
 }
 
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index b8c60694b2b..e81ab30d489 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -65,7 +65,7 @@ struct ceph_file_layout {
 	__le32 fl_object_stripe_unit;  /* UNUSED.  for per-object parity, if any */
 
 	/* object -> pg layout */
-	__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+	__le32 fl_unused;       /* unused; used to be preferred primary (-1) */
 	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
 } __attribute__ ((packed));
 
@@ -384,7 +384,7 @@ union ceph_mds_request_args {
 		__le32 stripe_count;         /* ... */
 		__le32 object_size;
 		__le32 file_replication;
-		__le32 preferred;
+		__le32 unused;               /* used to be preferred osd */
 	} __attribute__ ((packed)) open;
 	struct {
 		__le32 flags;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index ba4c205cbb0..311ef8d6aa9 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -65,8 +65,6 @@ struct ceph_osdmap {
 #define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
 #define ceph_file_layout_object_su(l) \
 	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_preferred(l) \
-	((__s32)le32_to_cpu((l).fl_pg_preferred))
 #define ceph_file_layout_pg_pool(l) \
 	((__s32)le32_to_cpu((l).fl_pg_pool))
 
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 29ad46ec9dc..7d39f3cb494 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1000,7 +1000,6 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 {
 	unsigned num, num_mask;
 	struct ceph_pg pgid;
-	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
 	int poolid = le32_to_cpu(fl->fl_pg_pool);
 	struct ceph_pg_pool_info *pool;
 	unsigned ps;
@@ -1011,23 +1010,13 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 	if (!pool)
 		return -EIO;
 	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
-	if (preferred >= 0) {
-		ps += preferred;
-		num = le32_to_cpu(pool->v.lpg_num);
-		num_mask = pool->lpg_num_mask;
-	} else {
-		num = le32_to_cpu(pool->v.pg_num);
-		num_mask = pool->pg_num_mask;
-	}
+	num = le32_to_cpu(pool->v.pg_num);
+	num_mask = pool->pg_num_mask;
 
 	pgid.ps = cpu_to_le16(ps);
-	pgid.preferred = cpu_to_le16(preferred);
+	pgid.preferred = cpu_to_le16(-1);
 	pgid.pool = fl->fl_pg_pool;
-	if (preferred >= 0)
-		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
-		     (int)preferred);
-	else
-		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+	dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
 
 	ol->ol_pgid = pgid;
 	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
@@ -1046,23 +1035,17 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	struct ceph_pg_pool_info *pool;
 	int ruleno;
 	unsigned poolid, ps, pps, t;
-	int preferred;
 
 	poolid = le32_to_cpu(pgid.pool);
 	ps = le16_to_cpu(pgid.ps);
-	preferred = (s16)le16_to_cpu(pgid.preferred);
 
 	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
 	if (!pool)
 		return NULL;
 
 	/* pg_temp? */
-	if (preferred >= 0)
-		t = ceph_stable_mod(ps, le32_to_cpu(pool->v.lpg_num),
-				    pool->lpgp_num_mask);
-	else
-		t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
-				    pool->pgp_num_mask);
+	t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num),
+			    pool->pgp_num_mask);
 	pgid.ps = cpu_to_le16(t);
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
@@ -1080,23 +1063,13 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 		return NULL;
 	}
 
-	/* don't forcefeed bad device ids to crush */
-	if (preferred >= osdmap->max_osd ||
-	    preferred >= osdmap->crush->max_devices)
-		preferred = -1;
-
-	if (preferred >= 0)
-		pps = ceph_stable_mod(ps,
-				      le32_to_cpu(pool->v.lpgp_num),
-				      pool->lpgp_num_mask);
-	else
-		pps = ceph_stable_mod(ps,
-				      le32_to_cpu(pool->v.pgp_num),
-				      pool->pgp_num_mask);
+	pps = ceph_stable_mod(ps,
+			      le32_to_cpu(pool->v.pgp_num),
+			      pool->pgp_num_mask);
 	pps += poolid;
 	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
 			     min_t(int, pool->v.size, *num),
-			     preferred, osdmap->osd_weight);
+			     -1, osdmap->osd_weight);
 	return osds;
 }
 
-- 
cgit v1.2.3-18-g5258


From e49bf4c51cbe27439c00516d4297193d45dd4097 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:34:35 -0700
Subject: ceph: refactor SETLAYOUT and SETDIRLAYOUT ioctl checks into common
 helper

Both of these methods perform similar checks; move that code to a helper
so that we can ensure the checks are consistent.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/ioctl.c | 82 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 38 insertions(+), 44 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 4feab52c5bf..0e2f021109e 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -34,6 +34,36 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 	return err;
 }
 
+static long __validate_layout(struct ceph_mds_client *mdsc,
+			      struct ceph_ioctl_layout *l)
+{
+	int i, err;
+
+	/* preferred_osd is no longer supported */
+	if (l->preferred_osd != -1)
+		return -EINVAL;
+
+	/* validate striping parameters */
+	if ((l->object_size & ~PAGE_MASK) ||
+	    (l->stripe_unit & ~PAGE_MASK) ||
+	    ((unsigned)l->object_size % (unsigned)l->stripe_unit))
+		return -EINVAL;
+
+	/* make sure it's a valid data pool */
+	mutex_lock(&mdsc->mutex);
+	err = -EINVAL;
+	for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+		if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) {
+			err = 0;
+			break;
+		}
+	mutex_unlock(&mdsc->mutex);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
@@ -43,15 +73,11 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	struct ceph_ioctl_layout l;
 	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
 	struct ceph_ioctl_layout nl;
-	int err, i;
+	int err;
 
 	if (copy_from_user(&l, arg, sizeof(l)))
 		return -EFAULT;
 
-	/* preferred_osd is no longer supported */
-	if (l.preferred_osd != -1)
-		return -EINVAL;
-
 	/* validate changed params against current layout */
 	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
 	if (!err) {
@@ -71,24 +97,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	if (l.data_pool)
 		nl.data_pool = l.data_pool;
 
-	if ((nl.object_size & ~PAGE_MASK) ||
-	    (nl.stripe_unit & ~PAGE_MASK) ||
-	    ((unsigned)nl.object_size % (unsigned)nl.stripe_unit))
-		return -EINVAL;
-
-	/* make sure it's a valid data pool */
-	if (l.data_pool > 0) {
-		mutex_lock(&mdsc->mutex);
-		err = -EINVAL;
-		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
-			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
-				err = 0;
-				break;
-			}
-		mutex_unlock(&mdsc->mutex);
-		if (err)
-			return err;
-	}
+	err = __validate_layout(mdsc, &nl);
+	if (err)
+		return err;
 
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
 				       USE_AUTH_MDS);
@@ -124,33 +135,16 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
-	int err, i;
+	int err;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 
 	/* copy and validate */
 	if (copy_from_user(&l, arg, sizeof(l)))
 		return -EFAULT;
 
-	if ((l.object_size & ~PAGE_MASK) ||
-	    (l.stripe_unit & ~PAGE_MASK) ||
-	    !l.stripe_unit ||
-	    (l.object_size &&
-	        (unsigned)l.object_size % (unsigned)l.stripe_unit))
-		return -EINVAL;
-
-	/* make sure it's a valid data pool */
-	if (l.data_pool > 0) {
-		mutex_lock(&mdsc->mutex);
-		err = -EINVAL;
-		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
-			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
-				err = 0;
-				break;
-			}
-		mutex_unlock(&mdsc->mutex);
-		if (err)
-			return err;
-	}
+	err = __validate_layout(mdsc, &l);
+	if (err)
+		return err;
 
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
 				       USE_AUTH_MDS);
-- 
cgit v1.2.3-18-g5258


From 8b12d47b80c7a34dffdd98244d99316db490ec58 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:38:35 -0700
Subject: crush: clean up types, const-ness

Move various types from int -> __u32 (or similar), and add const as
appropriate.

This reflects changes that have been present in the userland implementation
for some time.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/crush/crush.h  |  2 +-
 include/linux/crush/mapper.h |  6 +++---
 net/ceph/crush/crush.c       |  8 ++++----
 net/ceph/crush/mapper.c      | 31 ++++++++++++++++---------------
 4 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 97e435b191f..3f50369a50e 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -168,7 +168,7 @@ struct crush_map {
 
 
 /* crush.c */
-extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
 extern void crush_calc_parents(struct crush_map *map);
 extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
 extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index c46b99c18bb..9322ab8bccd 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -10,11 +10,11 @@
 
 #include "crush.h"
 
-extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
-extern int crush_do_rule(struct crush_map *map,
+extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
+extern int crush_do_rule(const struct crush_map *map,
 			 int ruleno,
 			 int x, int *result, int result_max,
 			 int forcefeed,    /* -1 for none */
-			 __u32 *weights);
+			 const __u32 *weights);
 
 #endif
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index d6ebb13a18a..8dd19a0deed 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -26,9 +26,9 @@ const char *crush_bucket_alg_name(int alg)
  * @b: bucket pointer
  * @p: item index in bucket
  */
-int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 {
-	if (p >= b->size)
+	if ((__u32)p >= b->size)
 		return 0;
 
 	switch (b->alg) {
@@ -124,10 +124,9 @@ void crush_destroy_bucket(struct crush_bucket *b)
  */
 void crush_destroy(struct crush_map *map)
 {
-	int b;
-
 	/* buckets */
 	if (map->buckets) {
+		__s32 b;
 		for (b = 0; b < map->max_buckets; b++) {
 			if (map->buckets[b] == NULL)
 				continue;
@@ -138,6 +137,7 @@ void crush_destroy(struct crush_map *map)
 
 	/* rules */
 	if (map->rules) {
+		__u32 b;
 		for (b = 0; b < map->max_rules; b++)
 			kfree(map->rules[b]);
 		kfree(map->rules);
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b79747c4b64..436102a8a46 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -32,9 +32,9 @@
  * @type: storage ruleset type (user defined)
  * @size: output set size
  */
-int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
 {
-	int i;
+	__u32 i;
 
 	for (i = 0; i < map->max_rules; i++) {
 		if (map->rules[i] &&
@@ -72,7 +72,7 @@ static int bucket_perm_choose(struct crush_bucket *bucket,
 	unsigned i, s;
 
 	/* start a new permutation if @x has changed */
-	if (bucket->perm_x != x || bucket->perm_n == 0) {
+	if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
 		dprintk("bucket %d new x=%d\n", bucket->id, x);
 		bucket->perm_x = x;
 
@@ -219,7 +219,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 			       int x, int r)
 {
-	int i;
+	__u32 i;
 	int high = 0;
 	__u64 high_draw = 0;
 	__u64 draw;
@@ -262,7 +262,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
  */
-static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x)
 {
 	if (weight[item] >= 0x10000)
 		return 0;
@@ -287,16 +287,16 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
  * @recurse_to_leaf: true if we want one device under each item of given type
  * @out2: second output vector for leaf items (if @recurse_to_leaf)
  */
-static int crush_choose(struct crush_map *map,
+static int crush_choose(const struct crush_map *map,
 			struct crush_bucket *bucket,
-			__u32 *weight,
+			const __u32 *weight,
 			int x, int numrep, int type,
 			int *out, int outpos,
 			int firstn, int recurse_to_leaf,
 			int *out2)
 {
 	int rep;
-	int ftotal, flocal;
+	unsigned int ftotal, flocal;
 	int retry_descent, retry_bucket, skip_rep;
 	struct crush_bucket *in = bucket;
 	int r;
@@ -304,7 +304,7 @@ static int crush_choose(struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
-	const int orig_tries = 5; /* attempts before we fall back to search */
+	const unsigned int orig_tries = 5; /* attempts before we fall back to search */
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
 		bucket->id, x, outpos, numrep);
@@ -325,7 +325,7 @@ static int crush_choose(struct crush_map *map,
 				r = rep;
 				if (in->alg == CRUSH_BUCKET_UNIFORM) {
 					/* be careful */
-					if (firstn || numrep >= in->size)
+					if (firstn || (__u32)numrep >= in->size)
 						/* r' = r + f_total */
 						r += ftotal;
 					else if (in->size % numrep == 0)
@@ -425,7 +425,7 @@ reject:
 						/* else give up */
 						skip_rep = 1;
 					dprintk("  reject %d  collide %d  "
-						"ftotal %d  flocal %d\n",
+						"ftotal %u  flocal %u\n",
 						reject, collide, ftotal,
 						flocal);
 				}
@@ -456,9 +456,9 @@ reject:
  * @result_max: maximum result size
  * @force: force initial replica choice; -1 for none
  */
-int crush_do_rule(struct crush_map *map,
+int crush_do_rule(const struct crush_map *map,
 		  int ruleno, int x, int *result, int result_max,
-		  int force, __u32 *weight)
+		  int force, const __u32 *weight)
 {
 	int result_len;
 	int force_context[CRUSH_MAX_DEPTH];
@@ -473,7 +473,7 @@ int crush_do_rule(struct crush_map *map,
 	int osize;
 	int *tmp;
 	struct crush_rule *rule;
-	int step;
+	__u32 step;
 	int i, j;
 	int numrep;
 	int firstn;
@@ -488,7 +488,8 @@ int crush_do_rule(struct crush_map *map,
 	/*
 	 * determine hierarchical context of force, if any.  note
 	 * that this may or may not correspond to the specific types
-	 * referenced by the crush rule.
+	 * referenced by the crush rule.  it will also only affect
+	 * the first descent (TAKE).
 	 */
 	if (force >= 0 &&
 	    force < map->max_devices &&
-- 
cgit v1.2.3-18-g5258


From c90f95ed46393e29d843686e21947d1c6fcb1164 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:35:09 -0700
Subject: crush: adjust local retry threshold

This small adjustment reflects a change that was made in ceph.git commit
af6a9f30696c900a2a8bd7ae24e8ed15fb4964bb, about 6 months ago.  An N-1
search is not exhaustive.  Fixed ceph.git bug #1594.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/crush/mapper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 436102a8a46..583f644b0e2 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -415,7 +415,7 @@ reject:
 					if (collide && flocal < 3)
 						/* retry locally a few times */
 						retry_bucket = 1;
-					else if (flocal < in->size + orig_tries)
+					else if (flocal <= in->size + orig_tries)
 						/* exhaustive bucket search */
 						retry_bucket = 1;
 					else if (ftotal < 20)
-- 
cgit v1.2.3-18-g5258


From a1f4895be8bf1ba56c2306b058f51619e9b0e8f8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:35:24 -0700
Subject: crush: be more tolerant of nonsensical crush maps

If we get a map that doesn't make sense, error out or ignore the badness
instead of BUGging out.  This reflects the ceph.git commits
9895f0bff7dc68e9b49b572613d242315fb11b6c and
8ded26472058d5205803f244c2f33cb6cb10de79.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/crush/mapper.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 583f644b0e2..00baad5d3bd 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -152,8 +152,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
 			return bucket->h.items[i];
 	}
 
-	BUG_ON(1);
-	return 0;
+	dprintk("bad list sums for bucket %d\n", bucket->h.id);
+	return bucket->h.items[0];
 }
 
 
@@ -239,6 +239,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 {
 	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+	BUG_ON(in->size == 0);
 	switch (in->alg) {
 	case CRUSH_BUCKET_UNIFORM:
 		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -253,7 +254,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 		return bucket_straw_choose((struct crush_bucket_straw *)in,
 					   x, r);
 	default:
-		BUG_ON(1);
+		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 		return in->items[0];
 	}
 }
@@ -354,7 +355,11 @@ static int crush_choose(const struct crush_map *map,
 					item = bucket_perm_choose(in, x, r);
 				else
 					item = crush_bucket_choose(in, x, r);
-				BUG_ON(item >= map->max_devices);
+				if (item >= map->max_devices) {
+					dprintk("   bad item %d\n", item);
+					skip_rep = 1;
+					break;
+				}
 
 				/* desired type? */
 				if (item < 0)
@@ -365,8 +370,12 @@ static int crush_choose(const struct crush_map *map,
 
 				/* keep going? */
 				if (itemtype != type) {
-					BUG_ON(item >= 0 ||
-					       (-1-item) >= map->max_buckets);
+					if (item >= 0 ||
+					    (-1-item) >= map->max_buckets) {
+						dprintk("   bad item type %d\n", type);
+						skip_rep = 1;
+						break;
+					}
 					in = map->buckets[-1-item];
 					retry_bucket = 1;
 					continue;
@@ -478,7 +487,10 @@ int crush_do_rule(const struct crush_map *map,
 	int numrep;
 	int firstn;
 
-	BUG_ON(ruleno >= map->max_rules);
+	if ((__u32)ruleno >= map->max_rules) {
+		dprintk(" bad ruleno %d\n", ruleno);
+		return 0;
+	}
 
 	rule = map->rules[ruleno];
 	result_len = 0;
@@ -528,7 +540,8 @@ int crush_do_rule(const struct crush_map *map,
 			firstn = 1;
 		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
 		case CRUSH_RULE_CHOOSE_INDEP:
-			BUG_ON(wsize == 0);
+			if (wsize == 0)
+				break;
 
 			recurse_to_leaf =
 				rule->steps[step].op ==
@@ -597,7 +610,9 @@ int crush_do_rule(const struct crush_map *map,
 			break;
 
 		default:
-			BUG_ON(1);
+			dprintk(" unknown op %d at step %d\n",
+				curstep->op, step);
+			break;
 		}
 	}
 	return result_len;
-- 
cgit v1.2.3-18-g5258


From 0668216efe16ab1adf077e5f138775cee2af927a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:35:48 -0700
Subject: crush: use a temporary variable to simplify crush_do_rule

Use a temporary variable here to avoid repeated array lookups and clean up
the code a bit.

This reflects ceph.git commit 6b5be27634ad307b471a5bf0db85c4f5c834885f.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/crush/mapper.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 00baad5d3bd..fba9460fe57 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -519,14 +519,15 @@ int crush_do_rule(const struct crush_map *map,
 	}
 
 	for (step = 0; step < rule->len; step++) {
+		struct crush_rule_step *curstep = &rule->steps[step];
+
 		firstn = 0;
-		switch (rule->steps[step].op) {
+		switch (curstep->op) {
 		case CRUSH_RULE_TAKE:
-			w[0] = rule->steps[step].arg1;
+			w[0] = curstep->arg1;
 
 			/* find position in force_context/hierarchy */
-			while (force_pos >= 0 &&
-			       force_context[force_pos] != w[0])
+			while (force_pos >= 0 && force_context[force_pos] != w[0])
 				force_pos--;
 			/* and move past it */
 			if (force_pos >= 0)
@@ -538,15 +539,16 @@ int crush_do_rule(const struct crush_map *map,
 		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
 		case CRUSH_RULE_CHOOSE_FIRSTN:
 			firstn = 1;
+			/* fall through */
 		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
 		case CRUSH_RULE_CHOOSE_INDEP:
 			if (wsize == 0)
 				break;
 
 			recurse_to_leaf =
-				rule->steps[step].op ==
+				curstep->op ==
 				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
-				rule->steps[step].op ==
+				curstep->op ==
 				CRUSH_RULE_CHOOSE_LEAF_INDEP;
 
 			/* reset output */
@@ -558,7 +560,7 @@ int crush_do_rule(const struct crush_map *map,
 				 * basically, numrep <= 0 means relative to
 				 * the provided result_max
 				 */
-				numrep = rule->steps[step].arg1;
+				numrep = curstep->arg1;
 				if (numrep <= 0) {
 					numrep += result_max;
 					if (numrep <= 0)
@@ -569,7 +571,7 @@ int crush_do_rule(const struct crush_map *map,
 					/* skip any intermediate types */
 					while (force_pos &&
 					       force_context[force_pos] < 0 &&
-					       rule->steps[step].arg2 !=
+					       curstep->arg2 !=
 					       map->buckets[-1 -
 					       force_context[force_pos]]->type)
 						force_pos--;
@@ -583,7 +585,7 @@ int crush_do_rule(const struct crush_map *map,
 						      map->buckets[-1-w[i]],
 						      weight,
 						      x, numrep,
-						      rule->steps[step].arg2,
+						      curstep->arg2,
 						      o+osize, j,
 						      firstn,
 						      recurse_to_leaf, c+osize);
-- 
cgit v1.2.3-18-g5258


From 41ebcc0907c58f75d0b25afcaf8b9c35c6b1ad14 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:39:29 -0700
Subject: crush: remove forcefeed functionality

Remove forcefeed functionality from CRUSH.  This is an ugly misfeature that
is mostly useless and unused.  Remove it.

Reflects ceph.git commit ed974b5000f2851207d860a651809af4a1867942.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>

Conflicts:

	net/ceph/crush/mapper.c
---
 include/linux/crush/mapper.h |  1 -
 net/ceph/crush/mapper.c      | 48 +-------------------------------------------
 net/ceph/osdmap.c            |  2 +-
 3 files changed, 2 insertions(+), 49 deletions(-)

diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index 9322ab8bccd..71d79f44a7d 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -14,7 +14,6 @@ extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, i
 extern int crush_do_rule(const struct crush_map *map,
 			 int ruleno,
 			 int x, int *result, int result_max,
-			 int forcefeed,    /* -1 for none */
 			 const __u32 *weights);
 
 #endif
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index fba9460fe57..11cf352201b 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -463,15 +463,12 @@ reject:
  * @x: hash input
  * @result: pointer to result vector
  * @result_max: maximum result size
- * @force: force initial replica choice; -1 for none
  */
 int crush_do_rule(const struct crush_map *map,
 		  int ruleno, int x, int *result, int result_max,
-		  int force, const __u32 *weight)
+		  const __u32 *weight)
 {
 	int result_len;
-	int force_context[CRUSH_MAX_DEPTH];
-	int force_pos = -1;
 	int a[CRUSH_MAX_SET];
 	int b[CRUSH_MAX_SET];
 	int c[CRUSH_MAX_SET];
@@ -497,27 +494,6 @@ int crush_do_rule(const struct crush_map *map,
 	w = a;
 	o = b;
 
-	/*
-	 * determine hierarchical context of force, if any.  note
-	 * that this may or may not correspond to the specific types
-	 * referenced by the crush rule.  it will also only affect
-	 * the first descent (TAKE).
-	 */
-	if (force >= 0 &&
-	    force < map->max_devices &&
-	    map->device_parents[force] != 0 &&
-	    !is_out(map, weight, force, x)) {
-		while (1) {
-			force_context[++force_pos] = force;
-			if (force >= 0)
-				force = map->device_parents[force];
-			else
-				force = map->bucket_parents[-1-force];
-			if (force == 0)
-				break;
-		}
-	}
-
 	for (step = 0; step < rule->len; step++) {
 		struct crush_rule_step *curstep = &rule->steps[step];
 
@@ -525,14 +501,6 @@ int crush_do_rule(const struct crush_map *map,
 		switch (curstep->op) {
 		case CRUSH_RULE_TAKE:
 			w[0] = curstep->arg1;
-
-			/* find position in force_context/hierarchy */
-			while (force_pos >= 0 && force_context[force_pos] != w[0])
-				force_pos--;
-			/* and move past it */
-			if (force_pos >= 0)
-				force_pos--;
-
 			wsize = 1;
 			break;
 
@@ -567,20 +535,6 @@ int crush_do_rule(const struct crush_map *map,
 						continue;
 				}
 				j = 0;
-				if (osize == 0 && force_pos >= 0) {
-					/* skip any intermediate types */
-					while (force_pos &&
-					       force_context[force_pos] < 0 &&
-					       curstep->arg2 !=
-					       map->buckets[-1 -
-					       force_context[force_pos]]->type)
-						force_pos--;
-					o[osize] = force_context[force_pos];
-					if (recurse_to_leaf)
-						c[osize] = force_context[0];
-					j++;
-					force_pos--;
-				}
 				osize += crush_choose(map,
 						      map->buckets[-1-w[i]],
 						      weight,
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7d39f3cb494..9dda36f7aa9 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1069,7 +1069,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	pps += poolid;
 	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
 			     min_t(int, pool->v.size, *num),
-			     -1, osdmap->osd_weight);
+			     osdmap->osd_weight);
 	return osds;
 }
 
-- 
cgit v1.2.3-18-g5258


From fc7c3ae5ab9246ad96aab4d0d57f67e9255cfb56 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:36:35 -0700
Subject: crush: remove parent maps

These were used for the ill-fated forcefeed feature.  Remove them.

Reflects ceph.git commit ebdf80edfecfbd5a842b71fbe5732857994380c1.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/crush/crush.h | 11 -----------
 net/ceph/crush/crush.c      | 25 -------------------------
 net/ceph/osdmap.c           |  7 -------
 3 files changed, 43 deletions(-)

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 3f50369a50e..158a4d25ca8 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -151,16 +151,6 @@ struct crush_map {
 	struct crush_bucket **buckets;
 	struct crush_rule **rules;
 
-	/*
-	 * Parent pointers to identify the parent bucket a device or
-	 * bucket in the hierarchy.  If an item appears more than
-	 * once, this is the _last_ time it appeared (where buckets
-	 * are processed in bucket id order, from -1 on down to
-	 * -max_buckets.
-	 */
-	__u32 *bucket_parents;
-	__u32 *device_parents;
-
 	__s32 max_buckets;
 	__u32 max_rules;
 	__s32 max_devices;
@@ -169,7 +159,6 @@ struct crush_map {
 
 /* crush.c */
 extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
-extern void crush_calc_parents(struct crush_map *map);
 extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
 extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
 extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 8dd19a0deed..2160791acf0 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -46,29 +46,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 	return 0;
 }
 
-/**
- * crush_calc_parents - Calculate parent vectors for the given crush map.
- * @map: crush_map pointer
- */
-void crush_calc_parents(struct crush_map *map)
-{
-	int i, b, c;
-
-	for (b = 0; b < map->max_buckets; b++) {
-		if (map->buckets[b] == NULL)
-			continue;
-		for (i = 0; i < map->buckets[b]->size; i++) {
-			c = map->buckets[b]->items[i];
-			BUG_ON(c >= map->max_devices ||
-			       c < -map->max_buckets);
-			if (c >= 0)
-				map->device_parents[c] = map->buckets[b]->id;
-			else
-				map->bucket_parents[-1-c] = map->buckets[b]->id;
-		}
-	}
-}
-
 void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
 {
 	kfree(b->h.perm);
@@ -143,8 +120,6 @@ void crush_destroy(struct crush_map *map)
 		kfree(map->rules);
 	}
 
-	kfree(map->bucket_parents);
-	kfree(map->device_parents);
 	kfree(map);
 }
 
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 9dda36f7aa9..dac448ba68e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -161,13 +161,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	c->max_rules = ceph_decode_32(p);
 	c->max_devices = ceph_decode_32(p);
 
-	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
-	if (c->device_parents == NULL)
-		goto badmem;
-	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
-	if (c->bucket_parents == NULL)
-		goto badmem;
-
 	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
 	if (c->buckets == NULL)
 		goto badmem;
-- 
cgit v1.2.3-18-g5258


From f671d4cd9b36691ac4ef42cde44c1b7a84e13631 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:36:49 -0700
Subject: crush: fix tree node weight lookup

Fix the node weight lookup for tree buckets by using a correct accessor.

Reflects ceph.git commit d287ade5bcbdca82a3aef145b92924cf1e856733.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/crush/crush.h | 5 +++++
 net/ceph/crush/crush.c      | 4 +---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 158a4d25ca8..7c4750811b9 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -166,4 +166,9 @@ extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
 extern void crush_destroy_bucket(struct crush_bucket *b);
 extern void crush_destroy(struct crush_map *map);
 
+static inline int crush_calc_tree_node(int i)
+{
+	return ((i+1) << 1)-1;
+}
+
 #endif
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 2160791acf0..b93575f4eb1 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -37,9 +37,7 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 	case CRUSH_BUCKET_LIST:
 		return ((struct crush_bucket_list *)b)->item_weights[p];
 	case CRUSH_BUCKET_TREE:
-		if (p & 1)
-			return ((struct crush_bucket_tree *)b)->node_weights[p];
-		return 0;
+		return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];
 	case CRUSH_BUCKET_STRAW:
 		return ((struct crush_bucket_straw *)b)->item_weights[p];
 	}
-- 
cgit v1.2.3-18-g5258


From 6eb43f4b5a2a74599b4ff17a97c03a342327ca65 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:37:05 -0700
Subject: crush: fix memory leak when destroying tree buckets

Reflects ceph.git commit 46d63d98434b3bc9dad2fc9ab23cbaedc3bcb0e4.

Reported-by: Alexander Lyakas <alex.bolshoy@gmail.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/crush/crush.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index b93575f4eb1..089613234f0 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -62,6 +62,8 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
 
 void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
 {
+	kfree(b->h.perm);
+	kfree(b->h.items);
 	kfree(b->node_weights);
 	kfree(b);
 }
-- 
cgit v1.2.3-18-g5258


From 8b393269008411a612ca549b733b4296e819f2fb Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 7 May 2012 15:37:23 -0700
Subject: crush: warn on do_rule failure

If we get an error code from crush_do_rule(), print an error to the
console.

Reviewed-by: Alex Elder <elder@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/osdmap.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index dac448ba68e..2592f3cca98 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1027,7 +1027,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	struct ceph_pg_mapping *pg;
 	struct ceph_pg_pool_info *pool;
 	int ruleno;
-	unsigned poolid, ps, pps, t;
+	unsigned poolid, ps, pps, t, r;
 
 	poolid = le32_to_cpu(pgid.pool);
 	ps = le16_to_cpu(pgid.ps);
@@ -1060,9 +1060,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 			      le32_to_cpu(pool->v.pgp_num),
 			      pool->pgp_num_mask);
 	pps += poolid;
-	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
-			     min_t(int, pool->v.size, *num),
-			     osdmap->osd_weight);
+	r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+			  min_t(int, pool->v.size, *num),
+			  osdmap->osd_weight);
+	if (r < 0) {
+		pr_err("error %d from crush rule: pool %d ruleset %d type %d"
+		       " size %d\n", r, poolid, pool->v.crush_ruleset,
+		       pool->v.type, pool->v.size);
+		return NULL;
+	}
+	*num = r;
 	return osds;
 }
 
-- 
cgit v1.2.3-18-g5258


From 065a68f9167e20f321a62d044cb2c3024393d455 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreawmhost.com>
Date: Fri, 20 Apr 2012 15:49:43 -0500
Subject: ceph: osd_client: fix endianness bug in osd_req_encode_op()

From Al Viro <viro@zeniv.linux.org.uk>

Al Viro noticed that we were using a non-cpu-encoded value in
a switch statement in osd_req_encode_op().  The result would
clearly not work correctly on a big-endian machine.

Signed-off-by: Alex Elder <elder@dreamhost.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 5e254055c91..daa2716a0c3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -278,7 +278,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 {
 	dst->op = cpu_to_le16(src->op);
 
-	switch (dst->op) {
+	switch (src->op) {
 	case CEPH_OSD_OP_READ:
 	case CEPH_OSD_OP_WRITE:
 		dst->extent.offset =
-- 
cgit v1.2.3-18-g5258


From 76aa542fb90e3e91edb1146d10ca7cf2cae8e7e9 Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Fri, 20 Apr 2012 15:49:44 -0500
Subject: ceph: fix bounds check in ceph_decode_need and ceph_encode_need

Given a large n, the bounds check (*p + n > end) can be bypassed due to
pointer wraparound.  A safer check is (n > end - *p).

[elder@dreamhost.com: inverted test and renamed ceph_has_room()]

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
---
 include/linux/ceph/decode.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index c5b6939fb32..ecf324eb2c9 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -45,9 +45,14 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
 /*
  * bounds check input.
  */
+static inline int ceph_has_room(void **p, void *end, size_t n)
+{
+	return end >= *p && n <= end - *p;
+}
+
 #define ceph_decode_need(p, end, n, bad)		\
 	do {						\
-		if (unlikely(*(p) + (n) > (end))) 	\
+		if (!likely(ceph_has_room(p, end, n)))	\
 			goto bad;			\
 	} while (0)
 
@@ -166,7 +171,7 @@ static inline void ceph_encode_string(void **p, void *end,
 
 #define ceph_encode_need(p, end, n, bad)		\
 	do {						\
-		if (unlikely(*(p) + (n) > (end))) 	\
+		if (!likely(ceph_has_room(p, end, n)))	\
 			goto bad;			\
 	} while (0)
 
-- 
cgit v1.2.3-18-g5258


From f8ad495a8a0277b88c59bf38319e5e944aaf5a4a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 20 Apr 2012 15:49:44 -0500
Subject: rbd: use gfp_flags parameter in rbd_header_from_disk()

We should use the gfp_flags that the caller specified instead of
GFP_KERNEL here.

There is only one caller and it uses GFP_KERNEL, so this change is
just a cleanup and doesn't change how the code works.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
---
 drivers/block/rbd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a67fa63a966..ca59d4d9471 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -506,11 +506,11 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
 	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 	if (snap_count) {
 		header->snap_names = kmalloc(header->snap_names_len,
-					     GFP_KERNEL);
+					     gfp_flags);
 		if (!header->snap_names)
 			goto err_snapc;
 		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
-					     GFP_KERNEL);
+					     gfp_flags);
 		if (!header->snap_sizes)
 			goto err_names;
 	} else {
-- 
cgit v1.2.3-18-g5258


From 50f7c4c967d0b5acd8e7ba6ab654dc4a7ac869ac Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Fri, 20 Apr 2012 15:49:44 -0500
Subject: rbd: fix integer overflow in rbd_header_from_disk()

ondisk->snap_count is read from disk via rbd_req_sync_read() and thus
needs validation.  Otherwise, a bogus `snap_count' could overflow the
kmalloc() size, leading to memory corruption.

Also use `u32' consistently for `snap_count'.

[elder@dreamhost.com: changed to use UINT_MAX rather than ULONG_MAX]

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
---
 drivers/block/rbd.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ca59d4d9471..a75fe93a25b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -487,16 +487,18 @@ static void rbd_coll_release(struct kref *kref)
  */
 static int rbd_header_from_disk(struct rbd_image_header *header,
 				 struct rbd_image_header_ondisk *ondisk,
-				 int allocated_snaps,
+				 u32 allocated_snaps,
 				 gfp_t gfp_flags)
 {
-	int i;
-	u32 snap_count;
+	u32 i, snap_count;
 
 	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
 		return -ENXIO;
 
 	snap_count = le32_to_cpu(ondisk->snap_count);
+	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
+			 / sizeof (*ondisk))
+		return -EINVAL;
 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
 				snap_count * sizeof (*ondisk),
 				gfp_flags);
@@ -1591,7 +1593,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 {
 	ssize_t rc;
 	struct rbd_image_header_ondisk *dh;
-	int snap_count = 0;
+	u32 snap_count = 0;
 	u64 ver;
 	size_t len;
 
-- 
cgit v1.2.3-18-g5258


From 403f24d3d51760a8b9368d595fa5f48c309f1a0f Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 5 Dec 2011 10:47:13 -0800
Subject: rbd: protect read of snapshot sequence number

This is updated whenever a snapshot is added or deleted, and the
snapc pointer is changed with every refresh of the header.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 drivers/block/rbd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a75fe93a25b..5ab9f55d3e0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1684,7 +1684,9 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	if (ret < 0)
 		return ret;
 
-	dev->header.snapc->seq =  new_snapid;
+	down_write(&dev->header_rwsem);
+	dev->header.snapc->seq = new_snapid;
+	up_write(&dev->header_rwsem);
 
 	return 0;
 bad:
-- 
cgit v1.2.3-18-g5258


From 77dfe99fe3cb0b2b0545e19e2d57b7a9134ee3c0 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 21 Nov 2011 13:04:42 -0800
Subject: rbd: store snapshot id instead of index

When a device was open at a snapshot, and snapshots were deleted or
added, data from the wrong snapshot could be read. Instead of
assuming the snap context is constant, store the actual snap id when
the device is initialized, and rely on the OSDs to signal an error
if we try reading from a snapshot that was deleted.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 drivers/block/rbd.c | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5ab9f55d3e0..c1650bdf2f6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -175,8 +175,7 @@ struct rbd_device {
 	/* protects updating the header */
 	struct rw_semaphore     header_rwsem;
 	char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
-	u32 cur_snap;	/* index+1 of current snapshot within snap context
-			   0 - for the head */
+	u64                     snap_id;	/* current snapshot id */
 	int read_only;
 
 	struct list_head	node;
@@ -554,21 +553,6 @@ err_snapc:
 	return -ENOMEM;
 }
 
-static int snap_index(struct rbd_image_header *header, int snap_num)
-{
-	return header->total_snaps - snap_num;
-}
-
-static u64 cur_snap_id(struct rbd_device *rbd_dev)
-{
-	struct rbd_image_header *header = &rbd_dev->header;
-
-	if (!rbd_dev->cur_snap)
-		return 0;
-
-	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
-}
-
 static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
 			u64 *seq, u64 *size)
 {
@@ -607,7 +591,7 @@ static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
 			snapc->seq = header->snap_seq;
 		else
 			snapc->seq = 0;
-		dev->cur_snap = 0;
+		dev->snap_id = CEPH_NOSNAP;
 		dev->read_only = 0;
 		if (size)
 			*size = header->image_size;
@@ -615,8 +599,7 @@ static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
 		ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
 		if (ret < 0)
 			goto done;
-
-		dev->cur_snap = header->total_snaps - ret;
+		dev->snap_id = snapc->seq;
 		dev->read_only = 1;
 	}
 
@@ -1522,7 +1505,7 @@ static void rbd_rq_fn(struct request_queue *q)
 					      coll, cur_seg);
 			else
 				rbd_req_read(rq, rbd_dev,
-					     cur_snap_id(rbd_dev),
+					     rbd_dev->snap_id,
 					     ofs,
 					     op_size, bio,
 					     coll, cur_seg);
@@ -1657,7 +1640,7 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	struct ceph_mon_client *monc;
 
 	/* we should create a snapshot only if we're pointing at the head */
-	if (dev->cur_snap)
+	if (dev->snap_id != CEPH_NOSNAP)
 		return -EINVAL;
 
 	monc = &dev->rbd_client->client->monc;
-- 
cgit v1.2.3-18-g5258


From b06e6a6be796bc365a19b0ac5176b553c13abf2f Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 21 Nov 2011 18:16:52 -0800
Subject: rbd: remove conditional snapid parameters

The snapid parameters passed to rbd_do_op() and rbd_req_sync_op()
are now always either a valid snapid or an explicit CEPH_NOSNAP.

[elder@dreamhost.com: Rephrased the description]

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 drivers/block/rbd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c1650bdf2f6..4a0a829f79d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1152,7 +1152,7 @@ static int rbd_req_read(struct request *rq,
 			 int coll_index)
 {
 	return rbd_do_op(rq, rbd_dev, NULL,
-			 (snapid ? snapid : CEPH_NOSNAP),
+			 snapid,
 			 CEPH_OSD_OP_READ,
 			 CEPH_OSD_FLAG_READ,
 			 2,
@@ -1171,7 +1171,7 @@ static int rbd_req_sync_read(struct rbd_device *dev,
 			  u64 *ver)
 {
 	return rbd_req_sync_op(dev, NULL,
-			       (snapid ? snapid : CEPH_NOSNAP),
+			       snapid,
 			       CEPH_OSD_OP_READ,
 			       CEPH_OSD_FLAG_READ,
 			       NULL,
-- 
cgit v1.2.3-18-g5258


From 3591538fb272d2432d112d47d7e0ddd0be4cded2 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 5 Dec 2011 18:25:13 -0800
Subject: rbd: fix snapshot size type

Snapshot sizes should be the same type as regular image sizes. This
only affects their displayed size in sysfs, not the reported size of
an actual block device sizes.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 drivers/block/rbd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4a0a829f79d..83597965c48 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -141,7 +141,7 @@ struct rbd_request {
 struct rbd_snap {
 	struct	device		dev;
 	const char		*name;
-	size_t			size;
+	u64			size;
 	struct list_head	node;
 	u64			id;
 };
@@ -1935,7 +1935,7 @@ static ssize_t rbd_snap_size_show(struct device *dev,
 {
 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 
-	return sprintf(buf, "%zd\n", snap->size);
+	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
 }
 
 static ssize_t rbd_snap_id_show(struct device *dev,
@@ -1944,7 +1944,7 @@ static ssize_t rbd_snap_id_show(struct device *dev,
 {
 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 
-	return sprintf(buf, "%llu\n", (unsigned long long) snap->id);
+	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
 }
 
 static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
-- 
cgit v1.2.3-18-g5258


From 263c6ca007a6693fb724a24c5a55716c49d33573 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Mon, 5 Dec 2011 10:43:42 -0800
Subject: rbd: rename __rbd_update_snaps to __rbd_refresh_header

This function rereads the entire header and handles any changes in
it, not just changes in snapshots.

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 drivers/block/rbd.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 83597965c48..65665c9c42c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -240,7 +240,7 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
 	put_device(&rbd_dev->dev);
 }
 
-static int __rbd_update_snaps(struct rbd_device *rbd_dev);
+static int __rbd_refresh_header(struct rbd_device *rbd_dev);
 
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
@@ -1222,7 +1222,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
 		notify_id, (int)opcode);
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	rc = __rbd_update_snaps(dev);
+	rc = __rbd_refresh_header(dev);
 	mutex_unlock(&ctl_mutex);
 	if (rc)
 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
@@ -1689,7 +1689,7 @@ static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 /*
  * only read the first part of the ondisk header, without the snaps info
  */
-static int __rbd_update_snaps(struct rbd_device *rbd_dev)
+static int __rbd_refresh_header(struct rbd_device *rbd_dev)
 {
 	int ret;
 	struct rbd_image_header h;
@@ -1876,7 +1876,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
 
 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 
-	rc = __rbd_update_snaps(rbd_dev);
+	rc = __rbd_refresh_header(rbd_dev);
 	if (rc < 0)
 		ret = rc;
 
@@ -2159,7 +2159,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
 					 rbd_dev->header.obj_version);
 		if (ret == -ERANGE) {
 			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-			rc = __rbd_update_snaps(rbd_dev);
+			rc = __rbd_refresh_header(rbd_dev);
 			mutex_unlock(&ctl_mutex);
 			if (rc < 0)
 				return rc;
@@ -2544,7 +2544,7 @@ static ssize_t rbd_snap_add(struct device *dev,
 	if (ret < 0)
 		goto err_unlock;
 
-	ret = __rbd_update_snaps(rbd_dev);
+	ret = __rbd_refresh_header(rbd_dev);
 	if (ret < 0)
 		goto err_unlock;
 
-- 
cgit v1.2.3-18-g5258


From b7f6519e6bc7a0c5a9e3eadc8a2c79c0d4556050 Mon Sep 17 00:00:00 2001
From: Josh Durgin <josh.durgin@dreamhost.com>
Date: Thu, 1 Dec 2011 15:12:03 -0800
Subject: rbd: correct sysfs snap attribute documentation

Each attribute is prefixed with "snap_".

Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
Reviewed-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 Documentation/ABI/testing/sysfs-bus-rbd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd
index dbedafb095e..bcd88eb7ebc 100644
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@@ -65,11 +65,11 @@ snap_*
 Entries under /sys/bus/rbd/devices/<dev-id>/snap_<snap-name>
 -------------------------------------------------------------
 
-id
+snap_id
 
 	The rados internal snapshot id assigned for this snapshot
 
-size
+snap_size
 
 	The size of the image when this snapshot was taken.
 
-- 
cgit v1.2.3-18-g5258


From 57dac9d1620942608306d8c17c98a9d1568ffdf4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 10 May 2012 10:29:50 -0500
Subject: ceph: messenger: use read_partial() in read_partial_message()

There are two blocks of code in read_partial_message()--those that
read the header and footer of the message--that can be replaced by a
call to read_partial().  Do that.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index f0993af2ae4..673133ee318 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1628,7 +1628,7 @@ static int read_partial_message(struct ceph_connection *con)
 {
 	struct ceph_msg *m = con->in_msg;
 	int ret;
-	int to, left;
+	int to;
 	unsigned front_len, middle_len, data_len;
 	bool do_datacrc = !con->msgr->nocrc;
 	int skip;
@@ -1638,15 +1638,10 @@ static int read_partial_message(struct ceph_connection *con)
 	dout("read_partial_message con %p msg %p\n", con, m);
 
 	/* header */
-	while (con->in_base_pos < sizeof(con->in_hdr)) {
-		left = sizeof(con->in_hdr) - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock,
-				       (char *)&con->in_hdr + con->in_base_pos,
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
+	to = 0;
+	ret = read_partial(con, &to, sizeof (con->in_hdr), &con->in_hdr);
+	if (ret <= 0)
+		return ret;
 
 	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
 	if (cpu_to_le32(crc) != con->in_hdr.crc) {
@@ -1759,16 +1754,11 @@ static int read_partial_message(struct ceph_connection *con)
 	}
 
 	/* footer */
-	to = sizeof(m->hdr) + sizeof(m->footer);
-	while (con->in_base_pos < to) {
-		left = to - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
-				       (con->in_base_pos - sizeof(m->hdr)),
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
+	to = sizeof (m->hdr);
+	ret = read_partial(con, &to, sizeof (m->footer), &m->footer);
+	if (ret <= 0)
+		return ret;
+
 	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
 	     m, front_len, m->footer.front_crc, middle_len,
 	     m->footer.middle_crc, data_len, m->footer.data_crc);
-- 
cgit v1.2.3-18-g5258


From e6cee71fac27c946a0bbad754dd076e66c4e9dbd Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 10 May 2012 10:29:50 -0500
Subject: ceph: messenger: update "to" in read_partial() caller

read_partial() always increases whatever "to" value is supplied by
adding the requested size to it, and that's the only thing it does
with that pointed-to value.

Do that pointer advance in the caller (and then only when the
updated value will be subsequently used), and change the "to"
parameter to be an in-only and non-pointer value.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 673133ee318..37fd2aece23 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -992,11 +992,12 @@ static int prepare_read_message(struct ceph_connection *con)
 
 
 static int read_partial(struct ceph_connection *con,
-			int *to, int size, void *object)
+			int to, int size, void *object)
 {
-	*to += size;
-	while (con->in_base_pos < *to) {
-		int left = *to - con->in_base_pos;
+	int end = to + size;
+
+	while (con->in_base_pos < end) {
+		int left = end - con->in_base_pos;
 		int have = size - left;
 		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
 		if (ret <= 0)
@@ -1017,14 +1018,16 @@ static int read_partial_banner(struct ceph_connection *con)
 	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
 
 	/* peer's banner */
-	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+	ret = read_partial(con, to, strlen(CEPH_BANNER), con->in_banner);
 	if (ret <= 0)
 		goto out;
-	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+	to += strlen(CEPH_BANNER);
+	ret = read_partial(con, to, sizeof(con->actual_peer_addr),
 			   &con->actual_peer_addr);
 	if (ret <= 0)
 		goto out;
-	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+	to += sizeof(con->actual_peer_addr);
+	ret = read_partial(con, to, sizeof(con->peer_addr_for_me),
 			   &con->peer_addr_for_me);
 	if (ret <= 0)
 		goto out;
@@ -1038,10 +1041,11 @@ static int read_partial_connect(struct ceph_connection *con)
 
 	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
 
-	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+	ret = read_partial(con, to, sizeof(con->in_reply), &con->in_reply);
 	if (ret <= 0)
 		goto out;
-	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+	to += sizeof(con->in_reply);
+	ret = read_partial(con, to, le32_to_cpu(con->in_reply.authorizer_len),
 			   con->auth_reply_buf);
 	if (ret <= 0)
 		goto out;
@@ -1491,9 +1495,7 @@ static int process_connect(struct ceph_connection *con)
  */
 static int read_partial_ack(struct ceph_connection *con)
 {
-	int to = 0;
-
-	return read_partial(con, &to, sizeof(con->in_temp_ack),
+	return read_partial(con, 0, sizeof(con->in_temp_ack),
 			    &con->in_temp_ack);
 }
 
@@ -1638,8 +1640,7 @@ static int read_partial_message(struct ceph_connection *con)
 	dout("read_partial_message con %p msg %p\n", con, m);
 
 	/* header */
-	to = 0;
-	ret = read_partial(con, &to, sizeof (con->in_hdr), &con->in_hdr);
+	ret = read_partial(con, 0, sizeof (con->in_hdr), &con->in_hdr);
 	if (ret <= 0)
 		return ret;
 
@@ -1755,7 +1756,7 @@ static int read_partial_message(struct ceph_connection *con)
 
 	/* footer */
 	to = sizeof (m->hdr);
-	ret = read_partial(con, &to, sizeof (m->footer), &m->footer);
+	ret = read_partial(con, to, sizeof (m->footer), &m->footer);
 	if (ret <= 0)
 		return ret;
 
-- 
cgit v1.2.3-18-g5258


From fd51653f78cf40a0516e521b6de22f329c5bad8d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Thu, 10 May 2012 10:29:50 -0500
Subject: ceph: messenger: change read_partial() to take "end" arg

Make the second argument to read_partial() be the ending input byte
position rather than the beginning offset it now represents.  This
amounts to moving the addition "to + size" into the caller.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 60 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 22 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 37fd2aece23..a659b4de64a 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -992,10 +992,8 @@ static int prepare_read_message(struct ceph_connection *con)
 
 
 static int read_partial(struct ceph_connection *con,
-			int to, int size, void *object)
+			int end, int size, void *object)
 {
-	int end = to + size;
-
 	while (con->in_base_pos < end) {
 		int left = end - con->in_base_pos;
 		int have = size - left;
@@ -1013,40 +1011,52 @@ static int read_partial(struct ceph_connection *con,
  */
 static int read_partial_banner(struct ceph_connection *con)
 {
-	int ret, to = 0;
+	int size;
+	int end;
+	int ret;
 
 	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
 
 	/* peer's banner */
-	ret = read_partial(con, to, strlen(CEPH_BANNER), con->in_banner);
+	size = strlen(CEPH_BANNER);
+	end = size;
+	ret = read_partial(con, end, size, con->in_banner);
 	if (ret <= 0)
 		goto out;
-	to += strlen(CEPH_BANNER);
-	ret = read_partial(con, to, sizeof(con->actual_peer_addr),
-			   &con->actual_peer_addr);
+
+	size = sizeof (con->actual_peer_addr);
+	end += size;
+	ret = read_partial(con, end, size, &con->actual_peer_addr);
 	if (ret <= 0)
 		goto out;
-	to += sizeof(con->actual_peer_addr);
-	ret = read_partial(con, to, sizeof(con->peer_addr_for_me),
-			   &con->peer_addr_for_me);
+
+	size = sizeof (con->peer_addr_for_me);
+	end += size;
+	ret = read_partial(con, end, size, &con->peer_addr_for_me);
 	if (ret <= 0)
 		goto out;
+
 out:
 	return ret;
 }
 
 static int read_partial_connect(struct ceph_connection *con)
 {
-	int ret, to = 0;
+	int size;
+	int end;
+	int ret;
 
 	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
 
-	ret = read_partial(con, to, sizeof(con->in_reply), &con->in_reply);
+	size = sizeof (con->in_reply);
+	end = size;
+	ret = read_partial(con, end, size, &con->in_reply);
 	if (ret <= 0)
 		goto out;
-	to += sizeof(con->in_reply);
-	ret = read_partial(con, to, le32_to_cpu(con->in_reply.authorizer_len),
-			   con->auth_reply_buf);
+
+	size = le32_to_cpu(con->in_reply.authorizer_len);
+	end += size;
+	ret = read_partial(con, end, size, con->auth_reply_buf);
 	if (ret <= 0)
 		goto out;
 
@@ -1495,8 +1505,10 @@ static int process_connect(struct ceph_connection *con)
  */
 static int read_partial_ack(struct ceph_connection *con)
 {
-	return read_partial(con, 0, sizeof(con->in_temp_ack),
-			    &con->in_temp_ack);
+	int size = sizeof (con->in_temp_ack);
+	int end = size;
+
+	return read_partial(con, end, size, &con->in_temp_ack);
 }
 
 
@@ -1629,8 +1641,9 @@ static int read_partial_message_bio(struct ceph_connection *con,
 static int read_partial_message(struct ceph_connection *con)
 {
 	struct ceph_msg *m = con->in_msg;
+	int size;
+	int end;
 	int ret;
-	int to;
 	unsigned front_len, middle_len, data_len;
 	bool do_datacrc = !con->msgr->nocrc;
 	int skip;
@@ -1640,7 +1653,9 @@ static int read_partial_message(struct ceph_connection *con)
 	dout("read_partial_message con %p msg %p\n", con, m);
 
 	/* header */
-	ret = read_partial(con, 0, sizeof (con->in_hdr), &con->in_hdr);
+	size = sizeof (con->in_hdr);
+	end = size;
+	ret = read_partial(con, end, size, &con->in_hdr);
 	if (ret <= 0)
 		return ret;
 
@@ -1755,8 +1770,9 @@ static int read_partial_message(struct ceph_connection *con)
 	}
 
 	/* footer */
-	to = sizeof (m->hdr);
-	ret = read_partial(con, to, sizeof (m->footer), &m->footer);
+	size = sizeof (m->footer);
+	end += size;
+	ret = read_partial(con, end, size, &m->footer);
 	if (ret <= 0)
 		return ret;
 
-- 
cgit v1.2.3-18-g5258


From 702aeb1f88e707241d76e1e2a1a02dd81e6c2d77 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 14 May 2012 12:34:38 -0700
Subject: ceph: fully initialize new layout

When we are setting a new layout, fully initialize the structure:
 - zero it out
 - always set preferred_osd to -1

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/ioctl.c | 19 +++++++++++++------
 fs/ceph/ioctl.h |  2 ++
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 0e2f021109e..c0b7314b90f 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -80,22 +80,29 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 
 	/* validate changed params against current layout */
 	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
-	if (!err) {
-		nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
-		nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
-		nl.object_size = ceph_file_layout_object_size(ci->i_layout);
-		nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
-	} else
+	if (err)
 		return err;
 
+	memset(&nl, 0, sizeof(nl));
 	if (l.stripe_count)
 		nl.stripe_count = l.stripe_count;
+	else
+		nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
 	if (l.stripe_unit)
 		nl.stripe_unit = l.stripe_unit;
+	else
+		nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
 	if (l.object_size)
 		nl.object_size = l.object_size;
+	else
+		nl.object_size = ceph_file_layout_object_size(ci->i_layout);
 	if (l.data_pool)
 		nl.data_pool = l.data_pool;
+	else
+		nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+
+	/* this is obsolete, and always -1 */
+	nl.preferred_osd = le64_to_cpu(-1);
 
 	err = __validate_layout(mdsc, &nl);
 	if (err)
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index be4a6048733..c77028afb1e 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -34,6 +34,8 @@
 struct ceph_ioctl_layout {
 	__u64 stripe_unit, stripe_count, object_size;
 	__u64 data_pool;
+
+	/* obsolete.  new values ignored, always return -1 */
 	__s64 preferred_osd;
 };
 
-- 
cgit v1.2.3-18-g5258


From c047be09349752b8a4dac27bc9f130bf4d592f11 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 14 May 2012 12:34:50 -0700
Subject: ceph: ignore preferred_osd field

Old users may not expect EINVAL, and there is no clear user-visibile
behavior change now that we ignore it.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/ioctl.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index c0b7314b90f..8e3fb69fbe6 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -39,10 +39,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
 {
 	int i, err;
 
-	/* preferred_osd is no longer supported */
-	if (l->preferred_osd != -1)
-		return -EINVAL;
-
 	/* validate striping parameters */
 	if ((l->object_size & ~PAGE_MASK) ||
 	    (l->stripe_unit & ~PAGE_MASK) ||
-- 
cgit v1.2.3-18-g5258


From d329156f16306449c273002486c28de3ddddfd89 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: libceph: don't reset kvec in prepare_write_banner()

Move the kvec reset for a connection out of prepare_write_banner and
into its only caller.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a659b4de64a..bcbd409b5ca 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -686,7 +686,6 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 static void prepare_write_banner(struct ceph_messenger *msgr,
 				 struct ceph_connection *con)
 {
-	ceph_con_out_kvec_reset(con);
 	ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
 	ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr),
 					&msgr->my_enc_addr);
@@ -726,10 +725,9 @@ static int prepare_write_connect(struct ceph_messenger *msgr,
 	con->out_connect.protocol_version = cpu_to_le32(proto);
 	con->out_connect.flags = 0;
 
+	ceph_con_out_kvec_reset(con);
 	if (include_banner)
 		prepare_write_banner(msgr, con);
-	else
-		ceph_con_out_kvec_reset(con);
 	ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
 
 	con->out_more = 0;
-- 
cgit v1.2.3-18-g5258


From 84fb3adf6413862cff51d8af3fce5f0b655586a2 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: ceph: messenger: reset connection kvec caller

Reset a connection's kvec fields in the caller rather than in
prepare_write_connect().   This ends up repeating a few lines of
code but it's improving the separation between distinct operations
on the connection, which we can take advantage of later.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index bcbd409b5ca..cca3cf341d7 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -725,7 +725,6 @@ static int prepare_write_connect(struct ceph_messenger *msgr,
 	con->out_connect.protocol_version = cpu_to_le32(proto);
 	con->out_connect.flags = 0;
 
-	ceph_con_out_kvec_reset(con);
 	if (include_banner)
 		prepare_write_banner(msgr, con);
 	ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
@@ -1389,6 +1388,7 @@ static int process_connect(struct ceph_connection *con)
 			return -1;
 		}
 		con->auth_retry = 1;
+		ceph_con_out_kvec_reset(con);
 		ret = prepare_write_connect(con->msgr, con, 0);
 		if (ret < 0)
 			return ret;
@@ -1409,6 +1409,7 @@ static int process_connect(struct ceph_connection *con)
 		       ENTITY_NAME(con->peer_name),
 		       ceph_pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
+		ceph_con_out_kvec_reset(con);
 		prepare_write_connect(con->msgr, con, 0);
 		prepare_read_connect(con);
 
@@ -1432,6 +1433,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->out_connect.connect_seq),
 		     le32_to_cpu(con->in_connect.connect_seq));
 		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+		ceph_con_out_kvec_reset(con);
 		prepare_write_connect(con->msgr, con, 0);
 		prepare_read_connect(con);
 		break;
@@ -1446,6 +1448,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->in_connect.global_seq));
 		get_global_seq(con->msgr,
 			       le32_to_cpu(con->in_connect.global_seq));
+		ceph_con_out_kvec_reset(con);
 		prepare_write_connect(con->msgr, con, 0);
 		prepare_read_connect(con);
 		break;
@@ -1851,6 +1854,7 @@ more:
 
 	/* open the socket first? */
 	if (con->sock == NULL) {
+		ceph_con_out_kvec_reset(con);
 		prepare_write_connect(msgr, con, 1);
 		prepare_read_banner(con);
 		set_bit(CONNECTING, &con->state);
-- 
cgit v1.2.3-18-g5258


From 41b90c00858129f52d08e6a05c9cfdb0f2bd074d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: ceph: messenger: send banner in process_connect()

prepare_write_connect() has an argument indicating whether a banner
should be sent out before sending out a connection message.  It's
only ever set in one of its callers, so move the code that arranges
to send the banner into that caller and drop the "include_banner"
argument from prepare_write_connect().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index cca3cf341d7..6b38b6fbb25 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -695,8 +695,7 @@ static void prepare_write_banner(struct ceph_messenger *msgr,
 }
 
 static int prepare_write_connect(struct ceph_messenger *msgr,
-				 struct ceph_connection *con,
-				 int include_banner)
+				 struct ceph_connection *con)
 {
 	unsigned global_seq = get_global_seq(con->msgr, 0);
 	int proto;
@@ -725,8 +724,6 @@ static int prepare_write_connect(struct ceph_messenger *msgr,
 	con->out_connect.protocol_version = cpu_to_le32(proto);
 	con->out_connect.flags = 0;
 
-	if (include_banner)
-		prepare_write_banner(msgr, con);
 	ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
 
 	con->out_more = 0;
@@ -1389,7 +1386,7 @@ static int process_connect(struct ceph_connection *con)
 		}
 		con->auth_retry = 1;
 		ceph_con_out_kvec_reset(con);
-		ret = prepare_write_connect(con->msgr, con, 0);
+		ret = prepare_write_connect(con->msgr, con);
 		if (ret < 0)
 			return ret;
 		prepare_read_connect(con);
@@ -1410,7 +1407,7 @@ static int process_connect(struct ceph_connection *con)
 		       ceph_pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con->msgr, con, 0);
+		prepare_write_connect(con->msgr, con);
 		prepare_read_connect(con);
 
 		/* Tell ceph about it. */
@@ -1434,7 +1431,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->in_connect.connect_seq));
 		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con->msgr, con, 0);
+		prepare_write_connect(con->msgr, con);
 		prepare_read_connect(con);
 		break;
 
@@ -1449,7 +1446,7 @@ static int process_connect(struct ceph_connection *con)
 		get_global_seq(con->msgr,
 			       le32_to_cpu(con->in_connect.global_seq));
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con->msgr, con, 0);
+		prepare_write_connect(con->msgr, con);
 		prepare_read_connect(con);
 		break;
 
@@ -1855,7 +1852,8 @@ more:
 	/* open the socket first? */
 	if (con->sock == NULL) {
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(msgr, con, 1);
+		prepare_write_banner(msgr, con);
+		prepare_write_connect(msgr, con);
 		prepare_read_banner(con);
 		set_bit(CONNECTING, &con->state);
 		clear_bit(NEGOTIATING, &con->state);
-- 
cgit v1.2.3-18-g5258


From e825a66df97776d30a48a187e3a986736af43945 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: ceph: drop msgr argument from prepare_write_connect()

In all cases, the value passed as the msgr argument to
prepare_write_connect() is just con->msgr.  Just get the msgr
value from the ceph connection and drop the unneeded argument.

The only msgr passed to prepare_write_banner() is also therefore
just the one from con->msgr, so change that function to drop the
msgr argument as well.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 6b38b6fbb25..47499dc0e41 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -683,19 +683,17 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 /*
  * We connected to a peer and are saying hello.
  */
-static void prepare_write_banner(struct ceph_messenger *msgr,
-				 struct ceph_connection *con)
+static void prepare_write_banner(struct ceph_connection *con)
 {
 	ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
-	ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr),
-					&msgr->my_enc_addr);
+	ceph_con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+					&con->msgr->my_enc_addr);
 
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
 }
 
-static int prepare_write_connect(struct ceph_messenger *msgr,
-				 struct ceph_connection *con)
+static int prepare_write_connect(struct ceph_connection *con)
 {
 	unsigned global_seq = get_global_seq(con->msgr, 0);
 	int proto;
@@ -717,7 +715,7 @@ static int prepare_write_connect(struct ceph_messenger *msgr,
 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
 	     con->connect_seq, global_seq, proto);
 
-	con->out_connect.features = cpu_to_le64(msgr->supported_features);
+	con->out_connect.features = cpu_to_le64(con->msgr->supported_features);
 	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
 	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
 	con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1386,7 +1384,7 @@ static int process_connect(struct ceph_connection *con)
 		}
 		con->auth_retry = 1;
 		ceph_con_out_kvec_reset(con);
-		ret = prepare_write_connect(con->msgr, con);
+		ret = prepare_write_connect(con);
 		if (ret < 0)
 			return ret;
 		prepare_read_connect(con);
@@ -1407,7 +1405,7 @@ static int process_connect(struct ceph_connection *con)
 		       ceph_pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con->msgr, con);
+		prepare_write_connect(con);
 		prepare_read_connect(con);
 
 		/* Tell ceph about it. */
@@ -1431,7 +1429,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->in_connect.connect_seq));
 		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con->msgr, con);
+		prepare_write_connect(con);
 		prepare_read_connect(con);
 		break;
 
@@ -1446,7 +1444,7 @@ static int process_connect(struct ceph_connection *con)
 		get_global_seq(con->msgr,
 			       le32_to_cpu(con->in_connect.global_seq));
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con->msgr, con);
+		prepare_write_connect(con);
 		prepare_read_connect(con);
 		break;
 
@@ -1840,7 +1838,6 @@ static void process_message(struct ceph_connection *con)
  */
 static int try_write(struct ceph_connection *con)
 {
-	struct ceph_messenger *msgr = con->msgr;
 	int ret = 1;
 
 	dout("try_write start %p state %lu nref %d\n", con, con->state,
@@ -1852,8 +1849,8 @@ more:
 	/* open the socket first? */
 	if (con->sock == NULL) {
 		ceph_con_out_kvec_reset(con);
-		prepare_write_banner(msgr, con);
-		prepare_write_connect(msgr, con);
+		prepare_write_banner(con);
+		prepare_write_connect(con);
 		prepare_read_banner(con);
 		set_bit(CONNECTING, &con->state);
 		clear_bit(NEGOTIATING, &con->state);
-- 
cgit v1.2.3-18-g5258


From e10c758e4031a801ea4d2f8fb39bf14c2658d74b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: ceph: don't set WRITE_PENDING too early

prepare_write_connect() prepares a connect message, then sets
WRITE_PENDING on the connection.  Then *after* this, it calls
prepare_connect_authorizer(), which updates the content of the
connection buffer already queued for sending.  It's also possible it
will result in prepare_write_connect() returning -EAGAIN despite the
WRITE_PENDING big getting set.

Fix this by preparing the connect authorizer first, setting the
WRITE_PENDING bit only after that is done.

Partially addresses http://tracker.newdream.net/issues/2424

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 47499dc0e41..cf292939dd1 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -697,6 +697,7 @@ static int prepare_write_connect(struct ceph_connection *con)
 {
 	unsigned global_seq = get_global_seq(con->msgr, 0);
 	int proto;
+	int ret;
 
 	switch (con->peer_name.type) {
 	case CEPH_ENTITY_TYPE_MON:
@@ -723,11 +724,14 @@ static int prepare_write_connect(struct ceph_connection *con)
 	con->out_connect.flags = 0;
 
 	ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
+	ret = prepare_connect_authorizer(con);
+	if (ret)
+		return ret;
 
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
 
-	return prepare_connect_authorizer(con);
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3-18-g5258


From 5a0f8fdd8a0ebe320952a388331dc043d7e14ced Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 21:51:59 -0500
Subject: ceph: messenger: check prepare_write_connect() result

prepare_write_connect() can return an error, but only one of its
callers checks for it.  All the rest are in functions that already
return errors, so it should be fine to return the error if one
gets returned.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index cf292939dd1..8e76936a8c1 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1409,7 +1409,9 @@ static int process_connect(struct ceph_connection *con)
 		       ceph_pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
 		prepare_read_connect(con);
 
 		/* Tell ceph about it. */
@@ -1433,7 +1435,9 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->in_connect.connect_seq));
 		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
 		prepare_read_connect(con);
 		break;
 
@@ -1448,7 +1452,9 @@ static int process_connect(struct ceph_connection *con)
 		get_global_seq(con->msgr,
 			       le32_to_cpu(con->in_connect.global_seq));
 		ceph_con_out_kvec_reset(con);
-		prepare_write_connect(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
 		prepare_read_connect(con);
 		break;
 
@@ -1854,7 +1860,9 @@ more:
 	if (con->sock == NULL) {
 		ceph_con_out_kvec_reset(con);
 		prepare_write_banner(con);
-		prepare_write_connect(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			goto out;
 		prepare_read_banner(con);
 		set_bit(CONNECTING, &con->state);
 		clear_bit(NEGOTIATING, &con->state);
-- 
cgit v1.2.3-18-g5258


From b1c6b9803f5491e94041e6da96bc9dec3870e792 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: ceph: messenger: rework prepare_connect_authorizer()

Change prepare_connect_authorizer() so it returns without dropping
the connection mutex if the connection has no get_authorizer method.

Use the symbolic CEPH_AUTH_UNKNOWN instead of 0 when assigning
authorization protocols.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 8e76936a8c1..09409a3d950 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -656,19 +656,29 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 static int prepare_connect_authorizer(struct ceph_connection *con)
 {
 	void *auth_buf;
-	int auth_len = 0;
-	int auth_protocol = 0;
+	int auth_len;
+	int auth_protocol;
+
+	if (!con->ops->get_authorizer) {
+		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
+		con->out_connect.authorizer_len = 0;
+
+		return 0;
+	}
+
+	/* Can't hold the mutex while getting authorizer */
 
 	mutex_unlock(&con->mutex);
-	if (con->ops->get_authorizer)
-		con->ops->get_authorizer(con, &auth_buf, &auth_len,
-					 &auth_protocol, &con->auth_reply_buf,
-					 &con->auth_reply_buf_len,
-					 con->auth_retry);
+
+	auth_buf = NULL;
+	auth_len = 0;
+	auth_protocol = CEPH_AUTH_UNKNOWN;
+	con->ops->get_authorizer(con, &auth_buf, &auth_len, &auth_protocol,
+				&con->auth_reply_buf, &con->auth_reply_buf_len,
+				con->auth_retry);
 	mutex_lock(&con->mutex);
 
-	if (test_bit(CLOSED, &con->state) ||
-	    test_bit(OPENING, &con->state))
+	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state))
 		return -EAGAIN;
 
 	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
-- 
cgit v1.2.3-18-g5258


From ed96af646011412c2bf1ffe860db170db355fae5 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: ceph: messenger: check return from get_authorizer

In prepare_connect_authorizer(), a connection's get_authorizer
method is called but ignores its return value.  This function can
return an error, so check for it and return it if that ever occurs.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 09409a3d950..e0532d5b22f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -658,6 +658,7 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 	void *auth_buf;
 	int auth_len;
 	int auth_protocol;
+	int ret;
 
 	if (!con->ops->get_authorizer) {
 		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
@@ -673,11 +674,14 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 	auth_buf = NULL;
 	auth_len = 0;
 	auth_protocol = CEPH_AUTH_UNKNOWN;
-	con->ops->get_authorizer(con, &auth_buf, &auth_len, &auth_protocol,
-				&con->auth_reply_buf, &con->auth_reply_buf_len,
-				con->auth_retry);
+	ret = con->ops->get_authorizer(con, &auth_buf, &auth_len,
+				&auth_protocol, &con->auth_reply_buf,
+				&con->auth_reply_buf_len, con->auth_retry);
 	mutex_lock(&con->mutex);
 
+	if (ret)
+		return ret;
+
 	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state))
 		return -EAGAIN;
 
-- 
cgit v1.2.3-18-g5258


From 6c4a19158b96ea1fb8acbe0c1d5493d9dcd2f147 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: ceph: define ceph_auth_handshake type

The definitions for the ceph_mds_session and ceph_osd both contain
five fields related only to "authorizers."  Encapsulate those fields
into their own struct type, allowing for better isolation in some
upcoming patches.

Fix the #includes in "linux/ceph/osd_client.h" to lay out their more
complete canonical path.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c            | 32 ++++++++++++++++----------------
 fs/ceph/mds_client.h            |  5 ++---
 include/linux/ceph/auth.h       |  8 ++++++++
 include/linux/ceph/osd_client.h | 11 +++++------
 net/ceph/osd_client.c           | 32 ++++++++++++++++----------------
 5 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 89971e137aa..42013c62048 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -334,10 +334,10 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 	dout("mdsc put_session %p %d -> %d\n", s,
 	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
 	if (atomic_dec_and_test(&s->s_ref)) {
-		if (s->s_authorizer)
+		if (s->s_auth.authorizer)
 		     s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
 			     s->s_mdsc->fsc->client->monc.auth,
-			     s->s_authorizer);
+			     s->s_auth.authorizer);
 		kfree(s);
 	}
 }
@@ -3404,29 +3404,29 @@ static int get_authorizer(struct ceph_connection *con,
 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 	int ret = 0;
 
-	if (force_new && s->s_authorizer) {
-		ac->ops->destroy_authorizer(ac, s->s_authorizer);
-		s->s_authorizer = NULL;
+	if (force_new && s->s_auth.authorizer) {
+		ac->ops->destroy_authorizer(ac, s->s_auth.authorizer);
+		s->s_auth.authorizer = NULL;
 	}
-	if (s->s_authorizer == NULL) {
+	if (s->s_auth.authorizer == NULL) {
 		if (ac->ops->create_authorizer) {
 			ret = ac->ops->create_authorizer(
 				ac, CEPH_ENTITY_TYPE_MDS,
-				&s->s_authorizer,
-				&s->s_authorizer_buf,
-				&s->s_authorizer_buf_len,
-				&s->s_authorizer_reply_buf,
-				&s->s_authorizer_reply_buf_len);
+				&s->s_auth.authorizer,
+				&s->s_auth.authorizer_buf,
+				&s->s_auth.authorizer_buf_len,
+				&s->s_auth.authorizer_reply_buf,
+				&s->s_auth.authorizer_reply_buf_len);
 			if (ret)
 				return ret;
 		}
 	}
 
 	*proto = ac->protocol;
-	*buf = s->s_authorizer_buf;
-	*len = s->s_authorizer_buf_len;
-	*reply_buf = s->s_authorizer_reply_buf;
-	*reply_len = s->s_authorizer_reply_buf_len;
+	*buf = s->s_auth.authorizer_buf;
+	*len = s->s_auth.authorizer_buf_len;
+	*reply_buf = s->s_auth.authorizer_reply_buf;
+	*reply_len = s->s_auth.authorizer_reply_buf_len;
 	return 0;
 }
 
@@ -3437,7 +3437,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
-	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
+	return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len);
 }
 
 static int invalidate_authorizer(struct ceph_connection *con)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 8c7c04ebb59..dd26846dd71 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -11,6 +11,7 @@
 #include <linux/ceph/types.h>
 #include <linux/ceph/messenger.h>
 #include <linux/ceph/mdsmap.h>
+#include <linux/ceph/auth.h>
 
 /*
  * Some lock dependencies:
@@ -113,9 +114,7 @@ struct ceph_mds_session {
 
 	struct ceph_connection s_con;
 
-	struct ceph_authorizer *s_authorizer;
-	void             *s_authorizer_buf, *s_authorizer_reply_buf;
-	size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len;
+	struct ceph_auth_handshake s_auth;
 
 	/* protected by s_gen_ttl_lock */
 	spinlock_t        s_gen_ttl_lock;
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index aa13392a7ef..5b774d141e0 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -14,6 +14,14 @@
 struct ceph_auth_client;
 struct ceph_authorizer;
 
+struct ceph_auth_handshake {
+	struct ceph_authorizer *authorizer;
+	void *authorizer_buf;
+	size_t authorizer_buf_len;
+	void *authorizer_reply_buf;
+	size_t authorizer_reply_buf_len;
+};
+
 struct ceph_auth_client_ops {
 	const char *name;
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 7c05ac202d9..cedfb1a8434 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -6,9 +6,10 @@
 #include <linux/mempool.h>
 #include <linux/rbtree.h>
 
-#include "types.h"
-#include "osdmap.h"
-#include "messenger.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/auth.h>
 
 /* 
  * Maximum object name size 
@@ -40,9 +41,7 @@ struct ceph_osd {
 	struct list_head o_requests;
 	struct list_head o_linger_requests;
 	struct list_head o_osd_lru;
-	struct ceph_authorizer *o_authorizer;
-	void *o_authorizer_buf, *o_authorizer_reply_buf;
-	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
+	struct ceph_auth_handshake o_auth;
 	unsigned long lru_ttl;
 	int o_marked_for_keepalive;
 	struct list_head o_keepalive_item;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index daa2716a0c3..66b09d6a153 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -667,8 +667,8 @@ static void put_osd(struct ceph_osd *osd)
 	if (atomic_dec_and_test(&osd->o_ref)) {
 		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
 
-		if (osd->o_authorizer)
-			ac->ops->destroy_authorizer(ac, osd->o_authorizer);
+		if (osd->o_auth.authorizer)
+			ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer);
 		kfree(osd);
 	}
 }
@@ -2117,27 +2117,27 @@ static int get_authorizer(struct ceph_connection *con,
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
 	int ret = 0;
 
-	if (force_new && o->o_authorizer) {
-		ac->ops->destroy_authorizer(ac, o->o_authorizer);
-		o->o_authorizer = NULL;
+	if (force_new && o->o_auth.authorizer) {
+		ac->ops->destroy_authorizer(ac, o->o_auth.authorizer);
+		o->o_auth.authorizer = NULL;
 	}
-	if (o->o_authorizer == NULL) {
+	if (o->o_auth.authorizer == NULL) {
 		ret = ac->ops->create_authorizer(
 			ac, CEPH_ENTITY_TYPE_OSD,
-			&o->o_authorizer,
-			&o->o_authorizer_buf,
-			&o->o_authorizer_buf_len,
-			&o->o_authorizer_reply_buf,
-			&o->o_authorizer_reply_buf_len);
+			&o->o_auth.authorizer,
+			&o->o_auth.authorizer_buf,
+			&o->o_auth.authorizer_buf_len,
+			&o->o_auth.authorizer_reply_buf,
+			&o->o_auth.authorizer_reply_buf_len);
 		if (ret)
 			return ret;
 	}
 
 	*proto = ac->protocol;
-	*buf = o->o_authorizer_buf;
-	*len = o->o_authorizer_buf_len;
-	*reply_buf = o->o_authorizer_reply_buf;
-	*reply_len = o->o_authorizer_reply_buf_len;
+	*buf = o->o_auth.authorizer_buf;
+	*len = o->o_auth.authorizer_buf_len;
+	*reply_buf = o->o_auth.authorizer_reply_buf;
+	*reply_len = o->o_auth.authorizer_reply_buf_len;
 	return 0;
 }
 
@@ -2148,7 +2148,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
 
-	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+	return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len);
 }
 
 static int invalidate_authorizer(struct ceph_connection *con)
-- 
cgit v1.2.3-18-g5258


From 74f1869f76d043bad12ec03b4d5f04a8c3d1f157 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:39 -0500
Subject: ceph: messenger: reduce args to create_authorizer

Make use of the new ceph_auth_handshake structure in order to reduce
the number of arguments passed to the create_authorizor method in
ceph_auth_client_ops.  Use a local variable of that type as a
shorthand in the get_authorizer method definitions.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c      | 27 ++++++++++++---------------
 include/linux/ceph/auth.h |  4 +---
 net/ceph/auth_none.c      | 15 +++++++--------
 net/ceph/auth_x.c         | 15 +++++++--------
 net/ceph/osd_client.c     | 28 ++++++++++++----------------
 5 files changed, 39 insertions(+), 50 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 42013c62048..b71ffd2c809 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3402,31 +3402,28 @@ static int get_authorizer(struct ceph_connection *con,
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &s->s_auth;
 	int ret = 0;
 
-	if (force_new && s->s_auth.authorizer) {
-		ac->ops->destroy_authorizer(ac, s->s_auth.authorizer);
-		s->s_auth.authorizer = NULL;
+	if (force_new && auth->authorizer) {
+		ac->ops->destroy_authorizer(ac, auth->authorizer);
+		auth->authorizer = NULL;
 	}
-	if (s->s_auth.authorizer == NULL) {
+	if (auth->authorizer == NULL) {
 		if (ac->ops->create_authorizer) {
-			ret = ac->ops->create_authorizer(
-				ac, CEPH_ENTITY_TYPE_MDS,
-				&s->s_auth.authorizer,
-				&s->s_auth.authorizer_buf,
-				&s->s_auth.authorizer_buf_len,
-				&s->s_auth.authorizer_reply_buf,
-				&s->s_auth.authorizer_reply_buf_len);
+			ret = ac->ops->create_authorizer(ac,
+						CEPH_ENTITY_TYPE_MDS, auth);
 			if (ret)
 				return ret;
 		}
 	}
 
 	*proto = ac->protocol;
-	*buf = s->s_auth.authorizer_buf;
-	*len = s->s_auth.authorizer_buf_len;
-	*reply_buf = s->s_auth.authorizer_reply_buf;
-	*reply_len = s->s_auth.authorizer_reply_buf_len;
+	*buf = auth->authorizer_buf;
+	*len = auth->authorizer_buf_len;
+	*reply_buf = auth->authorizer_reply_buf;
+	*reply_len = auth->authorizer_reply_buf_len;
+
 	return 0;
 }
 
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 5b774d141e0..d4080f309b5 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -51,9 +51,7 @@ struct ceph_auth_client_ops {
 	 * the response to authenticate the service.
 	 */
 	int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
-				 struct ceph_authorizer **a,
-				 void **buf, size_t *len,
-				 void **reply_buf, size_t *reply_len);
+				 struct ceph_auth_handshake *auth);
 	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
 				       struct ceph_authorizer *a, size_t len);
 	void (*destroy_authorizer)(struct ceph_auth_client *ac,
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 214c2bb43d6..925ca583c09 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -59,9 +59,7 @@ static int handle_reply(struct ceph_auth_client *ac, int result,
  */
 static int ceph_auth_none_create_authorizer(
 	struct ceph_auth_client *ac, int peer_type,
-	struct ceph_authorizer **a,
-	void **buf, size_t *len,
-	void **reply_buf, size_t *reply_len)
+	struct ceph_auth_handshake *auth)
 {
 	struct ceph_auth_none_info *ai = ac->private;
 	struct ceph_none_authorizer *au = &ai->au;
@@ -82,11 +80,12 @@ static int ceph_auth_none_create_authorizer(
 		dout("built authorizer len %d\n", au->buf_len);
 	}
 
-	*a = (struct ceph_authorizer *)au;
-	*buf = au->buf;
-	*len = au->buf_len;
-	*reply_buf = au->reply_buf;
-	*reply_len = sizeof(au->reply_buf);
+	auth->authorizer = (struct ceph_authorizer *) au;
+	auth->authorizer_buf = au->buf;
+	auth->authorizer_buf_len = au->buf_len;
+	auth->authorizer_reply_buf = au->reply_buf;
+	auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
+
 	return 0;
 
 bad2:
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 1587dc6010c..a16bf14eb02 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -526,9 +526,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 
 static int ceph_x_create_authorizer(
 	struct ceph_auth_client *ac, int peer_type,
-	struct ceph_authorizer **a,
-	void **buf, size_t *len,
-	void **reply_buf, size_t *reply_len)
+	struct ceph_auth_handshake *auth)
 {
 	struct ceph_x_authorizer *au;
 	struct ceph_x_ticket_handler *th;
@@ -548,11 +546,12 @@ static int ceph_x_create_authorizer(
 		return ret;
 	}
 
-	*a = (struct ceph_authorizer *)au;
-	*buf = au->buf->vec.iov_base;
-	*len = au->buf->vec.iov_len;
-	*reply_buf = au->reply_buf;
-	*reply_len = sizeof(au->reply_buf);
+	auth->authorizer = (struct ceph_authorizer *) au;
+	auth->authorizer_buf = au->buf->vec.iov_base;
+	auth->authorizer_buf_len = au->buf->vec.iov_len;
+	auth->authorizer_reply_buf = au->reply_buf;
+	auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
+
 	return 0;
 }
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 66b09d6a153..2da4b9e97dc 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2115,29 +2115,25 @@ static int get_authorizer(struct ceph_connection *con,
 	struct ceph_osd *o = con->private;
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &o->o_auth;
 	int ret = 0;
 
-	if (force_new && o->o_auth.authorizer) {
-		ac->ops->destroy_authorizer(ac, o->o_auth.authorizer);
-		o->o_auth.authorizer = NULL;
-	}
-	if (o->o_auth.authorizer == NULL) {
-		ret = ac->ops->create_authorizer(
-			ac, CEPH_ENTITY_TYPE_OSD,
-			&o->o_auth.authorizer,
-			&o->o_auth.authorizer_buf,
-			&o->o_auth.authorizer_buf_len,
-			&o->o_auth.authorizer_reply_buf,
-			&o->o_auth.authorizer_reply_buf_len);
+	if (force_new && auth->authorizer) {
+		ac->ops->destroy_authorizer(ac, auth->authorizer);
+		auth->authorizer = NULL;
+	}
+	if (auth->authorizer == NULL) {
+		ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, auth);
 		if (ret)
 			return ret;
 	}
 
 	*proto = ac->protocol;
-	*buf = o->o_auth.authorizer_buf;
-	*len = o->o_auth.authorizer_buf_len;
-	*reply_buf = o->o_auth.authorizer_reply_buf;
-	*reply_len = o->o_auth.authorizer_reply_buf_len;
+	*buf = auth->authorizer_buf;
+	*len = auth->authorizer_buf_len;
+	*reply_buf = auth->authorizer_reply_buf;
+	*reply_len = auth->authorizer_reply_buf_len;
+
 	return 0;
 }
 
-- 
cgit v1.2.3-18-g5258


From a255651d4cad89f1a606edd36135af892ada4f20 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:39 -0500
Subject: ceph: ensure auth ops are defined before use

In the create_authorizer method for both the mds and osd clients,
the auth_client->ops pointer is blindly dereferenced.  There is no
obvious guarantee that this pointer has been assigned.  And
furthermore, even if the ops pointer is non-null there is definitely
no guarantee that the create_authorizer or destroy_authorizer
methods are defined.

Add checks in both routines to make sure they are defined (non-null)
before use.  Add similar checks in a few other spots in these files
while we're at it.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c  | 14 ++++++--------
 net/ceph/osd_client.c | 15 ++++++++++-----
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b71ffd2c809..462281742ae 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3406,16 +3406,14 @@ static int get_authorizer(struct ceph_connection *con,
 	int ret = 0;
 
 	if (force_new && auth->authorizer) {
-		ac->ops->destroy_authorizer(ac, auth->authorizer);
+		if (ac->ops && ac->ops->destroy_authorizer)
+			ac->ops->destroy_authorizer(ac, auth->authorizer);
 		auth->authorizer = NULL;
 	}
-	if (auth->authorizer == NULL) {
-		if (ac->ops->create_authorizer) {
-			ret = ac->ops->create_authorizer(ac,
-						CEPH_ENTITY_TYPE_MDS, auth);
-			if (ret)
-				return ret;
-		}
+	if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
+		ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, auth);
+		if (ret)
+			return ret;
 	}
 
 	*proto = ac->protocol;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2da4b9e97dc..f640bdf027e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -664,10 +664,10 @@ static void put_osd(struct ceph_osd *osd)
 {
 	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
 	     atomic_read(&osd->o_ref) - 1);
-	if (atomic_dec_and_test(&osd->o_ref)) {
+	if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) {
 		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
 
-		if (osd->o_auth.authorizer)
+		if (ac->ops && ac->ops->destroy_authorizer)
 			ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer);
 		kfree(osd);
 	}
@@ -2119,10 +2119,11 @@ static int get_authorizer(struct ceph_connection *con,
 	int ret = 0;
 
 	if (force_new && auth->authorizer) {
-		ac->ops->destroy_authorizer(ac, auth->authorizer);
+		if (ac->ops && ac->ops->destroy_authorizer)
+			ac->ops->destroy_authorizer(ac, auth->authorizer);
 		auth->authorizer = NULL;
 	}
-	if (auth->authorizer == NULL) {
+	if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
 		ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, auth);
 		if (ret)
 			return ret;
@@ -2144,6 +2145,10 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
 
+	/*
+	 * XXX If ac->ops or ac->ops->verify_authorizer_reply is null,
+	 * XXX which do we do:  succeed or fail?
+	 */
 	return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len);
 }
 
@@ -2153,7 +2158,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
 
-	if (ac->ops->invalidate_authorizer)
+	if (ac->ops && ac->ops->invalidate_authorizer)
 		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
 
 	return ceph_monc_validate_auth(&osdc->client->monc);
-- 
cgit v1.2.3-18-g5258


From a3530df33eb91d787d08c7383a0a9982690e42d0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:39 -0500
Subject: ceph: have get_authorizer methods return pointers

Have the get_authorizer auth_client method return a ceph_auth
pointer rather than an integer, pointer-encoding any returned
error value.  This is to pave the way for making use of the
returned value in an upcoming patch.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c           | 20 +++++++++++++-------
 include/linux/ceph/messenger.h |  8 +++++---
 net/ceph/messenger.c           |  8 ++++----
 net/ceph/osd_client.c          | 19 ++++++++++++-------
 4 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 462281742ae..67938a9d049 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3395,15 +3395,20 @@ out:
 /*
  * authentication
  */
-static int get_authorizer(struct ceph_connection *con,
-			  void **buf, int *len, int *proto,
-			  void **reply_buf, int *reply_len, int force_new)
+
+/*
+ * Note: returned pointer is the address of a structure that's
+ * managed separately.  Caller must *not* attempt to free it.
+ */
+static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
+					void **buf, int *len, int *proto,
+					void **reply_buf, int *reply_len,
+					int force_new)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 	struct ceph_auth_handshake *auth = &s->s_auth;
-	int ret = 0;
 
 	if (force_new && auth->authorizer) {
 		if (ac->ops && ac->ops->destroy_authorizer)
@@ -3411,9 +3416,10 @@ static int get_authorizer(struct ceph_connection *con,
 		auth->authorizer = NULL;
 	}
 	if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
-		ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, auth);
+		int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+							auth);
 		if (ret)
-			return ret;
+			return ERR_PTR(ret);
 	}
 
 	*proto = ac->protocol;
@@ -3422,7 +3428,7 @@ static int get_authorizer(struct ceph_connection *con,
 	*reply_buf = auth->authorizer_reply_buf;
 	*reply_len = auth->authorizer_reply_buf_len;
 
-	return 0;
+	return auth;
 }
 
 
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 3bff047f6b0..b10b55f8f30 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -25,9 +25,11 @@ struct ceph_connection_operations {
 	void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
 
 	/* authorize an outgoing connection */
-	int (*get_authorizer) (struct ceph_connection *con,
-			       void **buf, int *len, int *proto,
-			       void **reply_buf, int *reply_len, int force_new);
+	struct ceph_auth_handshake *(*get_authorizer) (
+				struct ceph_connection *con,
+				void **buf, int *len, int *proto,
+				void **reply_buf, int *reply_len,
+				int force_new);
 	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
 	int (*invalidate_authorizer)(struct ceph_connection *con);
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e0532d5b22f..ac27a2c0694 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -658,7 +658,7 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 	void *auth_buf;
 	int auth_len;
 	int auth_protocol;
-	int ret;
+	struct ceph_auth_handshake *auth;
 
 	if (!con->ops->get_authorizer) {
 		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
@@ -674,13 +674,13 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 	auth_buf = NULL;
 	auth_len = 0;
 	auth_protocol = CEPH_AUTH_UNKNOWN;
-	ret = con->ops->get_authorizer(con, &auth_buf, &auth_len,
+	auth = con->ops->get_authorizer(con, &auth_buf, &auth_len,
 				&auth_protocol, &con->auth_reply_buf,
 				&con->auth_reply_buf_len, con->auth_retry);
 	mutex_lock(&con->mutex);
 
-	if (ret)
-		return ret;
+	if (IS_ERR(auth))
+		return PTR_ERR(auth);
 
 	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state))
 		return -EAGAIN;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index f640bdf027e..fa74ae0ea91 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2108,15 +2108,19 @@ static void put_osd_con(struct ceph_connection *con)
 /*
  * authentication
  */
-static int get_authorizer(struct ceph_connection *con,
-			  void **buf, int *len, int *proto,
-			  void **reply_buf, int *reply_len, int force_new)
+/*
+ * Note: returned pointer is the address of a structure that's
+ * managed separately.  Caller must *not* attempt to free it.
+ */
+static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
+					void **buf, int *len, int *proto,
+					void **reply_buf, int *reply_len,
+					int force_new)
 {
 	struct ceph_osd *o = con->private;
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
 	struct ceph_auth_handshake *auth = &o->o_auth;
-	int ret = 0;
 
 	if (force_new && auth->authorizer) {
 		if (ac->ops && ac->ops->destroy_authorizer)
@@ -2124,9 +2128,10 @@ static int get_authorizer(struct ceph_connection *con,
 		auth->authorizer = NULL;
 	}
 	if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
-		ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, auth);
+		int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
+							auth);
 		if (ret)
-			return ret;
+			return ERR_PTR(ret);
 	}
 
 	*proto = ac->protocol;
@@ -2135,7 +2140,7 @@ static int get_authorizer(struct ceph_connection *con,
 	*reply_buf = auth->authorizer_reply_buf;
 	*reply_len = auth->authorizer_reply_buf_len;
 
-	return 0;
+	return auth;
 }
 
 
-- 
cgit v1.2.3-18-g5258


From 8f43fb53894079bf0caab6e348ceaffe7adc651a Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:39 -0500
Subject: ceph: use info returned by get_authorizer

Rather than passing a bunch of arguments to be filled in with the
content of the ceph_auth_handshake buffer now returned by the
get_authorizer method, just use the returned information in the
caller, and drop the unnecessary arguments.

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 fs/ceph/mds_client.c           |  9 +--------
 include/linux/ceph/messenger.h |  4 +---
 net/ceph/messenger.c           | 13 +++++++------
 net/ceph/osd_client.c          |  9 +--------
 4 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 67938a9d049..200bc87eceb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3401,9 +3401,7 @@ out:
  * managed separately.  Caller must *not* attempt to free it.
  */
 static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
-					void **buf, int *len, int *proto,
-					void **reply_buf, int *reply_len,
-					int force_new)
+					int *proto, int force_new)
 {
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -3421,12 +3419,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 		if (ret)
 			return ERR_PTR(ret);
 	}
-
 	*proto = ac->protocol;
-	*buf = auth->authorizer_buf;
-	*len = auth->authorizer_buf_len;
-	*reply_buf = auth->authorizer_reply_buf;
-	*reply_len = auth->authorizer_reply_buf_len;
 
 	return auth;
 }
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index b10b55f8f30..2521a95fa6d 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -27,9 +27,7 @@ struct ceph_connection_operations {
 	/* authorize an outgoing connection */
 	struct ceph_auth_handshake *(*get_authorizer) (
 				struct ceph_connection *con,
-				void **buf, int *len, int *proto,
-				void **reply_buf, int *reply_len,
-				int force_new);
+			       int *proto, int force_new);
 	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
 	int (*invalidate_authorizer)(struct ceph_connection *con);
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ac27a2c0694..6d82c1a1a89 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -671,20 +671,21 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 
 	mutex_unlock(&con->mutex);
 
-	auth_buf = NULL;
-	auth_len = 0;
 	auth_protocol = CEPH_AUTH_UNKNOWN;
-	auth = con->ops->get_authorizer(con, &auth_buf, &auth_len,
-				&auth_protocol, &con->auth_reply_buf,
-				&con->auth_reply_buf_len, con->auth_retry);
+	auth = con->ops->get_authorizer(con, &auth_protocol, con->auth_retry);
+
 	mutex_lock(&con->mutex);
 
 	if (IS_ERR(auth))
 		return PTR_ERR(auth);
-
 	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state))
 		return -EAGAIN;
 
+	auth_buf = auth->authorizer_buf;
+	auth_len = auth->authorizer_buf_len;
+	con->auth_reply_buf = auth->authorizer_reply_buf;
+	con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
+
 	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
 	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index fa74ae0ea91..b7d633cc96a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2113,9 +2113,7 @@ static void put_osd_con(struct ceph_connection *con)
  * managed separately.  Caller must *not* attempt to free it.
  */
 static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
-					void **buf, int *len, int *proto,
-					void **reply_buf, int *reply_len,
-					int force_new)
+					int *proto, int force_new)
 {
 	struct ceph_osd *o = con->private;
 	struct ceph_osd_client *osdc = o->o_osdc;
@@ -2133,12 +2131,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 		if (ret)
 			return ERR_PTR(ret);
 	}
-
 	*proto = ac->protocol;
-	*buf = auth->authorizer_buf;
-	*len = auth->authorizer_buf_len;
-	*reply_buf = auth->authorizer_reply_buf;
-	*reply_len = auth->authorizer_reply_buf_len;
 
 	return auth;
 }
-- 
cgit v1.2.3-18-g5258


From 729796be9190f57ca40ccca315e8ad34a1eb8fef Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:39 -0500
Subject: ceph: return pointer from prepare_connect_authorizer()

Change prepare_connect_authorizer() so it returns a pointer (or
pointer-coded error).

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 6d82c1a1a89..f92d564c150 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -653,7 +653,7 @@ static void prepare_write_keepalive(struct ceph_connection *con)
  * Connection negotiation.
  */
 
-static int prepare_connect_authorizer(struct ceph_connection *con)
+static struct ceph_auth_handshake *prepare_connect_authorizer(struct ceph_connection *con)
 {
 	void *auth_buf;
 	int auth_len;
@@ -664,7 +664,7 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
 		con->out_connect.authorizer_len = 0;
 
-		return 0;
+		return NULL;
 	}
 
 	/* Can't hold the mutex while getting authorizer */
@@ -677,9 +677,9 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 	mutex_lock(&con->mutex);
 
 	if (IS_ERR(auth))
-		return PTR_ERR(auth);
+		return auth;
 	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state))
-		return -EAGAIN;
+		return ERR_PTR(-EAGAIN);
 
 	auth_buf = auth->authorizer_buf;
 	auth_len = auth->authorizer_buf_len;
@@ -692,7 +692,7 @@ static int prepare_connect_authorizer(struct ceph_connection *con)
 	if (auth_len)
 		ceph_con_out_kvec_add(con, auth_len, auth_buf);
 
-	return 0;
+	return auth;
 }
 
 /*
@@ -712,7 +712,7 @@ static int prepare_write_connect(struct ceph_connection *con)
 {
 	unsigned global_seq = get_global_seq(con->msgr, 0);
 	int proto;
-	int ret;
+	struct ceph_auth_handshake *auth;
 
 	switch (con->peer_name.type) {
 	case CEPH_ENTITY_TYPE_MON:
@@ -739,9 +739,9 @@ static int prepare_write_connect(struct ceph_connection *con)
 	con->out_connect.flags = 0;
 
 	ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
-	ret = prepare_connect_authorizer(con);
-	if (ret)
-		return ret;
+	auth = prepare_connect_authorizer(con);
+	if (IS_ERR(auth))
+		return PTR_ERR(auth);
 
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
-- 
cgit v1.2.3-18-g5258


From dac1e716c60161867a47745bca592987ca3a9cb2 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:39 -0500
Subject: ceph: rename prepare_connect_authorizer()

Change the name of prepare_connect_authorizer().  The next
patch is going to make this function no longer add anything to the
connection's out_kvec, so it will no longer fit the pattern of
the rest of the prepare_connect_*() functions.

In addition, pass the address of a variable that will hold the
authorization protocol to use.  Move the assignment of that to the
connection's out_connect structure into prepare_write_connect().

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index f92d564c150..bfddd87db78 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -653,11 +653,11 @@ static void prepare_write_keepalive(struct ceph_connection *con)
  * Connection negotiation.
  */
 
-static struct ceph_auth_handshake *prepare_connect_authorizer(struct ceph_connection *con)
+static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con,
+						int *auth_proto)
 {
 	void *auth_buf;
 	int auth_len;
-	int auth_protocol;
 	struct ceph_auth_handshake *auth;
 
 	if (!con->ops->get_authorizer) {
@@ -671,8 +671,7 @@ static struct ceph_auth_handshake *prepare_connect_authorizer(struct ceph_connec
 
 	mutex_unlock(&con->mutex);
 
-	auth_protocol = CEPH_AUTH_UNKNOWN;
-	auth = con->ops->get_authorizer(con, &auth_protocol, con->auth_retry);
+	auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
 
 	mutex_lock(&con->mutex);
 
@@ -686,7 +685,6 @@ static struct ceph_auth_handshake *prepare_connect_authorizer(struct ceph_connec
 	con->auth_reply_buf = auth->authorizer_reply_buf;
 	con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
 
-	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
 	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
 
 	if (auth_len)
@@ -712,6 +710,7 @@ static int prepare_write_connect(struct ceph_connection *con)
 {
 	unsigned global_seq = get_global_seq(con->msgr, 0);
 	int proto;
+	int auth_proto;
 	struct ceph_auth_handshake *auth;
 
 	switch (con->peer_name.type) {
@@ -739,9 +738,11 @@ static int prepare_write_connect(struct ceph_connection *con)
 	con->out_connect.flags = 0;
 
 	ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
-	auth = prepare_connect_authorizer(con);
+	auth_proto = CEPH_AUTH_UNKNOWN;
+	auth = get_connect_authorizer(con, &auth_proto);
 	if (IS_ERR(auth))
 		return PTR_ERR(auth);
+	con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
 
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
-- 
cgit v1.2.3-18-g5258


From 3da54776e2c0385c32d143fd497a7f40a88e29dd Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@inktank.com>
Date: Wed, 16 May 2012 15:16:39 -0500
Subject: ceph: add auth buf in prepare_write_connect()

Move the addition of the authorizer buffer to a connection's
out_kvec out of get_connect_authorizer() and into its caller.  This
way, the caller--prepare_write_connect()--can avoid adding the
connect header to out_kvec before it has been fully initialized.

Prior to this patch, it was possible for a connect header to be
sent over the wire before the authorizer protocol or buffer length
fields were initialized.  An authorizer buffer associated with that
header could also be queued to send only after the connection header
that describes it was on the wire.

Fixes http://tracker.newdream.net/issues/2424

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
---
 net/ceph/messenger.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index bfddd87db78..1a80907282c 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -656,8 +656,6 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con,
 						int *auth_proto)
 {
-	void *auth_buf;
-	int auth_len;
 	struct ceph_auth_handshake *auth;
 
 	if (!con->ops->get_authorizer) {
@@ -680,15 +678,9 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
 	if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state))
 		return ERR_PTR(-EAGAIN);
 
-	auth_buf = auth->authorizer_buf;
-	auth_len = auth->authorizer_buf_len;
 	con->auth_reply_buf = auth->authorizer_reply_buf;
 	con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
 
-	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
-
-	if (auth_len)
-		ceph_con_out_kvec_add(con, auth_len, auth_buf);
 
 	return auth;
 }
@@ -737,12 +729,20 @@ static int prepare_write_connect(struct ceph_connection *con)
 	con->out_connect.protocol_version = cpu_to_le32(proto);
 	con->out_connect.flags = 0;
 
-	ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect);
 	auth_proto = CEPH_AUTH_UNKNOWN;
 	auth = get_connect_authorizer(con, &auth_proto);
 	if (IS_ERR(auth))
 		return PTR_ERR(auth);
+
 	con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+	con->out_connect.authorizer_len = auth ?
+		cpu_to_le32(auth->authorizer_buf_len) : 0;
+
+	ceph_con_out_kvec_add(con, sizeof (con->out_connect),
+					&con->out_connect);
+	if (auth && auth->authorizer_buf_len)
+		ceph_con_out_kvec_add(con, auth->authorizer_buf_len,
+					auth->authorizer_buf);
 
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
-- 
cgit v1.2.3-18-g5258


From 35f9f8a09e1e88e31bd34a1e645ca0e5f070dd5c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Wed, 16 May 2012 15:16:38 -0500
Subject: libceph: avoid unregistering osd request when not registered

There is a race between two __unregister_request() callers: the
reply path and the ceph_osdc_wait_request().  If we get a reply
*and* the timeout expires at roughly the same time, both callers
will try to unregister the request, and the second one will do bad
things.

Simply check if the request is still already unregistered; if so,
return immediately and do nothing.

Fixes http://tracker.newdream.net/issues/2420

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osd_client.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b7d633cc96a..b098e7b591f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -841,6 +841,12 @@ static void register_request(struct ceph_osd_client *osdc,
 static void __unregister_request(struct ceph_osd_client *osdc,
 				 struct ceph_osd_request *req)
 {
+	if (RB_EMPTY_NODE(&req->r_node)) {
+		dout("__unregister_request %p tid %lld not registered\n",
+			req, req->r_tid);
+		return;
+	}
+
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 	rb_erase(&req->r_node, &osdc->requests);
 	osdc->num_requests--;
-- 
cgit v1.2.3-18-g5258


From 6bd9adbdf9ca6a052b0b7455ac67b925eb38cfad Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 21 May 2012 09:45:23 -0700
Subject: libceph: fix pg_temp updates

Usually, we are adding pg_temp entries or removing them.  Occasionally they
update.  In that case, osdmap_apply_incremental() was failing because the
rbtree entry already exists.

Fix by removing the existing entry before inserting a new one.

Fixes http://tracker.newdream.net/issues/2446

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 net/ceph/osdmap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 2592f3cca98..1892c523c43 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -883,8 +883,12 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		pglen = ceph_decode_32(p);
 
 		if (pglen) {
-			/* insert */
 			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+
+			/* removing existing (if any) */
+			(void) __remove_pg_mapping(&map->pg_temp, pgid);
+
+			/* insert */
 			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
 			if (!pg) {
 				err = -ENOMEM;
-- 
cgit v1.2.3-18-g5258