aboutsummaryrefslogtreecommitdiff
path: root/drivers/block/rbd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/rbd.c')
-rw-r--r--drivers/block/rbd.c843
1 files changed, 563 insertions, 280 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4ad2ad9a5bb..b2c98c1bc03 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -41,6 +41,7 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/idr.h>
#include "rbd_types.h"
@@ -89,9 +90,9 @@ static int atomic_dec_return_safe(atomic_t *v)
}
#define RBD_DRV_NAME "rbd"
-#define RBD_DRV_NAME_LONG "rbd (rados block device)"
-#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
+#define RBD_MINORS_PER_MAJOR 256
+#define RBD_SINGLE_MAJOR_PART_SHIFT 4
#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
#define RBD_MAX_SNAP_NAME_LEN \
@@ -323,6 +324,7 @@ struct rbd_device {
int dev_id; /* blkdev unique id */
int major; /* blkdev assigned major */
+ int minor;
struct gendisk *disk; /* blkdev's gendisk and rq */
u32 image_format; /* Either 1 or 2 */
@@ -386,6 +388,17 @@ static struct kmem_cache *rbd_img_request_cache;
static struct kmem_cache *rbd_obj_request_cache;
static struct kmem_cache *rbd_segment_name_cache;
+static int rbd_major;
+static DEFINE_IDA(rbd_dev_id_ida);
+
+/*
+ * Default to false for now, as single-major requires >= 0.75 version of
+ * userspace rbd utility.
+ */
+static bool single_major = false;
+module_param(single_major, bool, S_IRUGO);
+MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
+
static int rbd_img_request_submit(struct rbd_img_request *img_request);
static void rbd_dev_device_release(struct device *dev);
@@ -394,18 +407,56 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf,
size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
size_t count);
+static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
+ size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
static void rbd_spec_put(struct rbd_spec *spec);
-static struct bus_attribute rbd_bus_attrs[] = {
- __ATTR(add, S_IWUSR, NULL, rbd_add),
- __ATTR(remove, S_IWUSR, NULL, rbd_remove),
- __ATTR_NULL
+static int rbd_dev_id_to_minor(int dev_id)
+{
+ return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
+}
+
+static int minor_to_rbd_dev_id(int minor)
+{
+ return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
+}
+
+static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
+static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
+static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
+static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
+
+static struct attribute *rbd_bus_attrs[] = {
+ &bus_attr_add.attr,
+ &bus_attr_remove.attr,
+ &bus_attr_add_single_major.attr,
+ &bus_attr_remove_single_major.attr,
+ NULL,
+};
+
+static umode_t rbd_bus_is_visible(struct kobject *kobj,
+ struct attribute *attr, int index)
+{
+ if (!single_major &&
+ (attr == &bus_attr_add_single_major.attr ||
+ attr == &bus_attr_remove_single_major.attr))
+ return 0;
+
+ return attr->mode;
+}
+
+static const struct attribute_group rbd_bus_group = {
+ .attrs = rbd_bus_attrs,
+ .is_visible = rbd_bus_is_visible,
};
+__ATTRIBUTE_GROUPS(rbd_bus);
static struct bus_type rbd_bus_type = {
.name = "rbd",
- .bus_attrs = rbd_bus_attrs,
+ .bus_groups = rbd_bus_groups,
};
static void rbd_root_dev_release(struct device *dev)
@@ -490,7 +541,6 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
return -ENOENT;
(void) get_device(&rbd_dev->dev);
- set_device_ro(bdev, rbd_dev->mapping.read_only);
return 0;
}
@@ -508,10 +558,76 @@ static void rbd_release(struct gendisk *disk, fmode_t mode)
put_device(&rbd_dev->dev);
}
+static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
+{
+ int ret = 0;
+ int val;
+ bool ro;
+ bool ro_changed = false;
+
+ /* get_user() may sleep, so call it before taking rbd_dev->lock */
+ if (get_user(val, (int __user *)(arg)))
+ return -EFAULT;
+
+ ro = val ? true : false;
+ /* Snapshot doesn't allow to write*/
+ if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
+ return -EROFS;
+
+ spin_lock_irq(&rbd_dev->lock);
+ /* prevent others open this device */
+ if (rbd_dev->open_count > 1) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (rbd_dev->mapping.read_only != ro) {
+ rbd_dev->mapping.read_only = ro;
+ ro_changed = true;
+ }
+
+out:
+ spin_unlock_irq(&rbd_dev->lock);
+ /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
+ if (ret == 0 && ro_changed)
+ set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
+
+ return ret;
+}
+
+static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+ int ret = 0;
+
+ switch (cmd) {
+ case BLKROSET:
+ ret = rbd_ioctl_set_ro(rbd_dev, arg);
+ break;
+ default:
+ ret = -ENOTTY;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ return rbd_ioctl(bdev, mode, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
static const struct block_device_operations rbd_bd_ops = {
.owner = THIS_MODULE,
.open = rbd_open,
.release = rbd_release,
+ .ioctl = rbd_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = rbd_compat_ioctl,
+#endif
};
/*
@@ -927,12 +1043,14 @@ static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
u64 snap_id)
{
u32 which;
+ const char *snap_name;
which = rbd_dev_snap_index(rbd_dev, snap_id);
if (which == BAD_SNAP_INDEX)
- return NULL;
+ return ERR_PTR(-ENOENT);
- return _rbd_dev_v1_snap_name(rbd_dev, which);
+ snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
+ return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}
static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
@@ -1035,9 +1153,9 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
name_format = "%s.%012llx";
if (rbd_dev->image_format == 2)
name_format = "%s.%016llx";
- ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format,
+ ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
rbd_dev->header.object_prefix, segment);
- if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
+ if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
pr_err("error formatting segment name for #%llu (%d)\n",
segment, ret);
kfree(name);
@@ -1103,23 +1221,23 @@ static void bio_chain_put(struct bio *chain)
*/
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
- struct bio_vec *bv;
+ struct bio_vec bv;
+ struct bvec_iter iter;
unsigned long flags;
void *buf;
- int i;
int pos = 0;
while (chain) {
- bio_for_each_segment(bv, chain, i) {
- if (pos + bv->bv_len > start_ofs) {
+ bio_for_each_segment(bv, chain, iter) {
+ if (pos + bv.bv_len > start_ofs) {
int remainder = max(start_ofs - pos, 0);
- buf = bvec_kmap_irq(bv, &flags);
+ buf = bvec_kmap_irq(&bv, &flags);
memset(buf + remainder, 0,
- bv->bv_len - remainder);
- flush_dcache_page(bv->bv_page);
+ bv.bv_len - remainder);
+ flush_dcache_page(bv.bv_page);
bvec_kunmap_irq(buf, &flags);
}
- pos += bv->bv_len;
+ pos += bv.bv_len;
}
chain = chain->bi_next;
@@ -1167,74 +1285,14 @@ static struct bio *bio_clone_range(struct bio *bio_src,
unsigned int len,
gfp_t gfpmask)
{
- struct bio_vec *bv;
- unsigned int resid;
- unsigned short idx;
- unsigned int voff;
- unsigned short end_idx;
- unsigned short vcnt;
struct bio *bio;
- /* Handle the easy case for the caller */
-
- if (!offset && len == bio_src->bi_size)
- return bio_clone(bio_src, gfpmask);
-
- if (WARN_ON_ONCE(!len))
- return NULL;
- if (WARN_ON_ONCE(len > bio_src->bi_size))
- return NULL;
- if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
- return NULL;
-
- /* Find first affected segment... */
-
- resid = offset;
- bio_for_each_segment(bv, bio_src, idx) {
- if (resid < bv->bv_len)
- break;
- resid -= bv->bv_len;
- }
- voff = resid;
-
- /* ...and the last affected segment */
-
- resid += len;
- __bio_for_each_segment(bv, bio_src, end_idx, idx) {
- if (resid <= bv->bv_len)
- break;
- resid -= bv->bv_len;
- }
- vcnt = end_idx - idx + 1;
-
- /* Build the clone */
-
- bio = bio_alloc(gfpmask, (unsigned int) vcnt);
+ bio = bio_clone(bio_src, gfpmask);
if (!bio)
return NULL; /* ENOMEM */
- bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
- bio->bi_rw = bio_src->bi_rw;
- bio->bi_flags |= 1 << BIO_CLONED;
-
- /*
- * Copy over our part of the bio_vec, then update the first
- * and last (or only) entries.
- */
- memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
- vcnt * sizeof (struct bio_vec));
- bio->bi_io_vec[0].bv_offset += voff;
- if (vcnt > 1) {
- bio->bi_io_vec[0].bv_len -= voff;
- bio->bi_io_vec[vcnt - 1].bv_len = resid;
- } else {
- bio->bi_io_vec[0].bv_len = len;
- }
-
- bio->bi_vcnt = vcnt;
- bio->bi_size = len;
- bio->bi_idx = 0;
+ bio_advance(bio, offset);
+ bio->bi_iter.bi_size = len;
return bio;
}
@@ -1265,7 +1323,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
/* Build up a chain of clone bios up to the limit */
- if (!bi || off >= bi->bi_size || !len)
+ if (!bi || off >= bi->bi_iter.bi_size || !len)
return NULL; /* Nothing to clone */
end = &chain;
@@ -1277,7 +1335,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
rbd_warn(NULL, "bio_chain exhausted with %u left", len);
goto out_err; /* EINVAL; ran out of bio's */
}
- bi_size = min_t(unsigned int, bi->bi_size - off, len);
+ bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
bio = bio_clone_range(bi, off, bi_size, gfpmask);
if (!bio)
goto out_err; /* ENOMEM */
@@ -1286,7 +1344,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
end = &bio->bi_next;
off += bi_size;
- if (off == bi->bi_size) {
+ if (off == bi->bi_iter.bi_size) {
bi = bi->bi_next;
off = 0;
}
@@ -1373,6 +1431,14 @@ static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
+static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
+{
+ struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
+
+ return obj_request->img_offset <
+ round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
+}
+
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
dout("%s: obj %p (was %d)\n", __func__, obj_request,
@@ -1389,6 +1455,13 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
+static void rbd_img_request_get(struct rbd_img_request *img_request)
+{
+ dout("%s: img %p (was %d)\n", __func__, img_request,
+ atomic_read(&img_request->kref.refcount));
+ kref_get(&img_request->kref);
+}
+
static bool img_request_child_test(struct rbd_img_request *img_request);
static void rbd_parent_request_destroy(struct kref *kref);
static void rbd_img_request_destroy(struct kref *kref);
@@ -1557,11 +1630,12 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
obj_request, obj_request->img_request, obj_request->result,
xferred, length);
/*
- * ENOENT means a hole in the image. We zero-fill the
- * entire length of the request. A short read also implies
- * zero-fill to the end of the request. Either way we
- * update the xferred count to indicate the whole request
- * was satisfied.
+ * ENOENT means a hole in the image. We zero-fill the entire
+ * length of the request. A short read also implies zero-fill
+ * to the end of the request. An error requires the whole
+ * length of the request to be reported finished with an error
+ * to the block layer. In each case we update the xferred
+ * count to indicate the whole request was satisfied.
*/
rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
if (obj_request->result == -ENOENT) {
@@ -1570,14 +1644,13 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
else
zero_pages(obj_request->pages, 0, length);
obj_request->result = 0;
- obj_request->xferred = length;
} else if (xferred < length && !obj_request->result) {
if (obj_request->type == OBJ_REQUEST_BIO)
zero_bio_chain(obj_request->bio_list, xferred);
else
zero_pages(obj_request->pages, xferred, length);
- obj_request->xferred = length;
}
+ obj_request->xferred = length;
obj_request_done_set(obj_request);
}
@@ -1661,7 +1734,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
if (osd_req->r_result < 0)
obj_request->result = osd_req->r_result;
- BUG_ON(osd_req->r_num_ops > 2);
+ rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
/*
* We support a 64-bit length, but ultimately it has to be
@@ -1669,11 +1742,15 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
*/
obj_request->xferred = osd_req->r_reply_op_len[0];
rbd_assert(obj_request->xferred < (u64)UINT_MAX);
+
opcode = osd_req->r_ops[0].op;
switch (opcode) {
case CEPH_OSD_OP_READ:
rbd_osd_read_callback(obj_request);
break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
+ /* fall through */
case CEPH_OSD_OP_WRITE:
rbd_osd_write_callback(obj_request);
break;
@@ -1722,9 +1799,16 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
snapc, CEPH_NOSNAP, &mtime);
}
+/*
+ * Create an osd request. A read request has one osd op (read).
+ * A write request has either one (watch) or two (hint+write) osd ops.
+ * (All rbd data writes are prefixed with an allocation hint op, but
+ * technically osd watch is a write request, hence this distinction.)
+ */
static struct ceph_osd_request *rbd_osd_req_create(
struct rbd_device *rbd_dev,
bool write_request,
+ unsigned int num_ops,
struct rbd_obj_request *obj_request)
{
struct ceph_snap_context *snapc = NULL;
@@ -1740,10 +1824,13 @@ static struct ceph_osd_request *rbd_osd_req_create(
snapc = img_request->snapc;
}
- /* Allocate and initialize the request, for the single op */
+ rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
+
+ /* Allocate and initialize the request, for the num_ops ops */
osdc = &rbd_dev->rbd_client->client->osdc;
- osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
+ osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
+ GFP_ATOMIC);
if (!osd_req)
return NULL; /* ENOMEM */
@@ -1755,19 +1842,16 @@ static struct ceph_osd_request *rbd_osd_req_create(
osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request;
- osd_req->r_oid_len = strlen(obj_request->object_name);
- rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
- memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
-
- osd_req->r_file_layout = rbd_dev->layout; /* struct */
+ osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
return osd_req;
}
/*
* Create a copyup osd request based on the information in the
- * object request supplied. A copyup request has two osd ops,
- * a copyup method call, and a "normal" write request.
+ * object request supplied. A copyup request has three osd ops,
+ * a copyup method call, a hint op, and a write op.
*/
static struct ceph_osd_request *
rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
@@ -1783,12 +1867,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
rbd_assert(img_request);
rbd_assert(img_request_write_test(img_request));
- /* Allocate and initialize the request, for the two ops */
+ /* Allocate and initialize the request, for the three ops */
snapc = img_request->snapc;
rbd_dev = img_request->rbd_dev;
osdc = &rbd_dev->rbd_client->client->osdc;
- osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
+ osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
if (!osd_req)
return NULL; /* ENOMEM */
@@ -1796,11 +1880,8 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request;
- osd_req->r_oid_len = strlen(obj_request->object_name);
- rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
- memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
-
- osd_req->r_file_layout = rbd_dev->layout; /* struct */
+ osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
return osd_req;
}
@@ -2122,7 +2203,6 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
rbd_assert(img_request->obj_request_count > 0);
rbd_assert(which != BAD_WHICH);
rbd_assert(which < img_request->obj_request_count);
- rbd_assert(which >= img_request->next_completion);
spin_lock_irq(&img_request->completion_lock);
if (which != img_request->next_completion)
@@ -2142,6 +2222,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
img_request->next_completion = which;
out:
spin_unlock_irq(&img_request->completion_lock);
+ rbd_img_request_put(img_request);
if (!more)
rbd_img_request_complete(img_request);
@@ -2163,9 +2244,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
struct rbd_obj_request *obj_request = NULL;
struct rbd_obj_request *next_obj_request;
bool write_request = img_request_write_test(img_request);
- struct bio *bio_list = 0;
+ struct bio *bio_list = NULL;
unsigned int bio_offset = 0;
- struct page **pages = 0;
+ struct page **pages = NULL;
u64 img_offset;
u64 resid;
u16 opcode;
@@ -2180,7 +2261,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
if (type == OBJ_REQUEST_BIO) {
bio_list = data_desc;
- rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
+ rbd_assert(img_offset ==
+ bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
} else {
rbd_assert(type == OBJ_REQUEST_PAGES);
pages = data_desc;
@@ -2191,6 +2273,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
const char *object_name;
u64 offset;
u64 length;
+ unsigned int which = 0;
object_name = rbd_segment_name(rbd_dev, img_offset);
if (!object_name)
@@ -2204,6 +2287,12 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
if (!obj_request)
goto out_unwind;
+ /*
+ * set obj_request->img_request before creating the
+ * osd_request so that it gets the right snapc
+ */
+ rbd_img_obj_request_add(img_request, obj_request);
+
if (type == OBJ_REQUEST_BIO) {
unsigned int clone_size;
@@ -2215,7 +2304,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
clone_size,
GFP_ATOMIC);
if (!obj_request->bio_list)
- goto out_partial;
+ goto out_unwind;
} else {
unsigned int page_count;
@@ -2228,27 +2317,31 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
}
osd_req = rbd_osd_req_create(rbd_dev, write_request,
- obj_request);
+ (write_request ? 2 : 1),
+ obj_request);
if (!osd_req)
- goto out_partial;
+ goto out_unwind;
obj_request->osd_req = osd_req;
obj_request->callback = rbd_img_obj_callback;
+ rbd_img_request_get(img_request);
+
+ if (write_request) {
+ osd_req_op_alloc_hint_init(osd_req, which,
+ rbd_obj_bytes(&rbd_dev->header),
+ rbd_obj_bytes(&rbd_dev->header));
+ which++;
+ }
- osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
- 0, 0);
+ osd_req_op_extent_init(osd_req, which, opcode, offset, length,
+ 0, 0);
if (type == OBJ_REQUEST_BIO)
- osd_req_op_extent_osd_data_bio(osd_req, 0,
+ osd_req_op_extent_osd_data_bio(osd_req, which,
obj_request->bio_list, length);
else
- osd_req_op_extent_osd_data_pages(osd_req, 0,
+ osd_req_op_extent_osd_data_pages(osd_req, which,
obj_request->pages, length,
offset & ~PAGE_MASK, false, false);
- /*
- * set obj_request->img_request before formatting
- * the osd_request so that it gets the right snapc
- */
- rbd_img_obj_request_add(img_request, obj_request);
if (write_request)
rbd_osd_req_format_write(obj_request);
else
@@ -2262,11 +2355,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
return 0;
-out_partial:
- rbd_obj_request_put(obj_request);
out_unwind:
for_each_obj_request_safe(img_request, obj_request, next_obj_request)
- rbd_obj_request_put(obj_request);
+ rbd_img_obj_request_del(img_request, obj_request);
return -ENOMEM;
}
@@ -2366,7 +2457,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
/*
* The original osd request is of no use to use any more.
- * We need a new one that can hold the two ops in a copyup
+ * We need a new one that can hold the three ops in a copyup
* request. Allocate the new copyup osd request for the
* original request, and release the old one.
*/
@@ -2385,17 +2476,22 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
false, false);
- /* Then the original write request op */
+ /* Then the hint op */
+
+ osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
+ rbd_obj_bytes(&rbd_dev->header));
+
+ /* And the original write request op */
offset = orig_request->offset;
length = orig_request->length;
- osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
+ osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
offset, length, 0, 0);
if (orig_request->type == OBJ_REQUEST_BIO)
- osd_req_op_extent_osd_data_bio(osd_req, 1,
+ osd_req_op_extent_osd_data_bio(osd_req, 2,
orig_request->bio_list, length);
else
- osd_req_op_extent_osd_data_pages(osd_req, 1,
+ osd_req_op_extent_osd_data_pages(osd_req, 2,
orig_request->pages, length,
offset & ~PAGE_MASK, false, false);
@@ -2616,8 +2712,8 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
rbd_assert(obj_request->img_request);
rbd_dev = obj_request->img_request->rbd_dev;
- stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
- stat_request);
+ stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+ stat_request);
if (!stat_request->osd_req)
goto out;
stat_request->callback = rbd_img_obj_exists_callback;
@@ -2660,7 +2756,7 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
*/
if (!img_request_write_test(img_request) ||
!img_request_layered_test(img_request) ||
- rbd_dev->parent_overlap <= obj_request->img_offset ||
+ !obj_request_overlaps_parent(obj_request) ||
((known = obj_request_known_test(obj_request)) &&
obj_request_exists_test(obj_request))) {
@@ -2808,7 +2904,7 @@ out_err:
obj_request_done_set(obj_request);
}
-static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
+static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
{
struct rbd_obj_request *obj_request;
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
@@ -2820,19 +2916,21 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
return -ENOMEM;
ret = -ENOMEM;
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+ obj_request);
if (!obj_request->osd_req)
goto out;
- obj_request->callback = rbd_obj_request_put;
osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
notify_id, 0, 0);
rbd_osd_req_format_read(obj_request);
ret = rbd_obj_request_submit(osdc, obj_request);
-out:
if (ret)
- rbd_obj_request_put(obj_request);
+ goto out;
+ ret = rbd_obj_request_wait(obj_request);
+out:
+ rbd_obj_request_put(obj_request);
return ret;
}
@@ -2852,59 +2950,59 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
if (ret)
rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
- rbd_obj_notify_ack(rbd_dev, notify_id);
+ rbd_obj_notify_ack_sync(rbd_dev, notify_id);
}
/*
- * Request sync osd watch/unwatch. The value of "start" determines
- * whether a watch request is being initiated or torn down.
+ * Initiate a watch request, synchronously.
*/
-static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
{
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct rbd_obj_request *obj_request;
int ret;
- rbd_assert(start ^ !!rbd_dev->watch_event);
- rbd_assert(start ^ !!rbd_dev->watch_request);
+ rbd_assert(!rbd_dev->watch_event);
+ rbd_assert(!rbd_dev->watch_request);
- if (start) {
- ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
- &rbd_dev->watch_event);
- if (ret < 0)
- return ret;
- rbd_assert(rbd_dev->watch_event != NULL);
- }
+ ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
+ &rbd_dev->watch_event);
+ if (ret < 0)
+ return ret;
+
+ rbd_assert(rbd_dev->watch_event);
- ret = -ENOMEM;
obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
- OBJ_REQUEST_NODATA);
- if (!obj_request)
+ OBJ_REQUEST_NODATA);
+ if (!obj_request) {
+ ret = -ENOMEM;
goto out_cancel;
+ }
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
- if (!obj_request->osd_req)
- goto out_cancel;
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
+ obj_request);
+ if (!obj_request->osd_req) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
- if (start)
- ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
- else
- ceph_osdc_unregister_linger_request(osdc,
- rbd_dev->watch_request->osd_req);
+ ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
- rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
+ rbd_dev->watch_event->cookie, 0, 1);
rbd_osd_req_format_write(obj_request);
ret = rbd_obj_request_submit(osdc, obj_request);
if (ret)
- goto out_cancel;
+ goto out_linger;
+
ret = rbd_obj_request_wait(obj_request);
if (ret)
- goto out_cancel;
+ goto out_linger;
+
ret = obj_request->result;
if (ret)
- goto out_cancel;
+ goto out_linger;
/*
* A watch request is set to linger, so the underlying osd
@@ -2914,26 +3012,90 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
* it. We'll drop that reference (below) after we've
* unregistered it.
*/
- if (start) {
- rbd_dev->watch_request = obj_request;
+ rbd_dev->watch_request = obj_request;
- return 0;
+ return 0;
+
+out_linger:
+ ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req);
+out_put:
+ rbd_obj_request_put(obj_request);
+out_cancel:
+ ceph_osdc_cancel_event(rbd_dev->watch_event);
+ rbd_dev->watch_event = NULL;
+
+ return ret;
+}
+
+/*
+ * Tear down a watch request, synchronously.
+ */
+static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_obj_request *obj_request;
+ int ret;
+
+ rbd_assert(rbd_dev->watch_event);
+ rbd_assert(rbd_dev->watch_request);
+
+ obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+ OBJ_REQUEST_NODATA);
+ if (!obj_request) {
+ ret = -ENOMEM;
+ goto out_cancel;
}
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
+ obj_request);
+ if (!obj_request->osd_req) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
+ rbd_dev->watch_event->cookie, 0, 0);
+ rbd_osd_req_format_write(obj_request);
+
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ goto out_put;
+
+ ret = rbd_obj_request_wait(obj_request);
+ if (ret)
+ goto out_put;
+
+ ret = obj_request->result;
+ if (ret)
+ goto out_put;
+
/* We have successfully torn down the watch request */
+ ceph_osdc_unregister_linger_request(osdc,
+ rbd_dev->watch_request->osd_req);
rbd_obj_request_put(rbd_dev->watch_request);
rbd_dev->watch_request = NULL;
+
+out_put:
+ rbd_obj_request_put(obj_request);
out_cancel:
- /* Cancel the event if we're tearing down, or on error */
ceph_osdc_cancel_event(rbd_dev->watch_event);
rbd_dev->watch_event = NULL;
- if (obj_request)
- rbd_obj_request_put(obj_request);
return ret;
}
+static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ ret = __rbd_dev_header_unwatch_sync(rbd_dev);
+ if (ret) {
+ rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
+ ret);
+ }
+}
+
/*
* Synchronous osd object method call. Returns the number of bytes
* returned in the outbound buffer, or a negative error code.
@@ -2974,7 +3136,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
obj_request->pages = pages;
obj_request->page_count = page_count;
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+ obj_request);
if (!obj_request->osd_req)
goto out;
@@ -3024,7 +3187,6 @@ static void rbd_request_fn(struct request_queue *q)
__releases(q->queue_lock) __acquires(q->queue_lock)
{
struct rbd_device *rbd_dev = q->queuedata;
- bool read_only = rbd_dev->mapping.read_only;
struct request *rq;
int result;
@@ -3060,7 +3222,7 @@ static void rbd_request_fn(struct request_queue *q)
if (write_request) {
result = -EROFS;
- if (read_only)
+ if (rbd_dev->mapping.read_only)
goto end_request;
rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
}
@@ -3207,7 +3369,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
obj_request->pages = pages;
obj_request->page_count = page_count;
- obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
+ obj_request);
if (!obj_request->osd_req)
goto out;
@@ -3324,6 +3487,31 @@ static void rbd_exists_validate(struct rbd_device *rbd_dev)
clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}
+static void rbd_dev_update_size(struct rbd_device *rbd_dev)
+{
+ sector_t size;
+ bool removing;
+
+ /*
+ * Don't hold the lock while doing disk operations,
+ * or lock ordering will conflict with the bdev mutex via:
+ * rbd_add() -> blkdev_get() -> rbd_open()
+ */
+ spin_lock_irq(&rbd_dev->lock);
+ removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
+ spin_unlock_irq(&rbd_dev->lock);
+ /*
+ * If the device is being removed, rbd_dev->disk has
+ * been destroyed, so don't try to update its size
+ */
+ if (!removing) {
+ size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
+ dout("setting size to %llu sectors", (unsigned long long)size);
+ set_capacity(rbd_dev->disk, size);
+ revalidate_disk(rbd_dev->disk);
+ }
+}
+
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
u64 mapping_size;
@@ -3343,12 +3531,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
up_write(&rbd_dev->header_rwsem);
if (mapping_size != rbd_dev->mapping.size) {
- sector_t size;
-
- size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
- dout("setting size to %llu sectors", (unsigned long long)size);
- set_capacity(rbd_dev->disk, size);
- revalidate_disk(rbd_dev->disk);
+ rbd_dev_update_size(rbd_dev);
}
return ret;
@@ -3361,14 +3544,18 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
u64 segment_size;
/* create gendisk info */
- disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+ disk = alloc_disk(single_major ?
+ (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
+ RBD_MINORS_PER_MAJOR);
if (!disk)
return -ENOMEM;
snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
rbd_dev->dev_id);
disk->major = rbd_dev->major;
- disk->first_minor = 0;
+ disk->first_minor = rbd_dev->minor;
+ if (single_major)
+ disk->flags |= GENHD_FL_EXT_DEVT;
disk->fops = &rbd_bd_ops;
disk->private_data = rbd_dev;
@@ -3440,7 +3627,14 @@ static ssize_t rbd_major_show(struct device *dev,
return sprintf(buf, "%d\n", rbd_dev->major);
return sprintf(buf, "(none)\n");
+}
+static ssize_t rbd_minor_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%d\n", rbd_dev->minor);
}
static ssize_t rbd_client_id_show(struct device *dev,
@@ -3562,6 +3756,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
+static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
@@ -3575,6 +3770,7 @@ static struct attribute *rbd_attrs[] = {
&dev_attr_size.attr,
&dev_attr_features.attr,
&dev_attr_major.attr,
+ &dev_attr_minor.attr,
&dev_attr_client_id.attr,
&dev_attr_pool.attr,
&dev_attr_pool_id.attr,
@@ -3702,12 +3898,14 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
if (ret < sizeof (size_buf))
return -ERANGE;
- if (order)
+ if (order) {
*order = size_buf.order;
+ dout(" order %u", (unsigned int)*order);
+ }
*snap_size = le64_to_cpu(size_buf.size);
- dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
- (unsigned long long)snap_id, (unsigned int)*order,
+ dout(" snap_id 0x%016llx snap_size = %llu\n",
+ (unsigned long long)snap_id,
(unsigned long long)*snap_size);
return 0;
@@ -4055,8 +4253,13 @@ static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
snap_id = snapc->snaps[which];
snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
- if (IS_ERR(snap_name))
- break;
+ if (IS_ERR(snap_name)) {
+ /* ignore no-longer existing snapshots */
+ if (PTR_ERR(snap_name) == -ENOENT)
+ continue;
+ else
+ break;
+ }
found = !strcmp(name, snap_name);
kfree(snap_name);
}
@@ -4135,8 +4338,8 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
/* Look up the snapshot name, and make a copy */
snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
- if (!snap_name) {
- ret = -ENOMEM;
+ if (IS_ERR(snap_name)) {
+ ret = PTR_ERR(snap_name);
goto out_err;
}
@@ -4338,21 +4541,29 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
device_unregister(&rbd_dev->dev);
}
-static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
-
/*
* Get a unique rbd identifier for the given new rbd_dev, and add
- * the rbd_dev to the global list. The minimum rbd id is 1.
+ * the rbd_dev to the global list.
*/
-static void rbd_dev_id_get(struct rbd_device *rbd_dev)
+static int rbd_dev_id_get(struct rbd_device *rbd_dev)
{
- rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
+ int new_dev_id;
+
+ new_dev_id = ida_simple_get(&rbd_dev_id_ida,
+ 0, minor_to_rbd_dev_id(1 << MINORBITS),
+ GFP_KERNEL);
+ if (new_dev_id < 0)
+ return new_dev_id;
+
+ rbd_dev->dev_id = new_dev_id;
spin_lock(&rbd_dev_list_lock);
list_add_tail(&rbd_dev->node, &rbd_dev_list);
spin_unlock(&rbd_dev_list_lock);
- dout("rbd_dev %p given dev id %llu\n", rbd_dev,
- (unsigned long long) rbd_dev->dev_id);
+
+ dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
+
+ return 0;
}
/*
@@ -4361,49 +4572,13 @@ static void rbd_dev_id_get(struct rbd_device *rbd_dev)
*/
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
- struct list_head *tmp;
- int rbd_id = rbd_dev->dev_id;
- int max_id;
-
- rbd_assert(rbd_id > 0);
-
- dout("rbd_dev %p released dev id %llu\n", rbd_dev,
- (unsigned long long) rbd_dev->dev_id);
spin_lock(&rbd_dev_list_lock);
list_del_init(&rbd_dev->node);
-
- /*
- * If the id being "put" is not the current maximum, there
- * is nothing special we need to do.
- */
- if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
- spin_unlock(&rbd_dev_list_lock);
- return;
- }
-
- /*
- * We need to update the current maximum id. Search the
- * list to find out what it is. We're more likely to find
- * the maximum at the end, so search the list backward.
- */
- max_id = 0;
- list_for_each_prev(tmp, &rbd_dev_list) {
- struct rbd_device *rbd_dev;
-
- rbd_dev = list_entry(tmp, struct rbd_device, node);
- if (rbd_dev->dev_id > max_id)
- max_id = rbd_dev->dev_id;
- }
spin_unlock(&rbd_dev_list_lock);
- /*
- * The max id could have been updated by rbd_dev_id_get(), in
- * which case it now accurately reflects the new maximum.
- * Be careful not to overwrite the maximum value in that
- * case.
- */
- atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
- dout(" max dev id has been reset\n");
+ ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
+
+ dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
}
/*
@@ -4636,6 +4811,38 @@ out_err:
}
/*
+ * Return pool id (>= 0) or a negative error code.
+ */
+static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
+{
+ u64 newest_epoch;
+ unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
+ int tries = 0;
+ int ret;
+
+again:
+ ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
+ if (ret == -ENOENT && tries++ < 1) {
+ ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
+ &newest_epoch);
+ if (ret < 0)
+ return ret;
+
+ if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
+ ceph_monc_request_next_osdmap(&rbdc->client->monc);
+ (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
+ newest_epoch, timeout);
+ goto again;
+ } else {
+ /* the osdmap we have is new enough */
+ return -ENOENT;
+ }
+ }
+
+ return ret;
+}
+
+/*
* An rbd format 2 image has a unique identifier, distinct from the
* name given to it by the user. Internally, that identifier is
* what's used to specify the names of objects related to the image.
@@ -4705,7 +4912,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
image_id = ceph_extract_encoded_string(&p, p + ret,
NULL, GFP_NOIO);
- ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
+ ret = PTR_ERR_OR_ZERO(image_id);
if (!ret)
rbd_dev->image_format = 2;
} else {
@@ -4826,20 +5033,29 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
int ret;
- /* generate unique id: find highest unique id, add one */
- rbd_dev_id_get(rbd_dev);
+ /* Get an id and fill in device name. */
+
+ ret = rbd_dev_id_get(rbd_dev);
+ if (ret)
+ return ret;
- /* Fill in the device name, now that we have its id. */
BUILD_BUG_ON(DEV_NAME_LEN
< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
- /* Get our block major device number. */
+ /* Record our major and minor device numbers. */
- ret = register_blkdev(0, rbd_dev->name);
- if (ret < 0)
- goto err_out_id;
- rbd_dev->major = ret;
+ if (!single_major) {
+ ret = register_blkdev(0, rbd_dev->name);
+ if (ret < 0)
+ goto err_out_id;
+
+ rbd_dev->major = ret;
+ rbd_dev->minor = 0;
+ } else {
+ rbd_dev->major = rbd_major;
+ rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
+ }
/* Set up the blkdev mapping. */
@@ -4851,6 +5067,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
if (ret)
goto err_out_disk;
set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
+ set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
ret = rbd_bus_add_dev(rbd_dev);
if (ret)
@@ -4871,7 +5088,8 @@ err_out_mapping:
err_out_disk:
rbd_free_disk(rbd_dev);
err_out_blkdev:
- unregister_blkdev(rbd_dev->major, rbd_dev->name);
+ if (!single_major)
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
rbd_dev_id_put(rbd_dev);
rbd_dev_mapping_clear(rbd_dev);
@@ -4927,7 +5145,6 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev)
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
{
int ret;
- int tmp;
/*
* Get the id from the image id object. Unless there's an
@@ -4946,7 +5163,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
goto err_out_format;
if (mapping) {
- ret = rbd_dev_header_watch_sync(rbd_dev, true);
+ ret = rbd_dev_header_watch_sync(rbd_dev);
if (ret)
goto out_header_name;
}
@@ -4973,12 +5190,8 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
err_out_probe:
rbd_dev_unprobe(rbd_dev);
err_out_watch:
- if (mapping) {
- tmp = rbd_dev_header_watch_sync(rbd_dev, false);
- if (tmp)
- rbd_warn(rbd_dev, "unable to tear down "
- "watch request (%d)\n", tmp);
- }
+ if (mapping)
+ rbd_dev_header_unwatch_sync(rbd_dev);
out_header_name:
kfree(rbd_dev->header_name);
rbd_dev->header_name = NULL;
@@ -4992,16 +5205,15 @@ err_out_format:
return ret;
}
-static ssize_t rbd_add(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t do_rbd_add(struct bus_type *bus,
+ const char *buf,
+ size_t count)
{
struct rbd_device *rbd_dev = NULL;
struct ceph_options *ceph_opts = NULL;
struct rbd_options *rbd_opts = NULL;
struct rbd_spec *spec = NULL;
struct rbd_client *rbdc;
- struct ceph_osd_client *osdc;
bool read_only;
int rc = -ENOMEM;
@@ -5023,8 +5235,7 @@ static ssize_t rbd_add(struct bus_type *bus,
}
/* pick the pool */
- osdc = &rbdc->client->osdc;
- rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
+ rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
if (rc < 0)
goto err_out_client;
spec->pool_id = (u64)rc;
@@ -5056,6 +5267,12 @@ static ssize_t rbd_add(struct bus_type *bus,
rc = rbd_dev_device_setup(rbd_dev);
if (rc) {
+ /*
+ * rbd_dev_header_unwatch_sync() can't be moved into
+ * rbd_dev_image_release() without refactoring, see
+ * commit 1f3ef78861ac.
+ */
+ rbd_dev_header_unwatch_sync(rbd_dev);
rbd_dev_image_release(rbd_dev);
goto err_out_module;
}
@@ -5076,6 +5293,23 @@ err_out_module:
return (ssize_t)rc;
}
+static ssize_t rbd_add(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ if (single_major)
+ return -EINVAL;
+
+ return do_rbd_add(bus, buf, count);
+}
+
+static ssize_t rbd_add_single_major(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ return do_rbd_add(bus, buf, count);
+}
+
static void rbd_dev_device_release(struct device *dev)
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
@@ -5083,8 +5317,8 @@ static void rbd_dev_device_release(struct device *dev)
rbd_free_disk(rbd_dev);
clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
rbd_dev_mapping_clear(rbd_dev);
- unregister_blkdev(rbd_dev->major, rbd_dev->name);
- rbd_dev->major = 0;
+ if (!single_major)
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
rbd_dev_id_put(rbd_dev);
rbd_dev_mapping_clear(rbd_dev);
}
@@ -5115,9 +5349,9 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
}
}
-static ssize_t rbd_remove(struct bus_type *bus,
- const char *buf,
- size_t count)
+static ssize_t do_rbd_remove(struct bus_type *bus,
+ const char *buf,
+ size_t count)
{
struct rbd_device *rbd_dev = NULL;
struct list_head *tmp;
@@ -5126,7 +5360,7 @@ static ssize_t rbd_remove(struct bus_type *bus,
bool already = false;
int ret;
- ret = strict_strtoul(buf, 10, &ul);
+ ret = kstrtoul(buf, 10, &ul);
if (ret)
return ret;
@@ -5157,16 +5391,44 @@ static ssize_t rbd_remove(struct bus_type *bus,
if (ret < 0 || already)
return ret;
+ rbd_dev_header_unwatch_sync(rbd_dev);
+ /*
+ * flush remaining watch callbacks - these must be complete
+ * before the osd_client is shutdown
+ */
+ dout("%s: flushing notifies", __func__);
+ ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
+
+ /*
+ * Don't free anything from rbd_dev->disk until after all
+ * notifies are completely processed. Otherwise
+ * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
+ * in a potential use after free of rbd_dev->disk or rbd_dev.
+ */
rbd_bus_del_dev(rbd_dev);
- ret = rbd_dev_header_watch_sync(rbd_dev, false);
- if (ret)
- rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
rbd_dev_image_release(rbd_dev);
module_put(THIS_MODULE);
return count;
}
+static ssize_t rbd_remove(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ if (single_major)
+ return -EINVAL;
+
+ return do_rbd_remove(bus, buf, count);
+}
+
+static ssize_t rbd_remove_single_major(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ return do_rbd_remove(bus, buf, count);
+}
+
/*
* create control files in sysfs
* /sys/bus/rbd/...
@@ -5212,7 +5474,7 @@ static int rbd_slab_init(void)
rbd_assert(!rbd_segment_name_cache);
rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
- MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
+ CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
if (rbd_segment_name_cache)
return 0;
out_err:
@@ -5248,24 +5510,46 @@ static int __init rbd_init(void)
if (!libceph_compatible(NULL)) {
rbd_warn(NULL, "libceph incompatibility (quitting)");
-
return -EINVAL;
}
+
rc = rbd_slab_init();
if (rc)
return rc;
+
+ if (single_major) {
+ rbd_major = register_blkdev(0, RBD_DRV_NAME);
+ if (rbd_major < 0) {
+ rc = rbd_major;
+ goto err_out_slab;
+ }
+ }
+
rc = rbd_sysfs_init();
if (rc)
- rbd_slab_exit();
+ goto err_out_blkdev;
+
+ if (single_major)
+ pr_info("loaded (major %d)\n", rbd_major);
else
- pr_info("loaded " RBD_DRV_NAME_LONG "\n");
+ pr_info("loaded\n");
+ return 0;
+
+err_out_blkdev:
+ if (single_major)
+ unregister_blkdev(rbd_major, RBD_DRV_NAME);
+err_out_slab:
+ rbd_slab_exit();
return rc;
}
static void __exit rbd_exit(void)
{
+ ida_destroy(&rbd_dev_id_ida);
rbd_sysfs_cleanup();
+ if (single_major)
+ unregister_blkdev(rbd_major, RBD_DRV_NAME);
rbd_slab_exit();
}
@@ -5275,9 +5559,8 @@ module_exit(rbd_exit);
MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
-MODULE_DESCRIPTION("rados block device");
-
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");