diff options
Diffstat (limited to 'drivers/block/rbd.c')
| -rw-r--r-- | drivers/block/rbd.c | 1038 |
1 files changed, 669 insertions, 369 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d6d314027b5..b2c98c1bc03 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -41,6 +41,7 @@ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/idr.h> #include "rbd_types.h" @@ -89,9 +90,9 @@ static int atomic_dec_return_safe(atomic_t *v) } #define RBD_DRV_NAME "rbd" -#define RBD_DRV_NAME_LONG "rbd (rados block device)" -#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ +#define RBD_MINORS_PER_MAJOR 256 +#define RBD_SINGLE_MAJOR_PART_SHIFT 4 #define RBD_SNAP_DEV_NAME_PREFIX "snap_" #define RBD_MAX_SNAP_NAME_LEN \ @@ -323,6 +324,7 @@ struct rbd_device { int dev_id; /* blkdev unique id */ int major; /* blkdev assigned major */ + int minor; struct gendisk *disk; /* blkdev's gendisk and rq */ u32 image_format; /* Either 1 or 2 */ @@ -372,7 +374,7 @@ enum rbd_dev_flags { RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ }; -static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ +static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ static LIST_HEAD(rbd_dev_list); /* devices */ static DEFINE_SPINLOCK(rbd_dev_list_lock); @@ -386,6 +388,17 @@ static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; static struct kmem_cache *rbd_segment_name_cache; +static int rbd_major; +static DEFINE_IDA(rbd_dev_id_ida); + +/* + * Default to false for now, as single-major requires >= 0.75 version of + * userspace rbd utility. + */ +static bool single_major = false; +module_param(single_major, bool, S_IRUGO); +MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); + static int rbd_img_request_submit(struct rbd_img_request *img_request); static void rbd_dev_device_release(struct device *dev); @@ -394,18 +407,56 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); +static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, + size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); static void rbd_spec_put(struct rbd_spec *spec); -static struct bus_attribute rbd_bus_attrs[] = { - __ATTR(add, S_IWUSR, NULL, rbd_add), - __ATTR(remove, S_IWUSR, NULL, rbd_remove), - __ATTR_NULL +static int rbd_dev_id_to_minor(int dev_id) +{ + return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; +} + +static int minor_to_rbd_dev_id(int minor) +{ + return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; +} + +static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); +static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); +static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); +static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); + +static struct attribute *rbd_bus_attrs[] = { + &bus_attr_add.attr, + &bus_attr_remove.attr, + &bus_attr_add_single_major.attr, + &bus_attr_remove_single_major.attr, + NULL, }; +static umode_t rbd_bus_is_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + if (!single_major && + (attr == &bus_attr_add_single_major.attr || + attr == &bus_attr_remove_single_major.attr)) + return 0; + + return attr->mode; +} + +static const struct attribute_group rbd_bus_group = { + .attrs = rbd_bus_attrs, + .is_visible = rbd_bus_is_visible, +}; +__ATTRIBUTE_GROUPS(rbd_bus); + static struct bus_type rbd_bus_type = { .name = "rbd", - .bus_attrs = rbd_bus_attrs, + .bus_groups = rbd_bus_groups, }; static void rbd_root_dev_release(struct device *dev) @@ -489,10 +540,7 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) if (removing) return -ENOENT; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); (void) get_device(&rbd_dev->dev); - set_device_ro(bdev, rbd_dev->mapping.read_only); - mutex_unlock(&ctl_mutex); return 0; } @@ -507,20 +555,84 @@ static void rbd_release(struct gendisk *disk, fmode_t mode) spin_unlock_irq(&rbd_dev->lock); rbd_assert(open_count_before > 0); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); put_device(&rbd_dev->dev); - mutex_unlock(&ctl_mutex); } +static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) +{ + int ret = 0; + int val; + bool ro; + bool ro_changed = false; + + /* get_user() may sleep, so call it before taking rbd_dev->lock */ + if (get_user(val, (int __user *)(arg))) + return -EFAULT; + + ro = val ? true : false; + /* Snapshot doesn't allow to write*/ + if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) + return -EROFS; + + spin_lock_irq(&rbd_dev->lock); + /* prevent others open this device */ + if (rbd_dev->open_count > 1) { + ret = -EBUSY; + goto out; + } + + if (rbd_dev->mapping.read_only != ro) { + rbd_dev->mapping.read_only = ro; + ro_changed = true; + } + +out: + spin_unlock_irq(&rbd_dev->lock); + /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ + if (ret == 0 && ro_changed) + set_disk_ro(rbd_dev->disk, ro ? 1 : 0); + + return ret; +} + +static int rbd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + int ret = 0; + + switch (cmd) { + case BLKROSET: + ret = rbd_ioctl_set_ro(rbd_dev, arg); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return rbd_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + static const struct block_device_operations rbd_bd_ops = { .owner = THIS_MODULE, .open = rbd_open, .release = rbd_release, + .ioctl = rbd_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = rbd_compat_ioctl, +#endif }; /* - * Initialize an rbd client instance. - * We own *ceph_opts. + * Initialize an rbd client instance. Success or not, this function + * consumes ceph_opts. Caller holds client_mutex. */ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) { @@ -535,30 +647,25 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); if (IS_ERR(rbdc->client)) - goto out_mutex; + goto out_rbdc; ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ ret = ceph_open_session(rbdc->client); if (ret < 0) - goto out_err; + goto out_client; spin_lock(&rbd_client_list_lock); list_add_tail(&rbdc->node, &rbd_client_list); spin_unlock(&rbd_client_list_lock); - mutex_unlock(&ctl_mutex); dout("%s: rbdc %p\n", __func__, rbdc); return rbdc; - -out_err: +out_client: ceph_destroy_client(rbdc->client); -out_mutex: - mutex_unlock(&ctl_mutex); +out_rbdc: kfree(rbdc); out_opt: if (ceph_opts) @@ -675,17 +782,20 @@ static int parse_rbd_opts_token(char *c, void *private) /* * Get a ceph client with specific addr and configuration, if one does - * not exist create it. + * not exist create it. Either way, ceph_opts is consumed by this + * function. */ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) { struct rbd_client *rbdc; + mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); rbdc = rbd_client_find(ceph_opts); if (rbdc) /* using an existing client */ ceph_destroy_options(ceph_opts); else rbdc = rbd_client_create(ceph_opts); + mutex_unlock(&client_mutex); return rbdc; } @@ -839,7 +949,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, /* We won't fail any more, fill in the header */ - down_write(&rbd_dev->header_rwsem); if (first_time) { header->object_prefix = object_prefix; header->obj_order = ondisk->options.order; @@ -868,8 +977,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, if (rbd_dev->mapping.size != header->image_size) rbd_dev->mapping.size = header->image_size; - up_write(&rbd_dev->header_rwsem); - return 0; out_2big: ret = -EIO; @@ -936,12 +1043,14 @@ static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u64 snap_id) { u32 which; + const char *snap_name; which = rbd_dev_snap_index(rbd_dev, snap_id); if (which == BAD_SNAP_INDEX) - return NULL; + return ERR_PTR(-ENOENT); - return _rbd_dev_v1_snap_name(rbd_dev, which); + snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); + return snap_name ? snap_name : ERR_PTR(-ENOMEM); } static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) @@ -1035,14 +1144,18 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) char *name; u64 segment; int ret; + char *name_format; name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); if (!name) return NULL; segment = offset >> rbd_dev->header.obj_order; - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", + name_format = "%s.%012llx"; + if (rbd_dev->image_format == 2) + name_format = "%s.%016llx"; + ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { + if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { pr_err("error formatting segment name for #%llu (%d)\n", segment, ret); kfree(name); @@ -1108,22 +1221,23 @@ static void bio_chain_put(struct bio *chain) */ static void zero_bio_chain(struct bio *chain, int start_ofs) { - struct bio_vec *bv; + struct bio_vec bv; + struct bvec_iter iter; unsigned long flags; void *buf; - int i; int pos = 0; while (chain) { - bio_for_each_segment(bv, chain, i) { - if (pos + bv->bv_len > start_ofs) { + bio_for_each_segment(bv, chain, iter) { + if (pos + bv.bv_len > start_ofs) { int remainder = max(start_ofs - pos, 0); - buf = bvec_kmap_irq(bv, &flags); + buf = bvec_kmap_irq(&bv, &flags); memset(buf + remainder, 0, - bv->bv_len - remainder); + bv.bv_len - remainder); + flush_dcache_page(bv.bv_page); bvec_kunmap_irq(buf, &flags); } - pos += bv->bv_len; + pos += bv.bv_len; } chain = chain->bi_next; @@ -1148,11 +1262,12 @@ static void zero_pages(struct page **pages, u64 offset, u64 end) unsigned long flags; void *kaddr; - page_offset = (size_t)(offset & ~PAGE_MASK); - length = min(PAGE_SIZE - page_offset, (size_t)(end - offset)); + page_offset = offset & ~PAGE_MASK; + length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); local_irq_save(flags); kaddr = kmap_atomic(*page); memset(kaddr + page_offset, 0, length); + flush_dcache_page(*page); kunmap_atomic(kaddr); local_irq_restore(flags); @@ -1170,74 +1285,14 @@ static struct bio *bio_clone_range(struct bio *bio_src, unsigned int len, gfp_t gfpmask) { - struct bio_vec *bv; - unsigned int resid; - unsigned short idx; - unsigned int voff; - unsigned short end_idx; - unsigned short vcnt; struct bio *bio; - /* Handle the easy case for the caller */ - - if (!offset && len == bio_src->bi_size) - return bio_clone(bio_src, gfpmask); - - if (WARN_ON_ONCE(!len)) - return NULL; - if (WARN_ON_ONCE(len > bio_src->bi_size)) - return NULL; - if (WARN_ON_ONCE(offset > bio_src->bi_size - len)) - return NULL; - - /* Find first affected segment... */ - - resid = offset; - bio_for_each_segment(bv, bio_src, idx) { - if (resid < bv->bv_len) - break; - resid -= bv->bv_len; - } - voff = resid; - - /* ...and the last affected segment */ - - resid += len; - __bio_for_each_segment(bv, bio_src, end_idx, idx) { - if (resid <= bv->bv_len) - break; - resid -= bv->bv_len; - } - vcnt = end_idx - idx + 1; - - /* Build the clone */ - - bio = bio_alloc(gfpmask, (unsigned int) vcnt); + bio = bio_clone(bio_src, gfpmask); if (!bio) return NULL; /* ENOMEM */ - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT); - bio->bi_rw = bio_src->bi_rw; - bio->bi_flags |= 1 << BIO_CLONED; - - /* - * Copy over our part of the bio_vec, then update the first - * and last (or only) entries. - */ - memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx], - vcnt * sizeof (struct bio_vec)); - bio->bi_io_vec[0].bv_offset += voff; - if (vcnt > 1) { - bio->bi_io_vec[0].bv_len -= voff; - bio->bi_io_vec[vcnt - 1].bv_len = resid; - } else { - bio->bi_io_vec[0].bv_len = len; - } - - bio->bi_vcnt = vcnt; - bio->bi_size = len; - bio->bi_idx = 0; + bio_advance(bio, offset); + bio->bi_iter.bi_size = len; return bio; } @@ -1268,7 +1323,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, /* Build up a chain of clone bios up to the limit */ - if (!bi || off >= bi->bi_size || !len) + if (!bi || off >= bi->bi_iter.bi_size || !len) return NULL; /* Nothing to clone */ end = &chain; @@ -1280,7 +1335,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, rbd_warn(NULL, "bio_chain exhausted with %u left", len); goto out_err; /* EINVAL; ran out of bio's */ } - bi_size = min_t(unsigned int, bi->bi_size - off, len); + bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); bio = bio_clone_range(bi, off, bi_size, gfpmask); if (!bio) goto out_err; /* ENOMEM */ @@ -1289,7 +1344,7 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, end = &bio->bi_next; off += bi_size; - if (off == bi->bi_size) { + if (off == bi->bi_iter.bi_size) { bi = bi->bi_next; off = 0; } @@ -1376,6 +1431,14 @@ static bool obj_request_exists_test(struct rbd_obj_request *obj_request) return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; } +static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) +{ + struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; + + return obj_request->img_offset < + round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); +} + static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { dout("%s: obj %p (was %d)\n", __func__, obj_request, @@ -1392,6 +1455,13 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } +static void rbd_img_request_get(struct rbd_img_request *img_request) +{ + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + kref_get(&img_request->kref); +} + static bool img_request_child_test(struct rbd_img_request *img_request); static void rbd_parent_request_destroy(struct kref *kref); static void rbd_img_request_destroy(struct kref *kref); @@ -1560,11 +1630,12 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) obj_request, obj_request->img_request, obj_request->result, xferred, length); /* - * ENOENT means a hole in the image. We zero-fill the - * entire length of the request. A short read also implies - * zero-fill to the end of the request. Either way we - * update the xferred count to indicate the whole request - * was satisfied. + * ENOENT means a hole in the image. We zero-fill the entire + * length of the request. A short read also implies zero-fill + * to the end of the request. An error requires the whole + * length of the request to be reported finished with an error + * to the block layer. In each case we update the xferred + * count to indicate the whole request was satisfied. */ rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); if (obj_request->result == -ENOENT) { @@ -1573,14 +1644,13 @@ rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) else zero_pages(obj_request->pages, 0, length); obj_request->result = 0; - obj_request->xferred = length; } else if (xferred < length && !obj_request->result) { if (obj_request->type == OBJ_REQUEST_BIO) zero_bio_chain(obj_request->bio_list, xferred); else zero_pages(obj_request->pages, xferred, length); - obj_request->xferred = length; } + obj_request->xferred = length; obj_request_done_set(obj_request); } @@ -1664,7 +1734,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, if (osd_req->r_result < 0) obj_request->result = osd_req->r_result; - BUG_ON(osd_req->r_num_ops > 2); + rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); /* * We support a 64-bit length, but ultimately it has to be @@ -1672,11 +1742,15 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, */ obj_request->xferred = osd_req->r_reply_op_len[0]; rbd_assert(obj_request->xferred < (u64)UINT_MAX); + opcode = osd_req->r_ops[0].op; switch (opcode) { case CEPH_OSD_OP_READ: rbd_osd_read_callback(obj_request); break; + case CEPH_OSD_OP_SETALLOCHINT: + rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE); + /* fall through */ case CEPH_OSD_OP_WRITE: rbd_osd_write_callback(obj_request); break; @@ -1725,9 +1799,16 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) snapc, CEPH_NOSNAP, &mtime); } +/* + * Create an osd request. A read request has one osd op (read). + * A write request has either one (watch) or two (hint+write) osd ops. + * (All rbd data writes are prefixed with an allocation hint op, but + * technically osd watch is a write request, hence this distinction.) + */ static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, bool write_request, + unsigned int num_ops, struct rbd_obj_request *obj_request) { struct ceph_snap_context *snapc = NULL; @@ -1743,10 +1824,13 @@ static struct ceph_osd_request *rbd_osd_req_create( snapc = img_request->snapc; } - /* Allocate and initialize the request, for the single op */ + rbd_assert(num_ops == 1 || (write_request && num_ops == 2)); + + /* Allocate and initialize the request, for the num_ops ops */ osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, + GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ @@ -1758,19 +1842,16 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - - osd_req->r_file_layout = rbd_dev->layout; /* struct */ + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } /* * Create a copyup osd request based on the information in the - * object request supplied. A copyup request has two osd ops, - * a copyup method call, and a "normal" write request. + * object request supplied. A copyup request has three osd ops, + * a copyup method call, a hint op, and a write op. */ static struct ceph_osd_request * rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) @@ -1786,12 +1867,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) rbd_assert(img_request); rbd_assert(img_request_write_test(img_request)); - /* Allocate and initialize the request, for the two ops */ + /* Allocate and initialize the request, for the three ops */ snapc = img_request->snapc; rbd_dev = img_request->rbd_dev; osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC); + osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ @@ -1799,11 +1880,8 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - - osd_req->r_file_layout = rbd_dev->layout; /* struct */ + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } @@ -2125,7 +2203,6 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) rbd_assert(img_request->obj_request_count > 0); rbd_assert(which != BAD_WHICH); rbd_assert(which < img_request->obj_request_count); - rbd_assert(which >= img_request->next_completion); spin_lock_irq(&img_request->completion_lock); if (which != img_request->next_completion) @@ -2145,6 +2222,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) img_request->next_completion = which; out: spin_unlock_irq(&img_request->completion_lock); + rbd_img_request_put(img_request); if (!more) rbd_img_request_complete(img_request); @@ -2166,9 +2244,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; bool write_request = img_request_write_test(img_request); - struct bio *bio_list; + struct bio *bio_list = NULL; unsigned int bio_offset = 0; - struct page **pages; + struct page **pages = NULL; u64 img_offset; u64 resid; u16 opcode; @@ -2183,7 +2261,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, if (type == OBJ_REQUEST_BIO) { bio_list = data_desc; - rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT); + rbd_assert(img_offset == + bio_list->bi_iter.bi_sector << SECTOR_SHIFT); } else { rbd_assert(type == OBJ_REQUEST_PAGES); pages = data_desc; @@ -2194,6 +2273,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, const char *object_name; u64 offset; u64 length; + unsigned int which = 0; object_name = rbd_segment_name(rbd_dev, img_offset); if (!object_name) @@ -2207,6 +2287,12 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, if (!obj_request) goto out_unwind; + /* + * set obj_request->img_request before creating the + * osd_request so that it gets the right snapc + */ + rbd_img_obj_request_add(img_request, obj_request); + if (type == OBJ_REQUEST_BIO) { unsigned int clone_size; @@ -2218,7 +2304,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, clone_size, GFP_ATOMIC); if (!obj_request->bio_list) - goto out_partial; + goto out_unwind; } else { unsigned int page_count; @@ -2231,19 +2317,28 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, } osd_req = rbd_osd_req_create(rbd_dev, write_request, - obj_request); + (write_request ? 2 : 1), + obj_request); if (!osd_req) - goto out_partial; + goto out_unwind; obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; + rbd_img_request_get(img_request); - osd_req_op_extent_init(osd_req, 0, opcode, offset, length, - 0, 0); + if (write_request) { + osd_req_op_alloc_hint_init(osd_req, which, + rbd_obj_bytes(&rbd_dev->header), + rbd_obj_bytes(&rbd_dev->header)); + which++; + } + + osd_req_op_extent_init(osd_req, which, opcode, offset, length, + 0, 0); if (type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, 0, + osd_req_op_extent_osd_data_bio(osd_req, which, obj_request->bio_list, length); else - osd_req_op_extent_osd_data_pages(osd_req, 0, + osd_req_op_extent_osd_data_pages(osd_req, which, obj_request->pages, length, offset & ~PAGE_MASK, false, false); @@ -2253,7 +2348,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, rbd_osd_req_format_read(obj_request); obj_request->img_offset = img_offset; - rbd_img_obj_request_add(img_request, obj_request); img_offset += length; resid -= length; @@ -2261,11 +2355,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, return 0; -out_partial: - rbd_obj_request_put(obj_request); out_unwind: for_each_obj_request_safe(img_request, obj_request, next_obj_request) - rbd_obj_request_put(obj_request); + rbd_img_obj_request_del(img_request, obj_request); return -ENOMEM; } @@ -2365,7 +2457,7 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) /* * The original osd request is of no use to use any more. - * We need a new one that can hold the two ops in a copyup + * We need a new one that can hold the three ops in a copyup * request. Allocate the new copyup osd request for the * original request, and release the old one. */ @@ -2384,17 +2476,22 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, false, false); - /* Then the original write request op */ + /* Then the hint op */ + + osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header), + rbd_obj_bytes(&rbd_dev->header)); + + /* And the original write request op */ offset = orig_request->offset; length = orig_request->length; - osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE, + osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE, offset, length, 0, 0); if (orig_request->type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, 1, + osd_req_op_extent_osd_data_bio(osd_req, 2, orig_request->bio_list, length); else - osd_req_op_extent_osd_data_pages(osd_req, 1, + osd_req_op_extent_osd_data_pages(osd_req, 2, orig_request->pages, length, offset & ~PAGE_MASK, false, false); @@ -2526,6 +2623,7 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) */ orig_request = obj_request->obj_request; obj_request->obj_request = NULL; + rbd_obj_request_put(orig_request); rbd_assert(orig_request); rbd_assert(orig_request->img_request); @@ -2546,7 +2644,6 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) if (!rbd_dev->parent_overlap) { struct ceph_osd_client *osdc; - rbd_obj_request_put(orig_request); osdc = &rbd_dev->rbd_client->client->osdc; result = rbd_obj_request_submit(osdc, orig_request); if (!result) @@ -2576,7 +2673,6 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) out: if (orig_request->result) rbd_obj_request_complete(orig_request); - rbd_obj_request_put(orig_request); } static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) @@ -2616,8 +2712,8 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) rbd_assert(obj_request->img_request); rbd_dev = obj_request->img_request->rbd_dev; - stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, - stat_request); + stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + stat_request); if (!stat_request->osd_req) goto out; stat_request->callback = rbd_img_obj_exists_callback; @@ -2660,7 +2756,7 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) */ if (!img_request_write_test(img_request) || !img_request_layered_test(img_request) || - rbd_dev->parent_overlap <= obj_request->img_offset || + !obj_request_overlaps_parent(obj_request) || ((known = obj_request_known_test(obj_request)) && obj_request_exists_test(obj_request))) { @@ -2808,7 +2904,7 @@ out_err: obj_request_done_set(obj_request); } -static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id) +static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) { struct rbd_obj_request *obj_request; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; @@ -2820,19 +2916,21 @@ static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id) return -ENOMEM; ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; - obj_request->callback = rbd_obj_request_put; osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, notify_id, 0, 0); rbd_osd_req_format_read(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); -out: if (ret) - rbd_obj_request_put(obj_request); + goto out; + ret = rbd_obj_request_wait(obj_request); +out: + rbd_obj_request_put(obj_request); return ret; } @@ -2850,61 +2948,61 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) (unsigned int)opcode); ret = rbd_dev_refresh(rbd_dev); if (ret) - rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret); + rbd_warn(rbd_dev, "header refresh error (%d)\n", ret); - rbd_obj_notify_ack(rbd_dev, notify_id); + rbd_obj_notify_ack_sync(rbd_dev, notify_id); } /* - * Request sync osd watch/unwatch. The value of "start" determines - * whether a watch request is being initiated or torn down. + * Initiate a watch request, synchronously. */ -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; int ret; - rbd_assert(start ^ !!rbd_dev->watch_event); - rbd_assert(start ^ !!rbd_dev->watch_request); + rbd_assert(!rbd_dev->watch_event); + rbd_assert(!rbd_dev->watch_request); - if (start) { - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, - &rbd_dev->watch_event); - if (ret < 0) - return ret; - rbd_assert(rbd_dev->watch_event != NULL); - } + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + return ret; + + rbd_assert(rbd_dev->watch_event); - ret = -ENOMEM; obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) + OBJ_REQUEST_NODATA); + if (!obj_request) { + ret = -ENOMEM; goto out_cancel; + } - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request); - if (!obj_request->osd_req) - goto out_cancel; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, + obj_request); + if (!obj_request->osd_req) { + ret = -ENOMEM; + goto out_put; + } - if (start) - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); - else - ceph_osdc_unregister_linger_request(osdc, - rbd_dev->watch_request->osd_req); + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, start ? 1 : 0); + rbd_dev->watch_event->cookie, 0, 1); rbd_osd_req_format_write(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); if (ret) - goto out_cancel; + goto out_linger; + ret = rbd_obj_request_wait(obj_request); if (ret) - goto out_cancel; + goto out_linger; + ret = obj_request->result; if (ret) - goto out_cancel; + goto out_linger; /* * A watch request is set to linger, so the underlying osd @@ -2914,26 +3012,90 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) * it. We'll drop that reference (below) after we've * unregistered it. */ - if (start) { - rbd_dev->watch_request = obj_request; + rbd_dev->watch_request = obj_request; - return 0; + return 0; + +out_linger: + ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req); +out_put: + rbd_obj_request_put(obj_request); +out_cancel: + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; + + return ret; +} + +/* + * Tear down a watch request, synchronously. + */ +static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + int ret; + + rbd_assert(rbd_dev->watch_event); + rbd_assert(rbd_dev->watch_request); + + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) { + ret = -ENOMEM; + goto out_cancel; + } + + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, + obj_request); + if (!obj_request->osd_req) { + ret = -ENOMEM; + goto out_put; } + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, 0, 0); + rbd_osd_req_format_write(obj_request); + + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out_put; + + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out_put; + + ret = obj_request->result; + if (ret) + goto out_put; + /* We have successfully torn down the watch request */ + ceph_osdc_unregister_linger_request(osdc, + rbd_dev->watch_request->osd_req); rbd_obj_request_put(rbd_dev->watch_request); rbd_dev->watch_request = NULL; + +out_put: + rbd_obj_request_put(obj_request); out_cancel: - /* Cancel the event if we're tearing down, or on error */ ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; - if (obj_request) - rbd_obj_request_put(obj_request); return ret; } +static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + int ret; + + ret = __rbd_dev_header_unwatch_sync(rbd_dev); + if (ret) { + rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", + ret); + } +} + /* * Synchronous osd object method call. Returns the number of bytes * returned in the outbound buffer, or a negative error code. @@ -2974,7 +3136,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; @@ -3024,7 +3187,6 @@ static void rbd_request_fn(struct request_queue *q) __releases(q->queue_lock) __acquires(q->queue_lock) { struct rbd_device *rbd_dev = q->queuedata; - bool read_only = rbd_dev->mapping.read_only; struct request *rq; int result; @@ -3060,7 +3222,7 @@ static void rbd_request_fn(struct request_queue *q) if (write_request) { result = -EROFS; - if (read_only) + if (rbd_dev->mapping.read_only) goto end_request; rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); } @@ -3207,7 +3369,8 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request); + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request); if (!obj_request->osd_req) goto out; @@ -3324,14 +3487,39 @@ static void rbd_exists_validate(struct rbd_device *rbd_dev) clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); } +static void rbd_dev_update_size(struct rbd_device *rbd_dev) +{ + sector_t size; + bool removing; + + /* + * Don't hold the lock while doing disk operations, + * or lock ordering will conflict with the bdev mutex via: + * rbd_add() -> blkdev_get() -> rbd_open() + */ + spin_lock_irq(&rbd_dev->lock); + removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); + spin_unlock_irq(&rbd_dev->lock); + /* + * If the device is being removed, rbd_dev->disk has + * been destroyed, so don't try to update its size + */ + if (!removing) { + size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; + dout("setting size to %llu sectors", (unsigned long long)size); + set_capacity(rbd_dev->disk, size); + revalidate_disk(rbd_dev->disk); + } +} + static int rbd_dev_refresh(struct rbd_device *rbd_dev) { u64 mapping_size; int ret; rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + down_write(&rbd_dev->header_rwsem); mapping_size = rbd_dev->mapping.size; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); if (rbd_dev->image_format == 1) ret = rbd_dev_v1_header_info(rbd_dev); else @@ -3340,14 +3528,10 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) /* If it's a mapped snapshot, validate its EXISTS flag */ rbd_exists_validate(rbd_dev); - mutex_unlock(&ctl_mutex); - if (mapping_size != rbd_dev->mapping.size) { - sector_t size; + up_write(&rbd_dev->header_rwsem); - size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; - dout("setting size to %llu sectors", (unsigned long long)size); - set_capacity(rbd_dev->disk, size); - revalidate_disk(rbd_dev->disk); + if (mapping_size != rbd_dev->mapping.size) { + rbd_dev_update_size(rbd_dev); } return ret; @@ -3360,14 +3544,18 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) u64 segment_size; /* create gendisk info */ - disk = alloc_disk(RBD_MINORS_PER_MAJOR); + disk = alloc_disk(single_major ? + (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : + RBD_MINORS_PER_MAJOR); if (!disk) return -ENOMEM; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->dev_id); disk->major = rbd_dev->major; - disk->first_minor = 0; + disk->first_minor = rbd_dev->minor; + if (single_major) + disk->flags |= GENHD_FL_EXT_DEVT; disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; @@ -3439,7 +3627,14 @@ static ssize_t rbd_major_show(struct device *dev, return sprintf(buf, "%d\n", rbd_dev->major); return sprintf(buf, "(none)\n"); +} + +static ssize_t rbd_minor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + return sprintf(buf, "%d\n", rbd_dev->minor); } static ssize_t rbd_client_id_show(struct device *dev, @@ -3561,6 +3756,7 @@ static ssize_t rbd_image_refresh(struct device *dev, static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); +static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); @@ -3574,6 +3770,7 @@ static struct attribute *rbd_attrs[] = { &dev_attr_size.attr, &dev_attr_features.attr, &dev_attr_major.attr, + &dev_attr_minor.attr, &dev_attr_client_id.attr, &dev_attr_pool.attr, &dev_attr_pool_id.attr, @@ -3701,12 +3898,14 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, if (ret < sizeof (size_buf)) return -ERANGE; - if (order) + if (order) { *order = size_buf.order; + dout(" order %u", (unsigned int)*order); + } *snap_size = le64_to_cpu(size_buf.size); - dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", - (unsigned long long)snap_id, (unsigned int)*order, + dout(" snap_id 0x%016llx snap_size = %llu\n", + (unsigned long long)snap_id, (unsigned long long)*snap_size); return 0; @@ -3804,6 +4003,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) void *end; u64 pool_id; char *image_id; + u64 snap_id; u64 overlap; int ret; @@ -3863,24 +4063,56 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) (unsigned long long)pool_id, U32_MAX); goto out_err; } - parent_spec->pool_id = pool_id; image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); goto out_err; } - parent_spec->image_id = image_id; - ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); + ceph_decode_64_safe(&p, end, snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); - if (overlap) { - rbd_spec_put(rbd_dev->parent_spec); + /* + * The parent won't change (except when the clone is + * flattened, already handled that). So we only need to + * record the parent spec we have not already done so. + */ + if (!rbd_dev->parent_spec) { + parent_spec->pool_id = pool_id; + parent_spec->image_id = image_id; + parent_spec->snap_id = snap_id; rbd_dev->parent_spec = parent_spec; parent_spec = NULL; /* rbd_dev now owns this */ - rbd_dev->parent_overlap = overlap; - } else { - rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0\n"); + } + + /* + * We always update the parent overlap. If it's zero we + * treat it specially. + */ + rbd_dev->parent_overlap = overlap; + smp_mb(); + if (!overlap) { + + /* A null parent_spec indicates it's the initial probe */ + + if (parent_spec) { + /* + * The overlap has become zero, so the clone + * must have been resized down to 0 at some + * point. Treat this the same as a flatten. + */ + rbd_dev_parent_put(rbd_dev); + pr_info("%s: clone image now standalone\n", + rbd_dev->disk->disk_name); + } else { + /* + * For the initial probe, if we find the + * overlap is zero we just pretend there was + * no parent image. + */ + rbd_warn(rbd_dev, "ignoring parent of " + "clone with overlap 0\n"); + } } out: ret = 0; @@ -4021,8 +4253,13 @@ static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) snap_id = snapc->snaps[which]; snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); - if (IS_ERR(snap_name)) - break; + if (IS_ERR(snap_name)) { + /* ignore no-longer existing snapshots */ + if (PTR_ERR(snap_name) == -ENOENT) + continue; + else + break; + } found = !strcmp(name, snap_name); kfree(snap_name); } @@ -4101,8 +4338,8 @@ static int rbd_dev_spec_update(struct rbd_device *rbd_dev) /* Look up the snapshot name, and make a copy */ snap_name = rbd_snap_name(rbd_dev, spec->snap_id); - if (!snap_name) { - ret = -ENOMEM; + if (IS_ERR(snap_name)) { + ret = PTR_ERR(snap_name); goto out_err; } @@ -4236,12 +4473,14 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) bool first_time = rbd_dev->header.object_prefix == NULL; int ret; - down_write(&rbd_dev->header_rwsem); + ret = rbd_dev_v2_image_size(rbd_dev); + if (ret) + return ret; if (first_time) { ret = rbd_dev_v2_header_onetime(rbd_dev); if (ret) - goto out; + return ret; } /* @@ -4256,7 +4495,7 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) - goto out; + return ret; /* * Print a warning if this is the initial probe and @@ -4271,18 +4510,12 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) "is EXPERIMENTAL!"); } - ret = rbd_dev_v2_image_size(rbd_dev); - if (ret) - goto out; - if (rbd_dev->spec->snap_id == CEPH_NOSNAP) if (rbd_dev->mapping.size != rbd_dev->header.image_size) rbd_dev->mapping.size = rbd_dev->header.image_size; ret = rbd_dev_v2_snap_context(rbd_dev); dout("rbd_dev_v2_snap_context returned %d\n", ret); -out: - up_write(&rbd_dev->header_rwsem); return ret; } @@ -4292,8 +4525,6 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) struct device *dev; int ret; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - dev = &rbd_dev->dev; dev->bus = &rbd_bus_type; dev->type = &rbd_device_type; @@ -4302,8 +4533,6 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev_set_name(dev, "%d", rbd_dev->dev_id); ret = device_register(dev); - mutex_unlock(&ctl_mutex); - return ret; } @@ -4312,21 +4541,29 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev) device_unregister(&rbd_dev->dev); } -static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); - /* * Get a unique rbd identifier for the given new rbd_dev, and add - * the rbd_dev to the global list. The minimum rbd id is 1. + * the rbd_dev to the global list. */ -static void rbd_dev_id_get(struct rbd_device *rbd_dev) +static int rbd_dev_id_get(struct rbd_device *rbd_dev) { - rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); + int new_dev_id; + + new_dev_id = ida_simple_get(&rbd_dev_id_ida, + 0, minor_to_rbd_dev_id(1 << MINORBITS), + GFP_KERNEL); + if (new_dev_id < 0) + return new_dev_id; + + rbd_dev->dev_id = new_dev_id; spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); spin_unlock(&rbd_dev_list_lock); - dout("rbd_dev %p given dev id %llu\n", rbd_dev, - (unsigned long long) rbd_dev->dev_id); + + dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); + + return 0; } /* @@ -4335,49 +4572,13 @@ static void rbd_dev_id_get(struct rbd_device *rbd_dev) */ static void rbd_dev_id_put(struct rbd_device *rbd_dev) { - struct list_head *tmp; - int rbd_id = rbd_dev->dev_id; - int max_id; - - rbd_assert(rbd_id > 0); - - dout("rbd_dev %p released dev id %llu\n", rbd_dev, - (unsigned long long) rbd_dev->dev_id); spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); - - /* - * If the id being "put" is not the current maximum, there - * is nothing special we need to do. - */ - if (rbd_id != atomic64_read(&rbd_dev_id_max)) { - spin_unlock(&rbd_dev_list_lock); - return; - } - - /* - * We need to update the current maximum id. Search the - * list to find out what it is. We're more likely to find - * the maximum at the end, so search the list backward. - */ - max_id = 0; - list_for_each_prev(tmp, &rbd_dev_list) { - struct rbd_device *rbd_dev; - - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->dev_id > max_id) - max_id = rbd_dev->dev_id; - } spin_unlock(&rbd_dev_list_lock); - /* - * The max id could have been updated by rbd_dev_id_get(), in - * which case it now accurately reflects the new maximum. - * Be careful not to overwrite the maximum value in that - * case. - */ - atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); - dout(" max dev id has been reset\n"); + ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); + + dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); } /* @@ -4610,6 +4811,38 @@ out_err: } /* + * Return pool id (>= 0) or a negative error code. + */ +static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) +{ + u64 newest_epoch; + unsigned long timeout = rbdc->client->options->mount_timeout * HZ; + int tries = 0; + int ret; + +again: + ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); + if (ret == -ENOENT && tries++ < 1) { + ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", + &newest_epoch); + if (ret < 0) + return ret; + + if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { + ceph_monc_request_next_osdmap(&rbdc->client->monc); + (void) ceph_monc_wait_osdmap(&rbdc->client->monc, + newest_epoch, timeout); + goto again; + } else { + /* the osdmap we have is new enough */ + return -ENOENT; + } + } + + return ret; +} + +/* * An rbd format 2 image has a unique identifier, distinct from the * name given to it by the user. Internally, that identifier is * what's used to specify the names of objects related to the image. @@ -4679,7 +4912,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) image_id = ceph_extract_encoded_string(&p, p + ret, NULL, GFP_NOIO); - ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0; + ret = PTR_ERR_OR_ZERO(image_id); if (!ret) rbd_dev->image_format = 2; } else { @@ -4697,8 +4930,10 @@ out: return ret; } -/* Undo whatever state changes are made by v1 or v2 image probe */ - +/* + * Undo whatever state changes are made by v1 or v2 header info + * call. + */ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; @@ -4798,20 +5033,29 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) { int ret; - /* generate unique id: find highest unique id, add one */ - rbd_dev_id_get(rbd_dev); + /* Get an id and fill in device name. */ + + ret = rbd_dev_id_get(rbd_dev); + if (ret) + return ret; - /* Fill in the device name, now that we have its id. */ BUILD_BUG_ON(DEV_NAME_LEN < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); - /* Get our block major device number. */ + /* Record our major and minor device numbers. */ - ret = register_blkdev(0, rbd_dev->name); - if (ret < 0) - goto err_out_id; - rbd_dev->major = ret; + if (!single_major) { + ret = register_blkdev(0, rbd_dev->name); + if (ret < 0) + goto err_out_id; + + rbd_dev->major = ret; + rbd_dev->minor = 0; + } else { + rbd_dev->major = rbd_major; + rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); + } /* Set up the blkdev mapping. */ @@ -4823,6 +5067,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) if (ret) goto err_out_disk; set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); ret = rbd_bus_add_dev(rbd_dev); if (ret) @@ -4843,7 +5088,8 @@ err_out_mapping: err_out_disk: rbd_free_disk(rbd_dev); err_out_blkdev: - unregister_blkdev(rbd_dev->major, rbd_dev->name); + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); @@ -4899,12 +5145,12 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) { int ret; - int tmp; /* - * Get the id from the image id object. If it's not a - * format 2 image, we'll get ENOENT back, and we'll assume - * it's a format 1 image. + * Get the id from the image id object. Unless there's an + * error, rbd_dev->spec->image_id will be filled in with + * a dynamically-allocated string, and rbd_dev->image_format + * will be set to either 1 or 2. */ ret = rbd_dev_image_id(rbd_dev); if (ret) @@ -4917,7 +5163,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) goto err_out_format; if (mapping) { - ret = rbd_dev_header_watch_sync(rbd_dev, true); + ret = rbd_dev_header_watch_sync(rbd_dev); if (ret) goto out_header_name; } @@ -4944,12 +5190,8 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: - if (mapping) { - tmp = rbd_dev_header_watch_sync(rbd_dev, false); - if (tmp) - rbd_warn(rbd_dev, "unable to tear down " - "watch request (%d)\n", tmp); - } + if (mapping) + rbd_dev_header_unwatch_sync(rbd_dev); out_header_name: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; @@ -4963,16 +5205,15 @@ err_out_format: return ret; } -static ssize_t rbd_add(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_add(struct bus_type *bus, + const char *buf, + size_t count) { struct rbd_device *rbd_dev = NULL; struct ceph_options *ceph_opts = NULL; struct rbd_options *rbd_opts = NULL; struct rbd_spec *spec = NULL; struct rbd_client *rbdc; - struct ceph_osd_client *osdc; bool read_only; int rc = -ENOMEM; @@ -4992,11 +5233,9 @@ static ssize_t rbd_add(struct bus_type *bus, rc = PTR_ERR(rbdc); goto err_out_args; } - ceph_opts = NULL; /* rbd_dev client now owns this */ /* pick the pool */ - osdc = &rbdc->client->osdc; - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); + rc = rbd_add_get_pool_id(rbdc, spec->pool_name); if (rc < 0) goto err_out_client; spec->pool_id = (u64)rc; @@ -5027,18 +5266,24 @@ static ssize_t rbd_add(struct bus_type *bus, rbd_dev->mapping.read_only = read_only; rc = rbd_dev_device_setup(rbd_dev); - if (!rc) - return count; + if (rc) { + /* + * rbd_dev_header_unwatch_sync() can't be moved into + * rbd_dev_image_release() without refactoring, see + * commit 1f3ef78861ac. + */ + rbd_dev_header_unwatch_sync(rbd_dev); + rbd_dev_image_release(rbd_dev); + goto err_out_module; + } + + return count; - rbd_dev_image_release(rbd_dev); err_out_rbd_dev: rbd_dev_destroy(rbd_dev); err_out_client: rbd_put_client(rbdc); err_out_args: - if (ceph_opts) - ceph_destroy_options(ceph_opts); - kfree(rbd_opts); rbd_spec_put(spec); err_out_module: module_put(THIS_MODULE); @@ -5048,21 +5293,21 @@ err_out_module: return (ssize_t)rc; } -static struct rbd_device *__rbd_get_dev(unsigned long dev_id) +static ssize_t rbd_add(struct bus_type *bus, + const char *buf, + size_t count) { - struct list_head *tmp; - struct rbd_device *rbd_dev; + if (single_major) + return -EINVAL; - spin_lock(&rbd_dev_list_lock); - list_for_each(tmp, &rbd_dev_list) { - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->dev_id == dev_id) { - spin_unlock(&rbd_dev_list_lock); - return rbd_dev; - } - } - spin_unlock(&rbd_dev_list_lock); - return NULL; + return do_rbd_add(bus, buf, count); +} + +static ssize_t rbd_add_single_major(struct bus_type *bus, + const char *buf, + size_t count) +{ + return do_rbd_add(bus, buf, count); } static void rbd_dev_device_release(struct device *dev) @@ -5072,8 +5317,8 @@ static void rbd_dev_device_release(struct device *dev) rbd_free_disk(rbd_dev); clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); rbd_dev_mapping_clear(rbd_dev); - unregister_blkdev(rbd_dev->major, rbd_dev->name); - rbd_dev->major = 0; + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); } @@ -5104,51 +5349,84 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) } } -static ssize_t rbd_remove(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_remove(struct bus_type *bus, + const char *buf, + size_t count) { struct rbd_device *rbd_dev = NULL; - int target_id; + struct list_head *tmp; + int dev_id; unsigned long ul; + bool already = false; int ret; - ret = strict_strtoul(buf, 10, &ul); + ret = kstrtoul(buf, 10, &ul); if (ret) return ret; /* convert to int; abort if we lost anything in the conversion */ - target_id = (int) ul; - if (target_id != ul) + dev_id = (int)ul; + if (dev_id != ul) return -EINVAL; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - rbd_dev = __rbd_get_dev(target_id); - if (!rbd_dev) { - ret = -ENOENT; - goto done; + ret = -ENOENT; + spin_lock(&rbd_dev_list_lock); + list_for_each(tmp, &rbd_dev_list) { + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_dev->dev_id == dev_id) { + ret = 0; + break; + } + } + if (!ret) { + spin_lock_irq(&rbd_dev->lock); + if (rbd_dev->open_count) + ret = -EBUSY; + else + already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, + &rbd_dev->flags); + spin_unlock_irq(&rbd_dev->lock); } + spin_unlock(&rbd_dev_list_lock); + if (ret < 0 || already) + return ret; - spin_lock_irq(&rbd_dev->lock); - if (rbd_dev->open_count) - ret = -EBUSY; - else - set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); - spin_unlock_irq(&rbd_dev->lock); - if (ret < 0) - goto done; + rbd_dev_header_unwatch_sync(rbd_dev); + /* + * flush remaining watch callbacks - these must be complete + * before the osd_client is shutdown + */ + dout("%s: flushing notifies", __func__); + ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); + + /* + * Don't free anything from rbd_dev->disk until after all + * notifies are completely processed. Otherwise + * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting + * in a potential use after free of rbd_dev->disk or rbd_dev. + */ rbd_bus_del_dev(rbd_dev); - ret = rbd_dev_header_watch_sync(rbd_dev, false); - if (ret) - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); rbd_dev_image_release(rbd_dev); module_put(THIS_MODULE); - ret = count; -done: - mutex_unlock(&ctl_mutex); - return ret; + return count; +} + +static ssize_t rbd_remove(struct bus_type *bus, + const char *buf, + size_t count) +{ + if (single_major) + return -EINVAL; + + return do_rbd_remove(bus, buf, count); +} + +static ssize_t rbd_remove_single_major(struct bus_type *bus, + const char *buf, + size_t count) +{ + return do_rbd_remove(bus, buf, count); } /* @@ -5196,7 +5474,7 @@ static int rbd_slab_init(void) rbd_assert(!rbd_segment_name_cache); rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", - MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); + CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); if (rbd_segment_name_cache) return 0; out_err: @@ -5232,35 +5510,57 @@ static int __init rbd_init(void) if (!libceph_compatible(NULL)) { rbd_warn(NULL, "libceph incompatibility (quitting)"); - return -EINVAL; } + rc = rbd_slab_init(); if (rc) return rc; + + if (single_major) { + rbd_major = register_blkdev(0, RBD_DRV_NAME); + if (rbd_major < 0) { + rc = rbd_major; + goto err_out_slab; + } + } + rc = rbd_sysfs_init(); if (rc) - rbd_slab_exit(); + goto err_out_blkdev; + + if (single_major) + pr_info("loaded (major %d)\n", rbd_major); else - pr_info("loaded " RBD_DRV_NAME_LONG "\n"); + pr_info("loaded\n"); + + return 0; +err_out_blkdev: + if (single_major) + unregister_blkdev(rbd_major, RBD_DRV_NAME); +err_out_slab: + rbd_slab_exit(); return rc; } static void __exit rbd_exit(void) { + ida_destroy(&rbd_dev_id_ida); rbd_sysfs_cleanup(); + if (single_major) + unregister_blkdev(rbd_major, RBD_DRV_NAME); rbd_slab_exit(); } module_init(rbd_init); module_exit(rbd_exit); +MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); -MODULE_DESCRIPTION("rados block device"); - /* following authorship retained from original osdblk.c */ MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); +MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); MODULE_LICENSE("GPL"); |
