diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 12:38:28 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 12:38:28 -0700 |
commit | 2017bd19454ea7cdae19922d15b6930f6c8088a2 (patch) | |
tree | 53974657ab3a2c98f2da7b3fcb050ff5b697f876 | |
parent | 9f1ad09493451c19d00c004da479acf699eeedd6 (diff) | |
parent | efa4c1206eaff047c474af2136748a58eb8cc33b (diff) | |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (22 commits)
ceph: do not carry i_lock for readdir from dcache
fs/ceph/xattr.c: Use kmemdup
rbd: passing wrong variable to bvec_kunmap_irq()
rbd: null vs ERR_PTR
ceph: fix num_pages_free accounting in pagelist
ceph: add CEPH_MDS_OP_SETDIRLAYOUT and associated ioctl.
ceph: don't crash when passed bad mount options
ceph: fix debugfs warnings
block: rbd: removing unnecessary test
block: rbd: fixed may leaks
ceph: switch from BKL to lock_flocks()
ceph: preallocate flock state without locks held
ceph: add pagelist_reserve, pagelist_truncate, pagelist_set_cursor
ceph: use mapping->nrpages to determine if mapping is empty
ceph: only invalidate on check_caps if we actually have pages
ceph: do not hide .snap in root directory
rbd: introduce rados block device (rbd), based on libceph
ceph: factor out libceph from Ceph file system
ceph-rbd: osdc support for osd call and rollback operations
ceph: messenger and osdc changes for rbd
...
-rw-r--r-- | MAINTAINERS | 11 | ||||
-rw-r--r-- | drivers/block/Kconfig | 17 | ||||
-rw-r--r-- | drivers/block/Makefile | 1 | ||||
-rw-r--r-- | drivers/block/rbd.c | 1841 | ||||
-rw-r--r-- | drivers/block/rbd_types.h | 73 | ||||
-rw-r--r-- | fs/ceph/Kconfig | 14 | ||||
-rw-r--r-- | fs/ceph/Makefile | 11 | ||||
-rw-r--r-- | fs/ceph/README | 20 | ||||
-rw-r--r-- | fs/ceph/addr.c | 65 | ||||
-rw-r--r-- | fs/ceph/caps.c | 50 | ||||
-rw-r--r-- | fs/ceph/ceph_frag.c | 3 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 406 | ||||
-rw-r--r-- | fs/ceph/dir.c | 97 | ||||
-rw-r--r-- | fs/ceph/export.c | 5 | ||||
-rw-r--r-- | fs/ceph/file.c | 207 | ||||
-rw-r--r-- | fs/ceph/inode.c | 19 | ||||
-rw-r--r-- | fs/ceph/ioctl.c | 77 | ||||
-rw-r--r-- | fs/ceph/ioctl.h | 4 | ||||
-rw-r--r-- | fs/ceph/locks.c | 23 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 129 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 20 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 11 | ||||
-rw-r--r-- | fs/ceph/pagelist.c | 63 | ||||
-rw-r--r-- | fs/ceph/snap.c | 10 | ||||
-rw-r--r-- | fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c) | 82 | ||||
-rw-r--r-- | fs/ceph/super.c | 1154 | ||||
-rw-r--r-- | fs/ceph/super.h | 400 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 18 | ||||
-rw-r--r-- | include/linux/ceph/auth.h (renamed from fs/ceph/auth.h) | 4 | ||||
-rw-r--r-- | include/linux/ceph/buffer.h (renamed from fs/ceph/buffer.h) | 0 | ||||
-rw-r--r-- | include/linux/ceph/ceph_debug.h (renamed from fs/ceph/ceph_debug.h) | 5 | ||||
-rw-r--r-- | include/linux/ceph/ceph_frag.h (renamed from fs/ceph/ceph_frag.h) | 0 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h (renamed from fs/ceph/ceph_fs.h) | 1 | ||||
-rw-r--r-- | include/linux/ceph/ceph_hash.h (renamed from fs/ceph/ceph_hash.h) | 0 | ||||
-rw-r--r-- | include/linux/ceph/debugfs.h | 33 | ||||
-rw-r--r-- | include/linux/ceph/decode.h (renamed from fs/ceph/decode.h) | 5 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 249 | ||||
-rw-r--r-- | include/linux/ceph/mdsmap.h (renamed from fs/ceph/mdsmap.h) | 0 | ||||
-rw-r--r-- | include/linux/ceph/messenger.h (renamed from fs/ceph/messenger.h) | 12 | ||||
-rw-r--r-- | include/linux/ceph/mon_client.h (renamed from fs/ceph/mon_client.h) | 1 | ||||
-rw-r--r-- | include/linux/ceph/msgpool.h (renamed from fs/ceph/msgpool.h) | 0 | ||||
-rw-r--r-- | include/linux/ceph/msgr.h (renamed from fs/ceph/msgr.h) | 0 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h (renamed from fs/ceph/osd_client.h) | 67 | ||||
-rw-r--r-- | include/linux/ceph/osdmap.h (renamed from fs/ceph/osdmap.h) | 4 | ||||
-rw-r--r-- | include/linux/ceph/pagelist.h (renamed from fs/ceph/pagelist.h) | 23 | ||||
-rw-r--r-- | include/linux/ceph/rados.h (renamed from fs/ceph/rados.h) | 0 | ||||
-rw-r--r-- | include/linux/ceph/types.h (renamed from fs/ceph/types.h) | 0 | ||||
-rw-r--r-- | include/linux/crush/crush.h (renamed from fs/ceph/crush/crush.h) | 0 | ||||
-rw-r--r-- | include/linux/crush/hash.h (renamed from fs/ceph/crush/hash.h) | 0 | ||||
-rw-r--r-- | include/linux/crush/mapper.h (renamed from fs/ceph/crush/mapper.h) | 0 | ||||
-rw-r--r-- | net/Kconfig | 1 | ||||
-rw-r--r-- | net/Makefile | 1 | ||||
-rw-r--r-- | net/ceph/Kconfig | 28 | ||||
-rw-r--r-- | net/ceph/Makefile | 37 | ||||
-rw-r--r-- | net/ceph/armor.c (renamed from fs/ceph/armor.c) | 0 | ||||
-rw-r--r-- | net/ceph/auth.c (renamed from fs/ceph/auth.c) | 10 | ||||
-rw-r--r-- | net/ceph/auth_none.c (renamed from fs/ceph/auth_none.c) | 7 | ||||
-rw-r--r-- | net/ceph/auth_none.h (renamed from fs/ceph/auth_none.h) | 3 | ||||
-rw-r--r-- | net/ceph/auth_x.c (renamed from fs/ceph/auth_x.c) | 9 | ||||
-rw-r--r-- | net/ceph/auth_x.h (renamed from fs/ceph/auth_x.h) | 3 | ||||
-rw-r--r-- | net/ceph/auth_x_protocol.h (renamed from fs/ceph/auth_x_protocol.h) | 0 | ||||
-rw-r--r-- | net/ceph/buffer.c (renamed from fs/ceph/buffer.c) | 9 | ||||
-rw-r--r-- | net/ceph/ceph_common.c | 529 | ||||
-rw-r--r-- | net/ceph/ceph_fs.c (renamed from fs/ceph/ceph_fs.c) | 5 | ||||
-rw-r--r-- | net/ceph/ceph_hash.c (renamed from fs/ceph/ceph_hash.c) | 2 | ||||
-rw-r--r-- | net/ceph/ceph_strings.c | 84 | ||||
-rw-r--r-- | net/ceph/crush/crush.c (renamed from fs/ceph/crush/crush.c) | 2 | ||||
-rw-r--r-- | net/ceph/crush/hash.c (renamed from fs/ceph/crush/hash.c) | 2 | ||||
-rw-r--r-- | net/ceph/crush/mapper.c (renamed from fs/ceph/crush/mapper.c) | 4 | ||||
-rw-r--r-- | net/ceph/crypto.c (renamed from fs/ceph/crypto.c) | 4 | ||||
-rw-r--r-- | net/ceph/crypto.h (renamed from fs/ceph/crypto.h) | 4 | ||||
-rw-r--r-- | net/ceph/debugfs.c | 267 | ||||
-rw-r--r-- | net/ceph/messenger.c (renamed from fs/ceph/messenger.c) | 296 | ||||
-rw-r--r-- | net/ceph/mon_client.c (renamed from fs/ceph/mon_client.c) | 73 | ||||
-rw-r--r-- | net/ceph/msgpool.c (renamed from fs/ceph/msgpool.c) | 4 | ||||
-rw-r--r-- | net/ceph/osd_client.c (renamed from fs/ceph/osd_client.c) | 400 | ||||
-rw-r--r-- | net/ceph/osdmap.c (renamed from fs/ceph/osdmap.c) | 30 | ||||
-rw-r--r-- | net/ceph/pagelist.c | 154 | ||||
-rw-r--r-- | net/ceph/pagevec.c | 223 |
79 files changed, 5380 insertions, 2046 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index f2a2b8e647c..3d4179fbc52 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1527,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: Documentation/filesystems/ceph.txt F: fs/ceph +F: net/ceph +F: include/linux/ceph CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: M: David Vrabel <david.vrabel@csr.com> @@ -4805,6 +4807,15 @@ F: fs/qnx4/ F: include/linux/qnx4_fs.h F: include/linux/qnxtypes.h +RADOS BLOCK DEVICE (RBD) +F: include/linux/qnxtypes.h +M: Yehuda Sadeh <yehuda@hq.newdream.net> +M: Sage Weil <sage@newdream.net> +M: ceph-devel@vger.kernel.org +S: Supported +F: drivers/block/rbd.c +F: drivers/block/rbd_types.h + RADEON FRAMEBUFFER DISPLAY DRIVER M: Benjamin Herrenschmidt <benh@kernel.crashing.org> L: linux-fbdev@vger.kernel.org diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index de277689da6..4b9359a6f6c 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -488,4 +488,21 @@ config BLK_DEV_HD If unsure, say N. +config BLK_DEV_RBD + tristate "Rados block device (RBD)" + depends on INET && EXPERIMENTAL && BLOCK + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Say Y here if you want include the Rados block device, which stripes + a block device over objects stored in the Ceph distributed object + store. + + More information at http://ceph.newdream.net/. + + If unsure, say N. 
+ endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index aff5ac925c3..d7f463d6312 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ +obj-$(CONFIG_BLK_DEV_RBD) += rbd.o swim_mod-objs := swim.o swim_asm.o diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c new file mode 100644 index 00000000000..6ec9d53806c --- /dev/null +++ b/drivers/block/rbd.c @@ -0,0 +1,1841 @@ +/* + rbd.c -- Export ceph rados objects as a Linux block device + + + based on drivers/block/osdblk.c: + + Copyright 2009 Red Hat, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + + + Instructions for use + -------------------- + + 1) Map a Linux block device to an existing rbd image. + + Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name] + + $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add + + The snapshot name can be "-" or omitted to map the image read/write. + + 2) List all active blkdev<->object mappings. 
+ + In this example, we have performed step #1 twice, creating two blkdevs, + mapped to two separate rados objects in the rados rbd pool + + $ cat /sys/class/rbd/list + #id major client_name pool name snap KB + 0 254 client4143 rbd foo - 1024000 + + The columns, in order, are: + - blkdev unique id + - blkdev assigned major + - rados client id + - rados pool name + - rados block device name + - mapped snapshot ("-" if none) + - device size in KB + + + 3) Create a snapshot. + + Usage: <blkdev id> <snapname> + + $ echo "0 mysnap" > /sys/class/rbd/snap_create + + + 4) Listing a snapshot. + + $ cat /sys/class/rbd/snaps_list + #id snap KB + 0 - 1024000 (*) + 0 foo 1024000 + + The columns, in order, are: + - blkdev unique id + - snapshot name, '-' means none (active read/write version) + - size of device at time of snapshot + - the (*) indicates this is the active version + + 5) Rollback to snapshot. + + Usage: <blkdev id> <snapname> + + $ echo "0 mysnap" > /sys/class/rbd/snap_rollback + + + 6) Mapping an image using snapshot. + + A snapshot mapping is read-only. This is being done by passing + snap=<snapname> to the options when adding a device. + + $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add + + + 7) Remove an active blkdev<->rbd image mapping. + + In this example, we remove the mapping with blkdev unique id 1. + + $ echo 1 > /sys/class/rbd/remove + + + NOTE: The actual creation and deletion of rados objects is outside the scope + of this driver. 
+ + */ + +#include <linux/ceph/libceph.h> +#include <linux/ceph/osd_client.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/decode.h> + +#include <linux/kernel.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/blkdev.h> + +#include "rbd_types.h" + +#define DRV_NAME "rbd" +#define DRV_NAME_LONG "rbd (rados block device)" + +#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ + +#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX)) +#define RBD_MAX_POOL_NAME_LEN 64 +#define RBD_MAX_SNAP_NAME_LEN 32 +#define RBD_MAX_OPT_LEN 1024 + +#define RBD_SNAP_HEAD_NAME "-" + +#define DEV_NAME_LEN 32 + +/* + * block device image metadata (in-memory version) + */ +struct rbd_image_header { + u64 image_size; + char block_name[32]; + __u8 obj_order; + __u8 crypt_type; + __u8 comp_type; + struct rw_semaphore snap_rwsem; + struct ceph_snap_context *snapc; + size_t snap_names_len; + u64 snap_seq; + u32 total_snaps; + + char *snap_names; + u64 *snap_sizes; +}; + +/* + * an instance of the client. multiple devices may share a client. + */ +struct rbd_client { + struct ceph_client *client; + struct kref kref; + struct list_head node; +}; + +/* + * a single io request + */ +struct rbd_request { + struct request *rq; /* blk layer request */ + struct bio *bio; /* cloned bio */ + struct page **pages; /* list of used pages */ + u64 len; +}; + +/* + * a single device + */ +struct rbd_device { + int id; /* blkdev unique id */ + + int major; /* blkdev assigned major */ + struct gendisk *disk; /* blkdev's gendisk and rq */ + struct request_queue *q; + + struct ceph_client *client; + struct rbd_client *rbd_client; + + char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ + + spinlock_t lock; /* queue lock */ + + struct rbd_image_header header; + char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ + int obj_len; + char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. 
*/ + char pool_name[RBD_MAX_POOL_NAME_LEN]; + int poolid; + + char snap_name[RBD_MAX_SNAP_NAME_LEN]; + u32 cur_snap; /* index+1 of current snapshot within snap context + 0 - for the head */ + int read_only; + + struct list_head node; +}; + +static spinlock_t node_lock; /* protects client get/put */ + +static struct class *class_rbd; /* /sys/class/rbd */ +static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ +static LIST_HEAD(rbd_dev_list); /* devices */ +static LIST_HEAD(rbd_client_list); /* clients */ + + +static int rbd_open(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + struct rbd_device *rbd_dev = disk->private_data; + + set_device_ro(bdev, rbd_dev->read_only); + + if ((mode & FMODE_WRITE) && rbd_dev->read_only) + return -EROFS; + + return 0; +} + +static const struct block_device_operations rbd_bd_ops = { + .owner = THIS_MODULE, + .open = rbd_open, +}; + +/* + * Initialize an rbd client instance. + * We own *opt. + */ +static struct rbd_client *rbd_client_create(struct ceph_options *opt) +{ + struct rbd_client *rbdc; + int ret = -ENOMEM; + + dout("rbd_client_create\n"); + rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); + if (!rbdc) + goto out_opt; + + kref_init(&rbdc->kref); + INIT_LIST_HEAD(&rbdc->node); + + rbdc->client = ceph_create_client(opt, rbdc); + if (IS_ERR(rbdc->client)) + goto out_rbdc; + opt = NULL; /* Now rbdc->client is responsible for opt */ + + ret = ceph_open_session(rbdc->client); + if (ret < 0) + goto out_err; + + spin_lock(&node_lock); + list_add_tail(&rbdc->node, &rbd_client_list); + spin_unlock(&node_lock); + + dout("rbd_client_create created %p\n", rbdc); + return rbdc; + +out_err: + ceph_destroy_client(rbdc->client); +out_rbdc: + kfree(rbdc); +out_opt: + if (opt) + ceph_destroy_options(opt); + return ERR_PTR(ret); +} + +/* + * Find a ceph client with specific addr and configuration. 
+ */ +static struct rbd_client *__rbd_client_find(struct ceph_options *opt) +{ + struct rbd_client *client_node; + + if (opt->flags & CEPH_OPT_NOSHARE) + return NULL; + + list_for_each_entry(client_node, &rbd_client_list, node) + if (ceph_compare_options(opt, client_node->client) == 0) + return client_node; + return NULL; +} + +/* + * Get a ceph client with specific addr and configuration, if one does + * not exist create it. + */ +static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, + char *options) +{ + struct rbd_client *rbdc; + struct ceph_options *opt; + int ret; + + ret = ceph_parse_options(&opt, options, mon_addr, + mon_addr + strlen(mon_addr), NULL, NULL); + if (ret < 0) + return ret; + + spin_lock(&node_lock); + rbdc = __rbd_client_find(opt); + if (rbdc) { + ceph_destroy_options(opt); + + /* using an existing client */ + kref_get(&rbdc->kref); + rbd_dev->rbd_client = rbdc; + rbd_dev->client = rbdc->client; + spin_unlock(&node_lock); + return 0; + } + spin_unlock(&node_lock); + + rbdc = rbd_client_create(opt); + if (IS_ERR(rbdc)) + return PTR_ERR(rbdc); + + rbd_dev->rbd_client = rbdc; + rbd_dev->client = rbdc->client; + return 0; +} + +/* + * Destroy ceph client + */ +static void rbd_client_release(struct kref *kref) +{ + struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); + + dout("rbd_release_client %p\n", rbdc); + spin_lock(&node_lock); + list_del(&rbdc->node); + spin_unlock(&node_lock); + + ceph_destroy_client(rbdc->client); + kfree(rbdc); +} + +/* + * Drop reference to ceph client node. If it's not referenced anymore, release + * it. + */ +static void rbd_put_client(struct rbd_device *rbd_dev) +{ + kref_put(&rbd_dev->rbd_client->kref, rbd_client_release); + rbd_dev->rbd_client = NULL; + rbd_dev->client = NULL; +} + + +/* + * Create a new header structure, translate header format from the on-disk + * header. 
+ */ +static int rbd_header_from_disk(struct rbd_image_header *header, + struct rbd_image_header_ondisk *ondisk, + int allocated_snaps, + gfp_t gfp_flags) +{ + int i; + u32 snap_count = le32_to_cpu(ondisk->snap_count); + int ret = -ENOMEM; + + init_rwsem(&header->snap_rwsem); + + header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); + header->snapc = kmalloc(sizeof(struct ceph_snap_context) + + snap_count * + sizeof(struct rbd_image_snap_ondisk), + gfp_flags); + if (!header->snapc) + return -ENOMEM; + if (snap_count) { + header->snap_names = kmalloc(header->snap_names_len, + GFP_KERNEL); + if (!header->snap_names) + goto err_snapc; + header->snap_sizes = kmalloc(snap_count * sizeof(u64), + GFP_KERNEL); + if (!header->snap_sizes) + goto err_names; + } else { + header->snap_names = NULL; + header->snap_sizes = NULL; + } + memcpy(header->block_name, ondisk->block_name, + sizeof(ondisk->block_name)); + + header->image_size = le64_to_cpu(ondisk->image_size); + header->obj_order = ondisk->options.order; + header->crypt_type = ondisk->options.crypt_type; + header->comp_type = ondisk->options.comp_type; + + atomic_set(&header->snapc->nref, 1); + header->snap_seq = le64_to_cpu(ondisk->snap_seq); + header->snapc->num_snaps = snap_count; + header->total_snaps = snap_count; + + if (snap_count && + allocated_snaps == snap_count) { + for (i = 0; i < snap_count; i++) { + header->snapc->snaps[i] = + le64_to_cpu(ondisk->snaps[i].id); + header->snap_sizes[i] = + le64_to_cpu(ondisk->snaps[i].image_size); + } + + /* copy snapshot names */ + memcpy(header->snap_names, &ondisk->snaps[i], + header->snap_names_len); + } + + return 0; + +err_names: + kfree(header->snap_names); +err_snapc: + kfree(header->snapc); + return ret; +} + +static int snap_index(struct rbd_image_header *header, int snap_num) +{ + return header->total_snaps - snap_num; +} + |