aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-10-21 12:38:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2010-10-21 12:38:28 -0700
commit2017bd19454ea7cdae19922d15b6930f6c8088a2 (patch)
tree53974657ab3a2c98f2da7b3fcb050ff5b697f876
parent9f1ad09493451c19d00c004da479acf699eeedd6 (diff)
parentefa4c1206eaff047c474af2136748a58eb8cc33b (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (22 commits) ceph: do not carry i_lock for readdir from dcache fs/ceph/xattr.c: Use kmemdup rbd: passing wrong variable to bvec_kunmap_irq() rbd: null vs ERR_PTR ceph: fix num_pages_free accounting in pagelist ceph: add CEPH_MDS_OP_SETDIRLAYOUT and associated ioctl. ceph: don't crash when passed bad mount options ceph: fix debugfs warnings block: rbd: removing unnecessary test block: rbd: fixed may leaks ceph: switch from BKL to lock_flocks() ceph: preallocate flock state without locks held ceph: add pagelist_reserve, pagelist_truncate, pagelist_set_cursor ceph: use mapping->nrpages to determine if mapping is empty ceph: only invalidate on check_caps if we actually have pages ceph: do not hide .snap in root directory rbd: introduce rados block device (rbd), based on libceph ceph: factor out libceph from Ceph file system ceph-rbd: osdc support for osd call and rollback operations ceph: messenger and osdc changes for rbd ...
-rw-r--r--MAINTAINERS11
-rw-r--r--drivers/block/Kconfig17
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/rbd.c1841
-rw-r--r--drivers/block/rbd_types.h73
-rw-r--r--fs/ceph/Kconfig14
-rw-r--r--fs/ceph/Makefile11
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c65
-rw-r--r--fs/ceph/caps.c50
-rw-r--r--fs/ceph/ceph_frag.c3
-rw-r--r--fs/ceph/debugfs.c406
-rw-r--r--fs/ceph/dir.c97
-rw-r--r--fs/ceph/export.c5
-rw-r--r--fs/ceph/file.c207
-rw-r--r--fs/ceph/inode.c19
-rw-r--r--fs/ceph/ioctl.c77
-rw-r--r--fs/ceph/ioctl.h4
-rw-r--r--fs/ceph/locks.c23
-rw-r--r--fs/ceph/mds_client.c129
-rw-r--r--fs/ceph/mds_client.h20
-rw-r--r--fs/ceph/mdsmap.c11
-rw-r--r--fs/ceph/pagelist.c63
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c)82
-rw-r--r--fs/ceph/super.c1154
-rw-r--r--fs/ceph/super.h400
-rw-r--r--fs/ceph/xattr.c18
-rw-r--r--include/linux/ceph/auth.h (renamed from fs/ceph/auth.h)4
-rw-r--r--include/linux/ceph/buffer.h (renamed from fs/ceph/buffer.h)0
-rw-r--r--include/linux/ceph/ceph_debug.h (renamed from fs/ceph/ceph_debug.h)5
-rw-r--r--include/linux/ceph/ceph_frag.h (renamed from fs/ceph/ceph_frag.h)0
-rw-r--r--include/linux/ceph/ceph_fs.h (renamed from fs/ceph/ceph_fs.h)1
-rw-r--r--include/linux/ceph/ceph_hash.h (renamed from fs/ceph/ceph_hash.h)0
-rw-r--r--include/linux/ceph/debugfs.h33
-rw-r--r--include/linux/ceph/decode.h (renamed from fs/ceph/decode.h)5
-rw-r--r--include/linux/ceph/libceph.h249
-rw-r--r--include/linux/ceph/mdsmap.h (renamed from fs/ceph/mdsmap.h)0
-rw-r--r--include/linux/ceph/messenger.h (renamed from fs/ceph/messenger.h)12
-rw-r--r--include/linux/ceph/mon_client.h (renamed from fs/ceph/mon_client.h)1
-rw-r--r--include/linux/ceph/msgpool.h (renamed from fs/ceph/msgpool.h)0
-rw-r--r--include/linux/ceph/msgr.h (renamed from fs/ceph/msgr.h)0
-rw-r--r--include/linux/ceph/osd_client.h (renamed from fs/ceph/osd_client.h)67
-rw-r--r--include/linux/ceph/osdmap.h (renamed from fs/ceph/osdmap.h)4
-rw-r--r--include/linux/ceph/pagelist.h (renamed from fs/ceph/pagelist.h)23
-rw-r--r--include/linux/ceph/rados.h (renamed from fs/ceph/rados.h)0
-rw-r--r--include/linux/ceph/types.h (renamed from fs/ceph/types.h)0
-rw-r--r--include/linux/crush/crush.h (renamed from fs/ceph/crush/crush.h)0
-rw-r--r--include/linux/crush/hash.h (renamed from fs/ceph/crush/hash.h)0
-rw-r--r--include/linux/crush/mapper.h (renamed from fs/ceph/crush/mapper.h)0
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/ceph/Kconfig28
-rw-r--r--net/ceph/Makefile37
-rw-r--r--net/ceph/armor.c (renamed from fs/ceph/armor.c)0
-rw-r--r--net/ceph/auth.c (renamed from fs/ceph/auth.c)10
-rw-r--r--net/ceph/auth_none.c (renamed from fs/ceph/auth_none.c)7
-rw-r--r--net/ceph/auth_none.h (renamed from fs/ceph/auth_none.h)3
-rw-r--r--net/ceph/auth_x.c (renamed from fs/ceph/auth_x.c)9
-rw-r--r--net/ceph/auth_x.h (renamed from fs/ceph/auth_x.h)3
-rw-r--r--net/ceph/auth_x_protocol.h (renamed from fs/ceph/auth_x_protocol.h)0
-rw-r--r--net/ceph/buffer.c (renamed from fs/ceph/buffer.c)9
-rw-r--r--net/ceph/ceph_common.c529
-rw-r--r--net/ceph/ceph_fs.c (renamed from fs/ceph/ceph_fs.c)5
-rw-r--r--net/ceph/ceph_hash.c (renamed from fs/ceph/ceph_hash.c)2
-rw-r--r--net/ceph/ceph_strings.c84
-rw-r--r--net/ceph/crush/crush.c (renamed from fs/ceph/crush/crush.c)2
-rw-r--r--net/ceph/crush/hash.c (renamed from fs/ceph/crush/hash.c)2
-rw-r--r--net/ceph/crush/mapper.c (renamed from fs/ceph/crush/mapper.c)4
-rw-r--r--net/ceph/crypto.c (renamed from fs/ceph/crypto.c)4
-rw-r--r--net/ceph/crypto.h (renamed from fs/ceph/crypto.h)4
-rw-r--r--net/ceph/debugfs.c267
-rw-r--r--net/ceph/messenger.c (renamed from fs/ceph/messenger.c)296
-rw-r--r--net/ceph/mon_client.c (renamed from fs/ceph/mon_client.c)73
-rw-r--r--net/ceph/msgpool.c (renamed from fs/ceph/msgpool.c)4
-rw-r--r--net/ceph/osd_client.c (renamed from fs/ceph/osd_client.c)400
-rw-r--r--net/ceph/osdmap.c (renamed from fs/ceph/osdmap.c)30
-rw-r--r--net/ceph/pagelist.c154
-rw-r--r--net/ceph/pagevec.c223
79 files changed, 5380 insertions, 2046 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index f2a2b8e647c..3d4179fbc52 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1527,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
S: Supported
F: Documentation/filesystems/ceph.txt
F: fs/ceph
+F: net/ceph
+F: include/linux/ceph
CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
M: David Vrabel <david.vrabel@csr.com>
@@ -4805,6 +4807,15 @@ F: fs/qnx4/
F: include/linux/qnx4_fs.h
F: include/linux/qnxtypes.h
+RADOS BLOCK DEVICE (RBD)
+F: include/linux/qnxtypes.h
+M: Yehuda Sadeh <yehuda@hq.newdream.net>
+M: Sage Weil <sage@newdream.net>
+M: ceph-devel@vger.kernel.org
+S: Supported
+F: drivers/block/rbd.c
+F: drivers/block/rbd_types.h
+
RADEON FRAMEBUFFER DISPLAY DRIVER
M: Benjamin Herrenschmidt <benh@kernel.crashing.org>
L: linux-fbdev@vger.kernel.org
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de277689da6..4b9359a6f6c 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
If unsure, say N.
+config BLK_DEV_RBD
+ tristate "Rados block device (RBD)"
+ depends on INET && EXPERIMENTAL && BLOCK
+ select CEPH_LIB
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO
+ default n
+ help
+ Say Y here if you want include the Rados block device, which stripes
+ a block device over objects stored in the Ceph distributed object
+ store.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac925c3..d7f463d6312 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 00000000000..6ec9d53806c
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
+/*
+ rbd.c -- Export ceph rados objects as a Linux block device
+
+
+ based on drivers/block/osdblk.c:
+
+ Copyright 2009 Red Hat, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+ Instructions for use
+ --------------------
+
+ 1) Map a Linux block device to an existing rbd image.
+
+ Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
+
+ $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
+
+ The snapshot name can be "-" or omitted to map the image read/write.
+
+ 2) List all active blkdev<->object mappings.
+
+ In this example, we have performed step #1 twice, creating two blkdevs,
+ mapped to two separate rados objects in the rados rbd pool
+
+ $ cat /sys/class/rbd/list
+ #id major client_name pool name snap KB
+ 0 254 client4143 rbd foo - 1024000
+
+ The columns, in order, are:
+ - blkdev unique id
+ - blkdev assigned major
+ - rados client id
+ - rados pool name
+ - rados block device name
+ - mapped snapshot ("-" if none)
+ - device size in KB
+
+
+ 3) Create a snapshot.
+
+ Usage: <blkdev id> <snapname>
+
+ $ echo "0 mysnap" > /sys/class/rbd/snap_create
+
+
+ 4) Listing a snapshot.
+
+ $ cat /sys/class/rbd/snaps_list
+ #id snap KB
+ 0 - 1024000 (*)
+ 0 foo 1024000
+
+ The columns, in order, are:
+ - blkdev unique id
+ - snapshot name, '-' means none (active read/write version)
+ - size of device at time of snapshot
+ - the (*) indicates this is the active version
+
+ 5) Rollback to snapshot.
+
+ Usage: <blkdev id> <snapname>
+
+ $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
+
+
+ 6) Mapping an image using snapshot.
+
+ A snapshot mapping is read-only. This is being done by passing
+ snap=<snapname> to the options when adding a device.
+
+ $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
+
+
+ 7) Remove an active blkdev<->rbd image mapping.
+
+ In this example, we remove the mapping with blkdev unique id 1.
+
+ $ echo 1 > /sys/class/rbd/remove
+
+
+ NOTE: The actual creation and deletion of rados objects is outside the scope
+ of this driver.
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN 64
+#define RBD_MAX_SNAP_NAME_LEN 32
+#define RBD_MAX_OPT_LEN 1024
+
+#define RBD_SNAP_HEAD_NAME "-"
+
+#define DEV_NAME_LEN 32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+ u64 image_size;
+ char block_name[32];
+ __u8 obj_order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ struct rw_semaphore snap_rwsem;
+ struct ceph_snap_context *snapc;
+ size_t snap_names_len;
+ u64 snap_seq;
+ u32 total_snaps;
+
+ char *snap_names;
+ u64 *snap_sizes;
+};
+
+/*
+ * an instance of the client. multiple devices may share a client.
+ */
+struct rbd_client {
+ struct ceph_client *client;
+ struct kref kref;
+ struct list_head node;
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+ struct request *rq; /* blk layer request */
+ struct bio *bio; /* cloned bio */
+ struct page **pages; /* list of used pages */
+ u64 len;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+ int id; /* blkdev unique id */
+
+ int major; /* blkdev assigned major */
+ struct gendisk *disk; /* blkdev's gendisk and rq */
+ struct request_queue *q;
+
+ struct ceph_client *client;
+ struct rbd_client *rbd_client;
+
+ char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+ spinlock_t lock; /* queue lock */
+
+ struct rbd_image_header header;
+ char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+ int obj_len;
+ char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+ char pool_name[RBD_MAX_POOL_NAME_LEN];
+ int poolid;
+
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
+ u32 cur_snap; /* index+1 of current snapshot within snap context
+ 0 - for the head */
+ int read_only;
+
+ struct list_head node;
+};
+
+static spinlock_t node_lock; /* protects client get/put */
+
+static struct class *class_rbd; /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list); /* devices */
+static LIST_HEAD(rbd_client_list); /* clients */
+
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct rbd_device *rbd_dev = disk->private_data;
+
+ set_device_ro(bdev, rbd_dev->read_only);
+
+ if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+ return -EROFS;
+
+ return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+ .owner = THIS_MODULE,
+ .open = rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+ struct rbd_client *rbdc;
+ int ret = -ENOMEM;
+
+ dout("rbd_client_create\n");
+ rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+ if (!rbdc)
+ goto out_opt;
+
+ kref_init(&rbdc->kref);
+ INIT_LIST_HEAD(&rbdc->node);
+
+ rbdc->client = ceph_create_client(opt, rbdc);
+ if (IS_ERR(rbdc->client))
+ goto out_rbdc;
+ opt = NULL; /* Now rbdc->client is responsible for opt */
+
+ ret = ceph_open_session(rbdc->client);
+ if (ret < 0)
+ goto out_err;
+
+ spin_lock(&node_lock);
+ list_add_tail(&rbdc->node, &rbd_client_list);
+ spin_unlock(&node_lock);
+
+ dout("rbd_client_create created %p\n", rbdc);
+ return rbdc;
+
+out_err:
+ ceph_destroy_client(rbdc->client);
+out_rbdc:
+ kfree(rbdc);
+out_opt:
+ if (opt)
+ ceph_destroy_options(opt);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+ struct rbd_client *client_node;
+
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ return NULL;
+
+ list_for_each_entry(client_node, &rbd_client_list, node)
+ if (ceph_compare_options(opt, client_node->client) == 0)
+ return client_node;
+ return NULL;
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+ char *options)
+{
+ struct rbd_client *rbdc;
+ struct ceph_options *opt;
+ int ret;
+
+ ret = ceph_parse_options(&opt, options, mon_addr,
+ mon_addr + strlen(mon_addr), NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ spin_lock(&node_lock);
+ rbdc = __rbd_client_find(opt);
+ if (rbdc) {
+ ceph_destroy_options(opt);
+
+ /* using an existing client */
+ kref_get(&rbdc->kref);
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ spin_unlock(&node_lock);
+ return 0;
+ }
+ spin_unlock(&node_lock);
+
+ rbdc = rbd_client_create(opt);
+ if (IS_ERR(rbdc))
+ return PTR_ERR(rbdc);
+
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ return 0;
+}
+
+/*
+ * Destroy ceph client
+ */
+static void rbd_client_release(struct kref *kref)
+{
+ struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+ dout("rbd_release_client %p\n", rbdc);
+ spin_lock(&node_lock);
+ list_del(&rbdc->node);
+ spin_unlock(&node_lock);
+
+ ceph_destroy_client(rbdc->client);
+ kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+ kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+ rbd_dev->rbd_client = NULL;
+ rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+ struct rbd_image_header_ondisk *ondisk,
+ int allocated_snaps,
+ gfp_t gfp_flags)
+{
+ int i;
+ u32 snap_count = le32_to_cpu(ondisk->snap_count);
+ int ret = -ENOMEM;
+
+ init_rwsem(&header->snap_rwsem);
+
+ header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+ header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+ snap_count *
+ sizeof(struct rbd_image_snap_ondisk),
+ gfp_flags);
+ if (!header->snapc)
+ return -ENOMEM;
+ if (snap_count) {
+ header->snap_names = kmalloc(header->snap_names_len,
+ GFP_KERNEL);
+ if (!header->snap_names)
+ goto err_snapc;
+ header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+ GFP_KERNEL);
+ if (!header->snap_sizes)
+ goto err_names;
+ } else {
+ header->snap_names = NULL;
+ header->snap_sizes = NULL;
+ }
+ memcpy(header->block_name, ondisk->block_name,
+ sizeof(ondisk->block_name));
+
+ header->image_size = le64_to_cpu(ondisk->image_size);
+ header->obj_order = ondisk->options.order;
+ header->crypt_type = ondisk->options.crypt_type;
+ header->comp_type = ondisk->options.comp_type;
+
+ atomic_set(&header->snapc->nref, 1);
+ header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+ header->snapc->num_snaps = snap_count;
+ header->total_snaps = snap_count;
+
+ if (snap_count &&
+ allocated_snaps == snap_count) {
+ for (i = 0; i < snap_count; i++) {
+ header->snapc->snaps[i] =
+ le64_to_cpu(ondisk->snaps[i].id);
+ header->snap_sizes[i] =
+ le64_to_cpu(ondisk->snaps[i].image_size);
+ }
+
+ /* copy snapshot names */
+ memcpy(header->snap_names, &ondisk->snaps[i],
+ header->snap_names_len);
+ }
+
+ return 0;
+
+err_names:
+ kfree(header->snap_names);
+err_snapc:
+ kfree(header->snapc);
+ return ret;
+}
+
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+ return header->total_snaps - snap_num;
+}
+