-rw-r--r--  Documentation/ABI/testing/sysfs-bus-rbd |   2
-rw-r--r--  drivers/block/rbd.c                     | 361
-rw-r--r--  fs/ceph/debugfs.c                       |   6
-rw-r--r--  fs/ceph/dir.c                           |  24
-rw-r--r--  fs/ceph/file.c                          |  10
-rw-r--r--  fs/ceph/inode.c                         |  25
-rw-r--r--  fs/ceph/super.c                         |   9
-rw-r--r--  fs/ceph/super.h                         |  66
-rw-r--r--  include/linux/ceph/ceph_fs.h            |  19
-rw-r--r--  include/linux/ceph/libceph.h            |   1
-rw-r--r--  include/linux/ceph/osd_client.h         |  57
-rw-r--r--  include/linux/ceph/rados.h              |  39
-rw-r--r--  net/ceph/armor.c                        |   4
-rw-r--r--  net/ceph/ceph_common.c                  |   1
-rw-r--r--  net/ceph/osd_client.c                   | 624
15 files changed, 1018 insertions, 230 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd
index 90a87e2a572..fa72ccb2282 100644
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@@ -1,6 +1,6 @@
 What:		/sys/bus/rbd/
 Date:		November 2010
-Contact:	Yehuda Sadeh <yehuda@hq.newdream.net>,
+Contact:	Yehuda Sadeh <yehuda@newdream.net>,
 		Sage Weil <sage@newdream.net>
 Description:
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e1e38b11f48..16dc3645291 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -31,6 +31,7 @@
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/decode.h>
+#include <linux/parser.h>
 
 #include <linux/kernel.h>
 #include <linux/device.h>
@@ -54,6 +55,8 @@
 
 #define DEV_NAME_LEN		32
 
+#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
+
 /*
  * block device image metadata (in-memory version)
  */
@@ -71,6 +74,12 @@ struct rbd_image_header {
 	char *snap_names;
 	u64 *snap_sizes;
+
+	u64 obj_version;
+};
+
+struct rbd_options {
+	int	notify_timeout;
 };
 
 /*
@@ -78,6 +87,7 @@ struct rbd_image_header {
  */
 struct rbd_client {
 	struct ceph_client	*client;
+	struct rbd_options	*rbd_opts;
 	struct kref		kref;
 	struct list_head	node;
 };
@@ -124,6 +134,9 @@ struct rbd_device {
 	char			pool_name[RBD_MAX_POOL_NAME_LEN];
 	int			poolid;
 
+	struct ceph_osd_event   *watch_event;
+	struct ceph_osd_request *watch_request;
+
 	char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
 	u32 cur_snap;	/* index+1 of current snapshot within snap context
			   0 - for the head */
@@ -177,6 +190,8 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
 	put_device(&rbd_dev->dev);
 }
 
+static int __rbd_update_snaps(struct rbd_device *rbd_dev);
+
 static int rbd_open(struct block_device *bdev, fmode_t mode)
 {
 	struct gendisk *disk = bdev->bd_disk;
@@ -211,7 +226,8 @@ static const struct block_device_operations rbd_bd_ops = {
  * Initialize an rbd client instance.
  * We own *opt.
  */
-static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+static struct rbd_client *rbd_client_create(struct ceph_options *opt,
+					    struct rbd_options *rbd_opts)
 {
 	struct rbd_client *rbdc;
 	int ret = -ENOMEM;
@@ -233,6 +249,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt)
 	if (ret < 0)
 		goto out_err;
 
+	rbdc->rbd_opts = rbd_opts;
+
 	spin_lock(&node_lock);
 	list_add_tail(&rbdc->node, &rbd_client_list);
 	spin_unlock(&node_lock);
@@ -267,6 +285,59 @@ static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
 }
 
 /*
+ * mount options
+ */
+enum {
+	Opt_notify_timeout,
+	Opt_last_int,
+	/* int args above */
+	Opt_last_string,
+	/* string args above */
+};
+
+static match_table_t rbdopt_tokens = {
+	{Opt_notify_timeout, "notify_timeout=%d"},
+	/* int args above */
+	/* string args above */
+	{-1, NULL}
+};
+
+static int parse_rbd_opts_token(char *c, void *private)
+{
+	struct rbd_options *rbdopt = private;
+	substring_t argstr[MAX_OPT_ARGS];
+	int token, intval, ret;
+
+	token = match_token((char *)c, rbdopt_tokens, argstr);
+	if (token < 0)
+		return -EINVAL;
+
+	if (token < Opt_last_int) {
+		ret = match_int(&argstr[0], &intval);
+		if (ret < 0) {
+			pr_err("bad mount option arg (not int) "
+			       "at '%s'\n", c);
+			return ret;
+		}
+		dout("got int token %d val %d\n", token, intval);
+	} else if (token > Opt_last_int && token < Opt_last_string) {
+		dout("got string token %d val %s\n", token,
+		     argstr[0].from);
+	} else {
+		dout("got token %d\n", token);
+	}
+
+	switch (token) {
+	case Opt_notify_timeout:
+		rbdopt->notify_timeout = intval;
+		break;
+	default:
+		BUG_ON(token);
+	}
+	return 0;
+}
+
+/*
  * Get a ceph client with specific addr and configuration, if one does
  * not exist create it.
  */
@@ -276,11 +347,18 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
 	struct rbd_client *rbdc;
 	struct ceph_options *opt;
 	int ret;
+	struct rbd_options *rbd_opts;
+
+	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
+	if (!rbd_opts)
+		return -ENOMEM;
+
+	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
 
 	ret = ceph_parse_options(&opt, options, mon_addr,
-				 mon_addr + strlen(mon_addr), NULL, NULL);
+				 mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts);
 	if (ret < 0)
-		return ret;
+		goto done_err;
 
 	spin_lock(&node_lock);
 	rbdc = __rbd_client_find(opt);
@@ -296,13 +374,18 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
 	}
 	spin_unlock(&node_lock);
 
-	rbdc = rbd_client_create(opt);
-	if (IS_ERR(rbdc))
-		return PTR_ERR(rbdc);
+	rbdc = rbd_client_create(opt, rbd_opts);
+	if (IS_ERR(rbdc)) {
+		ret = PTR_ERR(rbdc);
+		goto done_err;
+	}
 
 	rbd_dev->rbd_client = rbdc;
 	rbd_dev->client = rbdc->client;
 	return 0;
+done_err:
+	kfree(rbd_opts);
+	return ret;
 }
 
@@ -318,6 +401,7 @@ static void rbd_client_release(struct kref *kref)
 	spin_unlock(&node_lock);
 
 	ceph_destroy_client(rbdc->client);
+	kfree(rbdc->rbd_opts);
 	kfree(rbdc);
 }
 
@@ -666,7 +750,9 @@ static int rbd_do_request(struct request *rq,
 			  struct ceph_osd_req_op *ops,
 			  int num_reply,
 			  void (*rbd_cb)(struct ceph_osd_request *req,
-					 struct ceph_msg *msg))
+					 struct ceph_msg *msg),
+			  struct ceph_osd_request **linger_req,
+			  u64 *ver)
 {
 	struct ceph_osd_request *req;
 	struct ceph_file_layout *layout;
@@ -729,12 +815,20 @@ static int rbd_do_request(struct request *rq,
 				req->r_oid, req->r_oid_len);
 	up_read(&header->snap_rwsem);
 
+	if (linger_req) {
+		ceph_osdc_set_request_linger(&dev->client->osdc, req);
+		*linger_req = req;
+	}
+
 	ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
 	if (ret < 0)
 		goto done_err;
 
 	if (!rbd_cb) {
 		ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+		if (ver)
+			*ver = le64_to_cpu(req->r_reassert_version.version);
+		dout("reassert_ver=%lld\n", le64_to_cpu(req->r_reassert_version.version));
 		ceph_osdc_put_request(req);
 	}
 	return ret;
@@ -789,6 +883,11 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
 	kfree(req_data);
 }
 
+static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+	ceph_osdc_put_request(req);
+}
+
 /*
  * Do a synchronous ceph osd operation
  */
@@ -801,7 +900,9 @@ static int rbd_req_sync_op(struct rbd_device *dev,
 			   int num_reply,
 			   const char *obj,
 			   u64 ofs, u64 len,
-			   char *buf)
+			   char *buf,
+			   struct ceph_osd_request **linger_req,
+			   u64 *ver)
 {
 	int ret;
 	struct page **pages;
@@ -833,7 +934,8 @@ static int rbd_req_sync_op(struct rbd_device *dev,
 			  flags,
 			  ops,
 			  2,
-			  NULL);
+			  NULL,
+			  linger_req, ver);
 	if (ret < 0)
 		goto done_ops;
@@ -893,7 +995,7 @@ static int rbd_do_op(struct request *rq,
 			     flags,
 			     ops,
 			     num_reply,
-			     rbd_req_cb);
+			     rbd_req_cb, 0, NULL);
 done:
 	kfree(seg_name);
 	return ret;
 }
@@ -940,18 +1042,174 @@ static int rbd_req_sync_read(struct rbd_device *dev,
 			  u64 snapid,
 			  const char *obj,
 			  u64 ofs, u64 len,
-			  char *buf)
+			  char *buf,
+			  u64 *ver)
 {
 	return rbd_req_sync_op(dev, NULL,
 			       (snapid ? snapid : CEPH_NOSNAP),
 			       CEPH_OSD_OP_READ,
 			       CEPH_OSD_FLAG_READ,
 			       NULL,
-			       1, obj, ofs, len, buf);
+			       1, obj, ofs, len, buf, NULL, ver);
 }
 
 /*
- * Request sync osd read
+ * Request sync osd watch
+ */
+static int rbd_req_sync_notify_ack(struct rbd_device *dev,
+				   u64 ver,
+				   u64 notify_id,
+				   const char *obj)
+{
+	struct ceph_osd_req_op *ops;
+	struct page **pages = NULL;
+	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
+	if (ret < 0)
+		return ret;
+
+	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
+	ops[0].watch.cookie = notify_id;
+	ops[0].watch.flag = 0;
+
+	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
+			  obj, 0, 0, NULL,
+			  pages, 0,
+			  CEPH_OSD_FLAG_READ,
+			  ops,
+			  1,
+			  rbd_simple_req_cb, 0, NULL);
+
+	rbd_destroy_ops(ops);
+	return ret;
+}
+
+static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+{
+	struct rbd_device *dev = (struct rbd_device *)data;
+	if (!dev)
+		return;
+
+	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
+		notify_id, (int)opcode);
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	__rbd_update_snaps(dev);
+	mutex_unlock(&ctl_mutex);
+
+	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
+}
+
+/*
+ * Request sync osd watch
+ */
+static int rbd_req_sync_watch(struct rbd_device *dev,
+			      const char *obj,
+			      u64 ver)
+{
+	struct ceph_osd_req_op *ops;
+	struct ceph_osd_client *osdc = &dev->client->osdc;
+
+	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
+	if (ret < 0)
+		return ret;
+
+	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
+				     (void *)dev, &dev->watch_event);
+	if (ret < 0)
+		goto fail;
+
+	ops[0].watch.ver = cpu_to_le64(ver);
+	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
+	ops[0].watch.flag = 1;
+
+	ret = rbd_req_sync_op(dev, NULL,
+			      CEPH_NOSNAP,
+			      0,
+			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			      ops,
+			      1, obj, 0, 0, NULL,
+			      &dev->watch_request, NULL);
+
+	if (ret < 0)
+		goto fail_event;
+
+	rbd_destroy_ops(ops);
+	return 0;
+
+fail_event:
+	ceph_osdc_cancel_event(dev->watch_event);
+	dev->watch_event = NULL;
+fail:
+	rbd_destroy_ops(ops);
+	return ret;
+}
+
+struct rbd_notify_info {
+	struct rbd_device *dev;
+};
+
+static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+{
+	struct rbd_device *dev = (struct rbd_device *)data;
+	if (!dev)
+		return;
+
+	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
+		notify_id, (int)opcode);
+}
+
+/*
+ * Request sync osd notify
+ */
+static int rbd_req_sync_notify(struct rbd_device *dev,
+			       const char *obj)
+{
+	struct ceph_osd_req_op *ops;
+	struct ceph_osd_client *osdc = &dev->client->osdc;
+	struct ceph_osd_event *event;
+	struct rbd_notify_info info;
+	int payload_len = sizeof(u32) + sizeof(u32);
+	int ret;
+
+	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
+	if (ret < 0)
+		return ret;
+
+	info.dev = dev;
+
+	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
+				     (void *)&info, &event);
+	if (ret < 0)
+		goto fail;
+
+	ops[0].watch.ver = 1;
+	ops[0].watch.flag = 1;
+	ops[0].watch.cookie = event->cookie;
+	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
+	ops[0].watch.timeout = 12;
+
+	ret = rbd_req_sync_op(dev, NULL,
+			       CEPH_NOSNAP,
+			       0,
+			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+			       ops,
+			       1, obj, 0, 0, NULL, NULL, NULL);
+	if (ret < 0)
+		goto fail_event;
+
+	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
+	dout("ceph_osdc_wait_event returned %d\n", ret);
+	rbd_destroy_ops(ops);
+	return 0;
+
+fail_event:
+	ceph_osdc_cancel_event(event);
+fail:
+	rbd_destroy_ops(ops);
+	return ret;
+}
+
+/*
+ * Request sync osd rollback
  */
 static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
 				     u64 snapid,
@@ -969,13 +1227,10 @@ static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
 			       0,
 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			       ops,
-			       1, obj, 0, 0, NULL);
+			       1, obj, 0, 0, NULL, NULL, NULL);
 
 	rbd_destroy_ops(ops);
-	if (ret < 0)
-		return ret;
-
 	return ret;
 }
 
@@ -987,7 +1242,8 @@ static int rbd_req_sync_exec(struct rbd_device *dev,
 			     const char *cls,
 			     const char *method,
 			     const char *data,
-			     int len)
+			     int len,
+			     u64 *ver)
 {
 	struct ceph_osd_req_op *ops;
 	int cls_len = strlen(cls);
@@ -1010,7 +1266,7 @@ static int rbd_req_sync_exec(struct rbd_device *dev,
 			     0,
 			     CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
 			     ops,
-			     1, obj, 0, 0, NULL);
+			     1, obj, 0, 0, NULL, NULL, ver);
 
 	rbd_destroy_ops(ops);
 
@@ -1156,6 +1412,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 	struct rbd_image_header_ondisk *dh;
 	int snap_count = 0;
 	u64 snap_names_len = 0;
+	u64 ver;
 
 	while (1) {
 		int len = sizeof(*dh) +
@@ -1171,7 +1428,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 				       NULL, CEPH_NOSNAP,
 				       rbd_dev->obj_md_name,
 				       0, len,
-				       (char *)dh);
+				       (char *)dh, &ver);
 		if (rc < 0)
 			goto out_dh;
 
@@ -1188,6 +1445,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
 		}
 		break;
 	}
+	header->obj_version = ver;
 
 out_dh:
 	kfree(dh);
@@ -1205,6 +1463,7 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	u64 new_snapid;
 	int ret;
 	void *data, *data_start, *data_end;
+	u64 ver;
 
 	/* we should create a snapshot only if we're pointing at the head */
 	if (dev->cur_snap)
@@ -1227,7 +1486,7 @@ static int rbd_header_add_snap(struct rbd_device *dev,
 	ceph_encode_64_safe(&data, data_end, new_snapid, bad);
 
 	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
-				data_start, data - data_start);
+				data_start, data - data_start, &ver);
 
 	kfree(data_start);
 
@@ -1259,6 +1518,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev)
 	int ret;
 	struct rbd_image_header h;
 	u64 snap_seq;
+	int follow_seq = 0;
 
 	ret = rbd_read_header(rbd_dev, &h);
 	if (ret < 0)
@@ -1267,6 +1527,11 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev)
 
 	down_write(&rbd_dev->header.snap_rwsem);
 	snap_seq = rbd_dev->header.snapc->seq;
+	if (rbd_dev->header.total_snaps &&
+	    rbd_dev->header.snapc->snaps[0] == snap_seq)
+		/* pointing at the head, will need to follow that
+		   if head moves */
+		follow_seq = 1;
 
 	kfree(rbd_dev->header.snapc);
 	kfree(rbd_dev->header.snap_names);
@@ -1277,7 +1542,10 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev)
 	rbd_dev->header.snap_names = h.snap_names;
 	rbd_dev->header.snap_names_len = h.snap_names_len;
 	rbd_dev->header.snap_sizes = h.snap_sizes;
-	rbd_dev->header.snapc->seq = snap_seq;
+	if (follow_seq)
+		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
+	else
+		rbd_dev->header.snapc->seq = snap_seq;
 
 	ret = __rbd_init_snaps_header(rbd_dev);
 
@@ -1699,7 +1967,28 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
 	device_unregister(&rbd_dev->dev);
 }
 
-static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count)
+static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
+{
+	int ret, rc;
+
+	do {
+		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
+					 rbd_dev->header.obj_version);
+		if (ret == -ERANGE) {
+			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+			rc = __rbd_update_snaps(rbd_dev);
+			mutex_unlock(&ctl_mutex);
+			if (rc < 0)
+				return rc;
+		}
+	} while (ret == -ERANGE);
+
+	return ret;
+}
+
+static ssize_t rbd_add(struct bus_type *bus,
+		       const char *buf,
+		       size_t count)
 {
 	struct ceph_osd_client *osdc;
 	struct rbd_device *rbd_dev;
@@ -1797,6 +2086,10 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count)
 	if (rc)
 		goto err_out_bus;
 
+	rc = rbd_init_watch_dev(rbd_dev);
+	if (rc)
+		goto err_out_bus;
+
 	return count;
 
 err_out_bus:
@@ -1849,6 +2142,12 @@ static void rbd_dev_release(struct device *dev)
 	struct rbd_device *rbd_dev =
 			container_of(dev, struct rbd_device, dev);
 
+	if (rbd_dev->watch_request)
+		ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
+						    rbd_dev->watch_request);
+	if (rbd_dev->watch_event)
+		ceph_osdc_cancel_event(rbd_dev->watch_event);
+
 	rbd_put_client(rbd_dev);
 
 	/* clean up and free blkdev */
@@ -1914,14 +2213,24 @@ static ssize_t rbd_snap_add(struct device *dev,
 	ret = rbd_header_add_snap(rbd_dev,
 				  name, GFP_KERNEL);
 	if (ret < 0)
-		goto done_unlock;
+		goto err_unlock;
 
 	ret = __rbd_update_snaps(rbd_dev);
 	if (ret < 0)
-		goto done_unlock;
+		goto err_unlock;
+
+	/* shouldn't hold ctl_mutex when notifying.. notify might
+	   trigger a watch callback that would need to get that mutex */
+	mutex_unlock(&ctl_mutex);
+
+	/* make a best effort, don't error if failed */
+	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
 
 	ret = count;
-done_unlock:
+	kfree(name);
+	return ret;
+
+err_unlock:
 	mutex_unlock(&ctl_mutex);
 	kfree(name);
 	return ret;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 08f65faac11..0dba6915712 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -210,8 +210,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_congestion_kb)
 		goto out;
 
-	dout("a\n");
-
 	snprintf(name, sizeof(name), "../../bdi/%s",
 		 dev_name(fsc->backing_dev_info.dev));
 	fsc->debugfs_bdi =
@@ -221,7 +219,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_bdi)
 		goto out;
 
-	dout("b\n");
 	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0600,
 					fsc->client->debugfs_dir,
@@ -230,7 +227,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_mdsmap)
 		goto out;
 
-	dout("ca\n");
 	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
 					0600,
 					fsc->client->debugfs_dir,
@@ -239,7 +235,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_mdsc)
 		goto out;
 
-	dout("da\n");
 	fsc->debugfs_caps = debugfs_create_file("caps",
 						0400,
 						fsc->client->debugfs_dir,
@@ -248,7 +243,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_caps)
 		goto out;
 
-	dout("ea\n");
 	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
 					0600,
 					fsc->client->debugfs_dir,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ebafa65a29b..1a867a3601a 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -161,7 +161,7 @@ more:
 	filp->f_pos = di->offset;
 	err = filldir(dirent, dentry->d_name.name,
 		      dentry->d_name.len, di->offset,
-		      dentry->d_inode->i_ino,
+		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
 		      dentry->d_inode->i_mode >> 12);
 
 	if (last) {
@@ -245,15 +245,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		dout("readdir off 0 -> '.'\n");
 		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
-			    inode->i_ino, inode->i_mode >> 12) < 0)
+			    ceph_translate_ino(inode->i_sb, inode->i_ino),
+			    inode->i_mode >> 12) < 0)
 			return 0;
 		filp->f_pos = 1;
 		off = 1;
 	}
 	if (filp->f_pos == 1) {
+		ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino;
 		dout("readdir off 1 -> '..'\n");
 		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
-			    filp->f_dentry->d_parent->d_inode->i_ino,
+			    ceph_translate_ino(inode->i_sb, ino),
 			    inode->i_mode >> 12) < 0)
 			return 0;
 		filp->f_pos = 2;
@@ -377,7 +379,8 @@ more:
 		if (filldir(dirent,
 			    rinfo->dir_dname[off - fi->offset],
 			    rinfo->dir_dname_len[off - fi->offset],
-			    pos, ino, ftype) < 0) {
+			    pos,
+			    ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
@@ -1024,14 +1027,13 @@ out_touch:
 }
 
 /*
- * When a dentry is released, clear the dir I_COMPLETE if it was part
- * of the current dir gen or if this is in the snapshot namespace.
+ * Release our ceph_dentry_info.
  */
-static void ceph_dentry_release(struct dentry *dentry)
+static void ceph_d_release(struct dentry *dentry)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
 
-	dout("dentry_release %p\n", dentry);
+	dout("d_release %p\n", dentry);
 	if (di) {
 		ceph_dentry_lru_del(dentry);
 		if (di->lease_session)
@@ -1256,14 +1258,14 @@ const struct inode_operations ceph_dir_iops = {
 
 const struct dentry_operations ceph_dentry_ops = {
 	.d_revalidate = ceph_d_revalidate,
-	.d_release = ceph_dentry_release,
+	.d_release = ceph_d_release,
 };
 
 const struct dentry_operations ceph_snapdir_dentry_ops = {
 	.d_revalidate = ceph_snapdir_d_revalidate,
-	.d_release = ceph_dentry_release,
+	.d_release = ceph_d_release,
 };
 
 const struct dentry_operations ceph_snap_dentry_ops = {
-	.d_release = ceph_dentry_release,
+	.d_release = ceph_d_release,
 };
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d0e4a82d89..159b512d5a2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -564,11 +564,19 @@ more:
 		 * start_request so that a tid has been assigned.
 		 */
 		spin_lock(&ci->i_unsafe_lock);
-		list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
+		list_add_tail(&req->r_unsafe_item,
+			      &ci->i_unsafe_writes);
 		spin_unlock(&ci->i_unsafe_lock);
 		ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 	}
+
+	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	if (ret < 0 && req->r_safe_callback) {
+		spin_lock(&ci->i_unsafe_lock);
+		list_del_init(&req->r_unsafe_item);
+		spin_unlock(&ci->i_unsafe_lock);
+		ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+	}
 	}
 
 	if (file->f_flags & O_DIRECT)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 193bfa5e9cb..b54c97da1c4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -36,6 +36,13 @@ static void ceph_vmtruncate_work(struct work_struct *work);
 /*
  * find or create an inode, given the ceph ino number
  */
+static int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	return 0;
+}
+
 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
 {
 	struct inode *inode;
@@ -1030,9 +1037,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		dout("fill_trace doing d_move %p -> %p\n",
 		     req->r_old_dentry, dn);
 
-		/* d_move screws up d_subdirs order */
-		ceph_i_clear(dir, CEPH_I_COMPLETE);
-
 		d_move(req->r_old_dentry, dn);
 		dout(" src %p '%.*s' dst %p '%.*s'\n",
 		     req->r_old_dentry,
@@ -1044,12 +1048,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		   rehashing bug in vfs_rename_dir */
 		ceph_invalidate_dentry_lease(dn);
 
-		/* take overwritten dentry's readdir offset */
-		dout("dn %p gets %p offset %lld (old offset %lld)\n",
-		     req->r_old_dentry, dn, ceph_dentry(dn)->offset,
+		/*
+		 * d_move() puts the renamed dentry at the end of
+		 * d_subdirs.  We need to assign it an appropriate
+		 * directory offset so we can behave when holding
+		 * I_COMPLETE.
+		 */
+		ceph_set_dentry_offset(req->r_old_dentry);
+		dout("dn %p gets new offset %lld\n", req->r_old_dentry,
 		     ceph_dentry(req->r_old_dentry)->offset);
-		ceph_dentry(req->r_old_dentry)->offset =
-			ceph_dentry(dn)->offset;
 
 		dn = req->r_old_dentry;  /* use old_dentry */
 		in = dn->d_inode;
@@ -1809,7 +1816,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
 	if (!err) {
 		generic_fillattr(inode, stat);
-		stat->ino = inode->i_ino;
+		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
 		if (ceph_snap(inode) != CEPH_NOSNAP)
 			stat->dev = ceph_snap(inode);
 		else
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9c5085465a6..a9e78b4a258 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,7 @@ enum {
 	Opt_rbytes,
 	Opt_norbytes,
 	Opt_noasyncreaddir,
+	Opt_ino32,
 };
 
 static match_table_t fsopt_tokens = {
@@ -150,6 +151,7 @@ static match_table_t fsopt_tokens = {
 	{Opt_rbytes, "rbytes"},
 	{Opt_norbytes, "norbytes"},
 	{Opt_noasyncreaddir, "noasyncreaddir"},
+	{Opt_ino32, "ino32"},
 	{-1, NULL}
 };
 
@@ -225,6 +227,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_noasyncreaddir:
 		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 		break;
+	case Opt_ino32:
+		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
+		break;
 	default:
 		BUG_ON(token);
 	}
@@ -288,7 +293,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 	fsopt->sb_flags = flags;
 	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
 
-	fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+	fsopt->rsize = CEPH_RSIZE_DEFAULT;
 	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
@@ -370,7 +375,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 	if (fsopt->wsize)
 		seq_printf(m, ",wsize=%d", fsopt->wsize);
-	if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
 		seq_printf(m, ",rsize=%d", fsopt->rsize);
 	if (fsopt->congestion_kb != default_congestion_kb())
 		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 20b907d76ae..619fe719968 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -27,6 +27,7 @@
 #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
 #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
 #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_INO32           (1<<8) /* 32 bit inos */
 
 #define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
 
@@ -35,6 +36,7 @@
 #define ceph_test_mount_opt(fsc, opt) \
 	(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
+#define CEPH_RSIZE_DEFAULT             (512*1024) /* readahead */
 #define CEPH_MAX_READDIR_DEFAULT        1024
 #define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
 #define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
@@ -319,6 +321,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
 	return container_of(inode, struct ceph_inode_info, vfs_inode);
 }
 
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
+{
+	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
+{
+	return (struct ceph_fs_client *)sb->s_fs_info;
+}
+
 static inline struct ceph_vino ceph_vino(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino;
@@ -327,19 +339,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
 /*
  * ino_t is <64 bits on many architectures, blech.
  *
- * don't include snap in ino hash, at least for now.
+ *               i_ino (kernel inode)   st_ino (userspace)
+ * i386          32                     32
+ * x86_64+ino32  64                     32
+ * x86_64        64                     64
+ */
+static inline u32 ceph_ino_to_ino32(ino_t ino)
+{
+	ino ^= ino >> (sizeof(ino) * 8 - 32);
+	if (!ino)
+		ino = 1;
+	return ino;
+}
+
+/*
+ * kernel i_ino value
  */
 static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
 {
 	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
 #if BITS_PER_LONG == 32
-	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
-	if (!ino)
-		ino = 1;
+	ino = ceph_ino_to_ino32(ino);
 #endif
 	return ino;
 }
 
+/*
+ * user-visible ino (stat, filldir)
+ */
+#if BITS_PER_LONG == 32
+static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
+{
+	return ino;
+}
+#else
+static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
+{
+	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
+		ino = ceph_ino_to_ino32(ino);
+	return ino;
+}
+#endif
+
+
 /* for printf-style formatting */
 #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
 
@@ -428,13 +470,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
 	return ((loff_t)frag << 32) | (loff_t)off;
 }
 
-static inline int ceph_set_ino_cb(struct inode *inode, void *data)
-{
-	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
-	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
-	return 0;
-}
-
 /*
  * caps helpers
  */
@@ -503,15 +538,6 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
 				    int *total, int *avail, int *used,
 				    int *reserved, int *min);
 
-static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
-{
-	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
-}
-
-static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
-{
-	return (struct ceph_fs_client *)sb->s_fs_info;
-}
 
 
 /*
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 09dcc0c2ffd..b8e995fbd86 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -136,9 +136,18 @@ struct ceph_dir_layout {
 
 /* osd */
-#define CEPH_MSG_OSD_MAP          41
-#define CEPH_MSG_OSD_OP           42
-#define CEPH_MSG_OSD_OPREPLY      43
+#define CEPH_MSG_OSD_MAP                41
+#define CEPH_MSG_OSD_OP                 42
+#define CEPH_MSG_OSD_OPREPLY            43
+#define CEPH_MSG_WATCH_NOTIFY           44
+
+
+/* watch-notify operations */
+enum {
+	WATCH_NOTIFY			= 1, /* notifying watcher */
+	WATCH_NOTIFY_COMPLETE		= 2, /* notifier notified when done */
+};
+
 
 /* pool operations */
 enum {
@@ -213,8 +222,10 @@ struct ceph_client_mount {
 	struct ceph_mon_request_header monhdr;
 } __attribute__ ((packed));
 
+#define CEPH_SUBSCRIBE_ONETIME    1  /* i want only 1 update after have */
+
 struct ceph_mon_subscribe_item {
-	__le64 have_version;    __le64 have;
+	__le64 have_version;	__le64 have;
 	__u8 onetime;
 } __attribute__ ((packed));
 
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.