aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-05-06 13:11:19 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2013-05-06 13:11:19 -0700
commit91f8575685e35f3bd021286bc82d26397458f5a9 (patch)
tree09de8d889758a12071adb9427ed741e27c907aa6 /net
parent2e378f3eebd28feefbb1f9953834a5a19482f053 (diff)
parentb5b09be30cf99f9c699e825629f02e3bce555d44 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph changes from Alex Elder: "This is a big pull. Most of it is culmination of Alex's work to implement RBD image layering, which is now complete (yay!). There is also some work from Yan to fix i_mutex behavior surrounding writes in cephfs, a sync write fix, a fix for RBD images that get resized while they are mapped, and a few patches from me that resolve annoying auth warnings and fix several bugs in the ceph auth code." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (254 commits) rbd: fix image request leak on parent read libceph: use slab cache for osd client requests libceph: allocate ceph message data with a slab allocator libceph: allocate ceph messages with a slab allocator rbd: allocate image object names with a slab allocator rbd: allocate object requests with a slab allocator rbd: allocate name separate from obj_request rbd: allocate image requests with a slab allocator rbd: use binary search for snapshot lookup rbd: clear EXISTS flag if mapped snapshot disappears rbd: kill off the snapshot list rbd: define rbd_snap_size() and rbd_snap_features() rbd: use snap_id not index to look up snap info rbd: look up snapshot name in names buffer rbd: drop obj_request->version rbd: drop rbd_obj_method_sync() version parameter rbd: more version parameter removal rbd: get rid of some version parameters rbd: stop tracking header object version rbd: snap names are pointer to constant data ...
Diffstat (limited to 'net')
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/auth.c117
-rw-r--r--net/ceph/auth_x.c24
-rw-r--r--net/ceph/auth_x.h1
-rw-r--r--net/ceph/ceph_common.c7
-rw-r--r--net/ceph/debugfs.c4
-rw-r--r--net/ceph/messenger.c1019
-rw-r--r--net/ceph/mon_client.c7
-rw-r--r--net/ceph/osd_client.c1087
-rw-r--r--net/ceph/osdmap.c45
-rw-r--r--net/ceph/snapshot.c78
11 files changed, 1665 insertions, 726 deletions
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index e87ef435e11..958d9856912 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
crypto.o armor.o \
auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o \
- pagevec.o
+ pagevec.o snapshot.o
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index b4bf4ac090f..6b923bcaa2a 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -47,6 +47,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp
if (!ac)
goto out;
+ mutex_init(&ac->mutex);
ac->negotiating = true;
if (name)
ac->name = name;
@@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac)
*/
void ceph_auth_reset(struct ceph_auth_client *ac)
{
+ mutex_lock(&ac->mutex);
dout("auth_reset %p\n", ac);
if (ac->ops && !ac->negotiating)
ac->ops->reset(ac);
ac->negotiating = true;
+ mutex_unlock(&ac->mutex);
}
int ceph_entity_name_encode(const char *name, void **p, void *end)
@@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
int i, num;
int ret;
+ mutex_lock(&ac->mutex);
dout("auth_build_hello\n");
monhdr->have_version = 0;
monhdr->session_mon = cpu_to_le16(-1);
@@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
ret = ceph_entity_name_encode(ac->name, &p, end);
if (ret < 0)
- return ret;
+ goto out;
ceph_decode_need(&p, end, sizeof(u64), bad);
ceph_encode_64(&p, ac->global_id);
ceph_encode_32(&lenp, p - lenp - sizeof(u32));
- return p - buf;
+ ret = p - buf;
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
bad:
- return -ERANGE;
+ ret = -ERANGE;
+ goto out;
}
static int ceph_build_auth_request(struct ceph_auth_client *ac,
@@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac,
if (ret < 0) {
pr_err("error %d building auth method %s request\n", ret,
ac->ops->name);
- return ret;
+ goto out;
}
dout(" built request %d bytes\n", ret);
ceph_encode_32(&p, ret);
- return p + ret - msg_buf;
+ ret = p + ret - msg_buf;
+out:
+ return ret;
}
/*
@@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
int result_msg_len;
int ret = -EINVAL;
+ mutex_lock(&ac->mutex);
dout("handle_auth_reply %p %p\n", p, end);
ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
protocol = ceph_decode_32(&p);
@@ -227,33 +238,103 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ret = ac->ops->handle_reply(ac, result, payload, payload_end);
if (ret == -EAGAIN) {
- return ceph_build_auth_request(ac, reply_buf, reply_len);
+ ret = ceph_build_auth_request(ac, reply_buf, reply_len);
} else if (ret) {
pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
- return ret;
}
- return 0;
-bad:
- pr_err("failed to decode auth msg\n");
out:
+ mutex_unlock(&ac->mutex);
return ret;
+
+bad:
+ pr_err("failed to decode auth msg\n");
+ ret = -EINVAL;
+ goto out;
}
int ceph_build_auth(struct ceph_auth_client *ac,
void *msg_buf, size_t msg_len)
{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
if (!ac->protocol)
- return ceph_auth_build_hello(ac, msg_buf, msg_len);
- BUG_ON(!ac->ops);
- if (ac->ops->should_authenticate(ac))
- return ceph_build_auth_request(ac, msg_buf, msg_len);
- return 0;
+ ret = ceph_auth_build_hello(ac, msg_buf, msg_len);
+ else if (ac->ops->should_authenticate(ac))
+ ret = ceph_build_auth_request(ac, msg_buf, msg_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
}
int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
{
- if (!ac->ops)
- return 0;
- return ac->ops->is_authenticated(ac);
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops)
+ ret = ac->ops->is_authenticated(ac);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_is_authenticated);
+
+int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
+ int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->create_authorizer)
+ ret = ac->ops->create_authorizer(ac, peer_type, auth);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_create_authorizer);
+
+void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->destroy_authorizer)
+ ac->ops->destroy_authorizer(ac, a);
+ mutex_unlock(&ac->mutex);
+}
+EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
+
+int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
+ int peer_type,
+ struct ceph_auth_handshake *a)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->update_authorizer)
+ ret = ac->ops->update_authorizer(ac, peer_type, a);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_update_authorizer);
+
+int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->verify_authorizer_reply)
+ ret = ac->ops->verify_authorizer_reply(ac, a, len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply);
+
+void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
+{
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, peer_type);
+ mutex_unlock(&ac->mutex);
}
+EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a16bf14eb02..96238ba95f2 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -298,6 +298,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
return -ENOMEM;
}
au->service = th->service;
+ au->secret_id = th->secret_id;
msg_a = au->buf->vec.iov_base;
msg_a->struct_v = 1;
@@ -555,6 +556,26 @@ static int ceph_x_create_authorizer(
return 0;
}
+static int ceph_x_update_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ struct ceph_x_authorizer *au;
+ struct ceph_x_ticket_handler *th;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ au = (struct ceph_x_authorizer *)auth->authorizer;
+ if (au->secret_id < th->secret_id) {
+ dout("ceph_x_update_authorizer service %u secret %llu < %llu\n",
+ au->service, au->secret_id, th->secret_id);
+ return ceph_x_build_authorizer(ac, th, au);
+ }
+ return 0;
+}
+
static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
struct ceph_authorizer *a, size_t len)
{
@@ -630,7 +651,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
th = get_ticket_handler(ac, peer_type);
if (!IS_ERR(th))
- remove_ticket_handler(ac, th);
+ memset(&th->validity, 0, sizeof(th->validity));
}
@@ -641,6 +662,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = {
.build_request = ceph_x_build_request,
.handle_reply = ceph_x_handle_reply,
.create_authorizer = ceph_x_create_authorizer,
+ .update_authorizer = ceph_x_update_authorizer,
.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
.destroy_authorizer = ceph_x_destroy_authorizer,
.invalidate_authorizer = ceph_x_invalidate_authorizer,
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index f459e93b774..c5a058da7ac 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -29,6 +29,7 @@ struct ceph_x_authorizer {
struct ceph_buffer *buf;
unsigned int service;
u64 nonce;
+ u64 secret_id;
char reply_buf[128]; /* big enough for encrypted blob */
};
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index e65e6e4be38..34b11ee8124 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -606,11 +606,17 @@ static int __init init_ceph_lib(void)
if (ret < 0)
goto out_crypto;
+ ret = ceph_osdc_setup();
+ if (ret < 0)
+ goto out_msgr;
+
pr_info("loaded (mon/osd proto %d/%d)\n",
CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
return 0;
+out_msgr:
+ ceph_msgr_exit();
out_crypto:
ceph_crypto_shutdown();
out_debugfs:
@@ -622,6 +628,7 @@ out:
static void __exit exit_ceph_lib(void)
{
dout("exit_ceph_lib\n");
+ ceph_osdc_cleanup();
ceph_msgr_exit();
ceph_crypto_shutdown();
ceph_debugfs_cleanup();
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 00d051f4894..83661cdc076 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -123,8 +123,8 @@ static int osdc_show(struct seq_file *s, void *pp)
mutex_lock(&osdc->request_mutex);
for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
struct ceph_osd_request *req;
+ unsigned int i;
int opcode;
- int i;
req = rb_entry(p, struct ceph_osd_request, r_node);
@@ -142,7 +142,7 @@ static int osdc_show(struct seq_file *s, void *pp)
seq_printf(s, "\t");
for (i = 0; i < req->r_num_ops; i++) {
- opcode = le16_to_cpu(req->r_request_ops[i].op);
+ opcode = req->r_ops[i].op;
seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 2c0669fb54e..eb0a46a49bd 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -21,6 +21,9 @@
#include <linux/ceph/pagelist.h>
#include <linux/export.h>
+#define list_entry_next(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
/*
* Ceph uses the messenger to exchange ceph_msg messages with other
* hosts in the system. The messenger provides ordered and reliable
@@ -149,6 +152,11 @@ static bool con_flag_test_and_set(struct ceph_connection *con,
return test_and_set_bit(con_flag, &con->flags);
}
+/* Slab caches for frequently-allocated structures */
+
+static struct kmem_cache *ceph_msg_cache;
+static struct kmem_cache *ceph_msg_data_cache;
+
/* static tag bytes (protocol control messages) */
static char tag_msg = CEPH_MSGR_TAG_MSG;
static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -223,6 +231,41 @@ static void encode_my_addr(struct ceph_messenger *msgr)
*/
static struct workqueue_struct *ceph_msgr_wq;
+static int ceph_msgr_slab_init(void)
+{
+ BUG_ON(ceph_msg_cache);
+ ceph_msg_cache = kmem_cache_create("ceph_msg",
+ sizeof (struct ceph_msg),
+ __alignof__(struct ceph_msg), 0, NULL);
+
+ if (!ceph_msg_cache)
+ return -ENOMEM;
+
+ BUG_ON(ceph_msg_data_cache);
+ ceph_msg_data_cache = kmem_cache_create("ceph_msg_data",
+ sizeof (struct ceph_msg_data),
+ __alignof__(struct ceph_msg_data),
+ 0, NULL);
+ if (ceph_msg_data_cache)
+ return 0;
+
+ kmem_cache_destroy(ceph_msg_cache);
+ ceph_msg_cache = NULL;
+
+ return -ENOMEM;
+}
+
+static void ceph_msgr_slab_exit(void)
+{
+ BUG_ON(!ceph_msg_data_cache);
+ kmem_cache_destroy(ceph_msg_data_cache);
+ ceph_msg_data_cache = NULL;
+
+ BUG_ON(!ceph_msg_cache);
+ kmem_cache_destroy(ceph_msg_cache);
+ ceph_msg_cache = NULL;
+}
+
static void _ceph_msgr_exit(void)
{
if (ceph_msgr_wq) {
@@ -230,6 +273,8 @@ static void _ceph_msgr_exit(void)
ceph_msgr_wq = NULL;
}
+ ceph_msgr_slab_exit();
+
BUG_ON(zero_page == NULL);
kunmap(zero_page);
page_cache_release(zero_page);
@@ -242,6 +287,9 @@ int ceph_msgr_init(void)
zero_page = ZERO_PAGE(0);
page_cache_get(zero_page);
+ if (ceph_msgr_slab_init())
+ return -ENOMEM;
+
ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
if (ceph_msgr_wq)
return 0;
@@ -471,6 +519,22 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
return r;
}
+static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
+ int page_offset, size_t length)
+{
+ void *kaddr;
+ int ret;
+
+ BUG_ON(page_offset + length > PAGE_SIZE);
+
+ kaddr = kmap(page);
+ BUG_ON(!kaddr);
+ ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length);
+ kunmap(page);
+
+ return ret;
+}
+
/*
* write something. @more is true if caller will be sending more data
* shortly.
@@ -493,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
}
static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, int more)
+ int offset, size_t size, bool more)
{
int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
int ret;
@@ -697,50 +761,397 @@ static void con_out_kvec_add(struct ceph_connection *con,
}
#ifdef CONFIG_BLOCK
-static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+
+/*
+ * For a bio data item, a piece is whatever remains of the next
+ * entry in the current bio iovec, or the first entry in the next
+ * bio in the list.
+ */
+static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
{
- if (!bio) {
- *iter = NULL;
- *seg = 0;
- return;
+ struct ceph_msg_data *data = cursor->data;
+ struct bio *bio;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_BIO);
+
+ bio = data->bio;
+ BUG_ON(!bio);
+ BUG_ON(!bio->bi_vcnt);
+
+ cursor->resid = min(length, data->bio_length);
+ cursor->bio = bio;
+ cursor->vector_index = 0;
+ cursor->vector_offset = 0;
+ cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
+}
+
+static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset,
+ size_t *length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct bio *bio;
+ struct bio_vec *bio_vec;
+ unsigned int index;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_BIO);
+
+ bio = cursor->bio;
+ BUG_ON(!bio);
+
+ index = cursor->vector_index;
+ BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+
+ bio_vec = &bio->bi_io_vec[index];
+ BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
+ *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
+ BUG_ON(*page_offset >= PAGE_SIZE);
+ if (cursor->last_piece) /* pagelist offset is always 0 */
+ *length = cursor->resid;
+ else
+ *length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
+ BUG_ON(*length > cursor->resid);
+ BUG_ON(*page_offset + *length > PAGE_SIZE);
+
+ return bio_vec->bv_page;
+}
+
+static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ struct bio *bio;
+ struct bio_vec *bio_vec;
+ unsigned int index;
+
+ BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
+
+ bio = cursor->bio;
+ BUG_ON(!bio);
+
+ index = cursor->vector_index;
+ BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+ bio_vec = &bio->bi_io_vec[index];
+
+ /* Advance the cursor offset */
+
+ BUG_ON(cursor->resid < bytes);
+ cursor->resid -= bytes;
+ cursor->vector_offset += bytes;
+ if (cursor->vector_offset < bio_vec->bv_len)
+ return false; /* more bytes to process in this segment */
+ BUG_ON(cursor->vector_offset != bio_vec->bv_len);
+
+ /* Move on to the next segment, and possibly the next bio */
+
+ if (++index == (unsigned int) bio->bi_vcnt) {
+ bio = bio->bi_next;
+ index = 0;
}
- *iter = bio;
- *seg = bio->bi_idx;
+ cursor->bio = bio;
+ cursor->vector_index = index;
+ cursor->vector_offset = 0;
+
+ if (!cursor->last_piece) {
+ BUG_ON(!cursor->resid);
+ BUG_ON(!bio);
+ /* A short read is OK, so use <= rather than == */
+ if (cursor->resid <= bio->bi_io_vec[index].bv_len)
+ cursor->last_piece = true;
+ }
+
+ return true;
}
+#endif /* CONFIG_BLOCK */
-static void iter_bio_next(struct bio **bio_iter, int *seg)
+/*
+ * For a page array, a piece comes from the first page in the array
+ * that has not already been fully consumed.
+ */
+static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
{
- if (*bio_iter == NULL)
- return;
+ struct ceph_msg_data *data = cursor->data;
+ int page_count;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
- BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+ BUG_ON(!data->pages);
+ BUG_ON(!data->length);
- (*seg)++;
- if (*seg == (*bio_iter)->bi_vcnt)
- init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+ cursor->resid = min(length, data->length);
+ page_count = calc_pages_for(data->alignment, (u64)data->length);
+ cursor->page_offset = data->alignment & ~PAGE_MASK;
+ cursor->page_index = 0;
+ BUG_ON(page_count > (int)USHRT_MAX);
+ cursor->page_count = (unsigned short)page_count;
+ BUG_ON(length > SIZE_MAX - cursor->page_offset);
+ cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE;
}
-#endif
-static void prepare_write_message_data(struct ceph_connection *con)
+static struct page *
+ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
{
- struct ceph_msg *msg = con->out_msg;
+ struct ceph_msg_data *data = cursor->data;
- BUG_ON(!msg);
- BUG_ON(!msg->hdr.data_len);
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
+
+ BUG_ON(cursor->page_index >= cursor->page_count);
+ BUG_ON(cursor->page_offset >= PAGE_SIZE);
+
+ *page_offset = cursor->page_offset;
+ if (cursor->last_piece)
+ *length = cursor->resid;
+ else
+ *length = PAGE_SIZE - *page_offset;
+
+ return data->pages[cursor->page_index];
+}
+
+static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES);
+
+ BUG_ON(cursor->page_offset + bytes > PAGE_SIZE);
+
+ /* Advance the cursor page offset */
+
+ cursor->resid -= bytes;
+ cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK;
+ if (!bytes || cursor->page_offset)
+ return false; /* more bytes to process in the current page */
+
+ /* Move on to the next page; offset is already at 0 */
+
+ BUG_ON(cursor->page_index >= cursor->page_count);
+ cursor->page_index++;
+ cursor->last_piece = cursor->resid <= PAGE_SIZE;
+
+ return true;
+}
+
+/*
+ * For a pagelist, a piece is whatever remains to be consumed in the
+ * first page in the list, or the front of the next page.
+ */
+static void
+ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+ struct page *page;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ if (!length)
+ return; /* pagelist can be assigned but empty */
+
+ BUG_ON(list_empty(&pagelist->head));
+ page = list_first_entry(&pagelist->head, struct page, lru);
+
+ cursor->resid = min(length, pagelist->length);
+ cursor->page = page;
+ cursor->offset = 0;
+ cursor->last_piece = cursor->resid <= PAGE_SIZE;
+}
+
+static struct page *
+ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
- /* initialize page iterator */
- con->out_msg_pos.page = 0;
- if (msg->pages)
- con->out_msg_pos.page_pos = msg->page_alignment;
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ BUG_ON(!cursor->page);
+ BUG_ON(cursor->offset + cursor->resid != pagelist->length);
+
+ /* offset of first page in pagelist is always 0 */
+ *page_offset = cursor->offset & ~PAGE_MASK;
+ if (cursor->last_piece)
+ *length = cursor->resid;
else
- con->out_msg_pos.page_pos = 0;
+ *length = PAGE_SIZE - *page_offset;
+
+ return cursor->page;
+}
+
+static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ BUG_ON(cursor->offset + cursor->resid != pagelist->length);
+ BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);
+
+ /* Advance the cursor offset */
+
+ cursor->resid -= bytes;
+ cursor->offset += bytes;
+ /* offset of first page in pagelist is always 0 */
+ if (!bytes || cursor->offset & ~PAGE_MASK)
+ return false; /* more bytes to process in the current page */
+
+ /* Move on to the next page */
+
+ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
+ cursor->page = list_entry_next(cursor->page, lru);
+ cursor->last_piece = cursor->resid <= PAGE_SIZE;
+
+ return true;
+}
+
+/*
+ * Message data is handled (sent or received) in pieces, where each
+ * piece resides on a single page. The network layer might not
+ * consume an entire piece at once. A data item's cursor keeps
+ * track of which piece is next to process and how much remains to
+ * be processed in that piece. It also tracks whether the current
+ * piece is the last one in the data item.
+ */
+static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
+{
+ size_t length = cursor->total_resid;
+
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ ceph_msg_data_pagelist_cursor_init(cursor, length);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ ceph_msg_data_pages_cursor_init(cursor, length);
+ break;
#ifdef CONFIG_BLOCK
- if (msg->bio)
- init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
-#endif
- con->out_msg_pos.data_pos = 0;
- con->out_msg_pos.did_page_crc = false;
- con->out_more = 1; /* data + footer will follow */
+ case CEPH_MSG_DATA_BIO:
+ ceph_msg_data_bio_cursor_init(cursor, length);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_NONE:
+ default:
+ /* BUG(); */
+ break;
+ }
+ cursor->need_crc = true;
+}
+
+static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
+{
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ struct ceph_msg_data *data;
+
+ BUG_ON(!length);
+ BUG_ON(length > msg->data_length);
+ BUG_ON(list_empty(&msg->data));
+
+ cursor->data_head = &msg->data;
+ cursor->total_resid = length;
+ data = list_first_entry(&msg->data, struct ceph_msg_data, links);
+ cursor->data = data;
+
+ __ceph_msg_data_cursor_init(cursor);
+}
+
+/*
+ * Return the page containing the next piece to process for a given
+ * data item, and supply the page offset and length of that piece.
+ * Indicate whether this is the last piece in this data item.
+ */
+static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length,
+ bool *last_piece)
+{
+ struct page *page;
+
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ page = ceph_msg_data_pagelist_next(cursor, page_offset, length);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ page = ceph_msg_data_pages_next(cursor, page_offset, length);
+ break;
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+ page = ceph_msg_data_bio_next(cursor, page_offset, length);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_NONE:
+ default:
+ page = NULL;
+ break;
+ }
+ BUG_ON(!page);
+ BUG_ON(*page_offset + *length > PAGE_SIZE);
+ BUG_ON(!*length);
+ if (last_piece)
+ *last_piece = cursor->last_piece;
+
+ return page;
+}
+
+/*
+ * Returns true if the result moves the cursor on to the next piece
+ * of the data item.
+ */
+static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ bool new_piece;
+
+ BUG_ON(bytes > cursor->resid);
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ new_piece = ceph_msg_data_pagelist_advance(cursor, bytes);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ new_piece = ceph_msg_data_pages_advance(cursor, bytes);
+ break;
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+ new_piece = ceph_msg_data_bio_advance(cursor, bytes);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_NONE:
+ default:
+ BUG();
+ break;
+ }
+ cursor->total_resid -= bytes;
+
+ if (!cursor->resid && cursor->total_resid) {
+ WARN_ON(!cursor->last_piece);
+ BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
+ cursor->data = list_entry_next(cursor->data, links);
+ __ceph_msg_data_cursor_init(cursor);
+ new_piece = true;
+ }
+ cursor->need_crc = new_piece;
+
+ return new_piece;
+}
+
+static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
+{
+ BUG_ON(!msg);
+ BUG_ON(!data_len);
+
+ /* Initialize data cursor */
+
+ ceph_msg_data_cursor_init(msg, (size_t)data_len);
}
/*
@@ -803,16 +1214,12 @@ static void prepare_write_message(struct ceph_connection *con)
m->hdr.seq = cpu_to_le64(++con->out_seq);
m->needs_out_seq = false;
}
-#ifdef CONFIG_BLOCK
- else
- m->bio_iter = NULL;
-#endif
+ WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
- dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
m, con->out_seq, le16_to_cpu(m->hdr.type),
le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
- le32_to_cpu(m->hdr.data_len),
- m->nr_pages);
+ m->data_length);
BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
/* tag + hdr + front + middle */
@@ -843,11 +1250,13 @@ static void prepare_write_message(struct ceph_connection *con)
/* is there a data payload? */
con->out_msg->footer.data_crc = 0;
- if (m->hdr.data_len)
- prepare_write_message_data(con);
- else
+ if (m->data_length) {
+ prepare_message_data(con->out_msg, m->data_length);
+ con->out_more = 1; /* data + footer will follow */
+ } else {
/* no, queue up footer too and be done */
prepare_write_message_footer(con);
+ }
con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
@@ -874,6 +1283,24 @@ static void prepare_write_ack(struct ceph_connection *con)
}
/*
+ * Prepare to share the seq during handshake
+ */
+static void prepare_write_seq(struct ceph_connection *con)
+{
+ dout("prepare_write_seq %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof (con->out_temp_ack),
+ &con->out_temp_ack);
+
+ con_flag_set(con, CON_FLAG_WRITE_PENDING);
+}
+
+/*
* Prepare to write keepalive byte.
*/
static void prepare_write_keepalive(struct ceph_connection *con)
@@ -1022,35 +1449,19 @@ out:
return ret; /* done! */
}
-static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
- size_t len, size_t sent, bool in_trail)
+static u32 ceph_crc32c_page(u32 crc, struct page *page,
+ unsigned int page_offset,
+ unsigned int length)
{
- struct ceph_msg *msg = con->out_msg;
+ char *kaddr;
- BUG_ON(!msg);
- BUG_ON(!sent);
-
- con->out_msg_pos.data_pos += sent;
- con->out_msg_pos.page_pos += sent;
- if (sent < len)
- return;
+ kaddr = kmap(page);
+ BUG_ON(kaddr == NULL);
+ crc = crc32c(crc, kaddr + page_offset, length);
+ kunmap(page);
- BUG_ON(sent != len);
- con->out_msg_pos.page_pos = 0;
- con->out_msg_pos.page++;
- con->out_msg_pos.did_page_crc = false;
- if (in_trail)
- list_move_tail(&page->lru,
- &msg->trail->head);
- else if (msg->pagelist)
- list_move_tail(&page->lru,
- &msg->pagelist->head);
-#ifdef CONFIG_BLOCK
- else if (msg->bio)
- iter_bio_next(&msg->bio_iter, &msg->bio_seg);
-#endif
+ return crc;
}
-
/*
* Write as much message data payload as we can. If we finish, queue
* up the footer.
@@ -1058,21 +1469,17 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
* 0 -> socket full, but more to do
* <0 -> error
*/
-static int write_partial_msg_pages(struct ceph_connection *con)
+static int write_partial_message_data(struct ceph_connection *con)
{
struct ceph_msg *msg = con->out_msg;
- unsigned int data_len = le32_to_cpu(msg->hdr.data_len);
- size_t len;
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
bool do_datacrc = !con->msgr->nocrc;
- int ret;
- int total_max_write;
- bool in_trail = false;
- const size_t trail_len = (msg->trail ? msg->trail->length : 0);
- const size_t trail_off = data_len - trail_len;
+ u32 crc;
- dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
- con, msg, con->out_msg_pos.page, msg->nr_pages,
- con->out_msg_pos.page_pos);
+ dout("%s %p msg %p\n", __func__, con, msg);
+
+ if (list_empty(&msg->data))
+ return -EINVAL;
/*
* Iterate through each page that contains data to be
@@ -1082,72 +1489,41 @@ static int write_partial_msg_pages(struct ceph_connection *con)
* need to map the page. If we have no pages, they have
* been revoked, so use the zero page.
*/
- while (data_len > con->out_msg_pos.data_pos) {
- struct page *page = NULL;
- int max_write = PAGE_SIZE;
- int bio_offset = 0;
-