diff options
author | Yehuda Sadeh <yehuda@hq.newdream.net> | 2010-04-06 15:14:15 -0700 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2010-10-20 15:37:28 -0700 |
commit | 3d14c5d2b6e15c21d8e5467dc62d33127c23a644 (patch) | |
tree | 7d123c47847df9d1e865b6b78dc7da3fe739b704 /fs/ceph/messenger.c | |
parent | ae1533b62b3369e6ae32338f4a77d64d0e88f676 (diff) |
ceph: factor out libceph from Ceph file system
This factors out protocol and low-level storage parts of ceph into a
separate libceph module living in net/ceph and include/linux/ceph. This
is mostly a matter of moving files around. However, a few key pieces
of the interface change as well:
- ceph_client becomes ceph_fs_client and ceph_client, where the latter
captures the mon and osd clients, and the fs_client gets the mds client
and file system specific pieces.
- Mount option parsing and debugfs setup are correspondingly broken into
two pieces.
- The mon client gets a generic handler callback for otherwise unknown
messages (mds map, in this case).
- The basic supported/required feature bits can be expanded (and are by
ceph_fs_client).
No functional change, aside from some subtle error handling cases that got
cleaned up in the refactoring process.
Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs/ceph/messenger.c')
-rw-r--r-- | fs/ceph/messenger.c | 2432 |
1 files changed, 0 insertions, 2432 deletions
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c deleted file mode 100644 index 17a09b32a59..00000000000 --- a/fs/ceph/messenger.c +++ /dev/null @@ -1,2432 +0,0 @@ -#include "ceph_debug.h" - -#include <linux/crc32c.h> -#include <linux/ctype.h> -#include <linux/highmem.h> -#include <linux/inet.h> -#include <linux/kthread.h> -#include <linux/net.h> -#include <linux/slab.h> -#include <linux/socket.h> -#include <linux/string.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <net/tcp.h> - -#include "super.h" -#include "messenger.h" -#include "decode.h" -#include "pagelist.h" - -/* - * Ceph uses the messenger to exchange ceph_msg messages with other - * hosts in the system. The messenger provides ordered and reliable - * delivery. We tolerate TCP disconnects by reconnecting (with - * exponential backoff) in the case of a fault (disconnection, bad - * crc, protocol error). Acks allow sent messages to be discarded by - * the sender. - */ - -/* static tag bytes (protocol control messages) */ -static char tag_msg = CEPH_MSGR_TAG_MSG; -static char tag_ack = CEPH_MSGR_TAG_ACK; -static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; - -#ifdef CONFIG_LOCKDEP -static struct lock_class_key socket_class; -#endif - - -static void queue_con(struct ceph_connection *con); -static void con_work(struct work_struct *); -static void ceph_fault(struct ceph_connection *con); - -/* - * nicely render a sockaddr as a string. 
- */ -#define MAX_ADDR_STR 20 -#define MAX_ADDR_STR_LEN 60 -static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; -static DEFINE_SPINLOCK(addr_str_lock); -static int last_addr_str; - -const char *pr_addr(const struct sockaddr_storage *ss) -{ - int i; - char *s; - struct sockaddr_in *in4 = (void *)ss; - struct sockaddr_in6 *in6 = (void *)ss; - - spin_lock(&addr_str_lock); - i = last_addr_str++; - if (last_addr_str == MAX_ADDR_STR) - last_addr_str = 0; - spin_unlock(&addr_str_lock); - s = addr_str[i]; - - switch (ss->ss_family) { - case AF_INET: - snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, - (unsigned int)ntohs(in4->sin_port)); - break; - - case AF_INET6: - snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, - (unsigned int)ntohs(in6->sin6_port)); - break; - - default: - sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); - } - - return s; -} - -static void encode_my_addr(struct ceph_messenger *msgr) -{ - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_addr(&msgr->my_enc_addr); -} - -/* - * work queue for all reading and writing to/from the socket. 
- */ -struct workqueue_struct *ceph_msgr_wq; - -int __init ceph_msgr_init(void) -{ - ceph_msgr_wq = create_workqueue("ceph-msgr"); - if (IS_ERR(ceph_msgr_wq)) { - int ret = PTR_ERR(ceph_msgr_wq); - pr_err("msgr_init failed to create workqueue: %d\n", ret); - ceph_msgr_wq = NULL; - return ret; - } - return 0; -} - -void ceph_msgr_exit(void) -{ - destroy_workqueue(ceph_msgr_wq); -} - -void ceph_msgr_flush(void) -{ - flush_workqueue(ceph_msgr_wq); -} - - -/* - * socket callback functions - */ - -/* data available on socket, or listen socket received a connect */ -static void ceph_data_ready(struct sock *sk, int count_unused) -{ - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; - if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("ceph_data_ready on %p state = %lu, queueing work\n", - con, con->state); - queue_con(con); - } -} - -/* socket has buffer space for writing */ -static void ceph_write_space(struct sock *sk) -{ - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; - - /* only queue to workqueue if there is data we want to write. 
*/ - if (test_bit(WRITE_PENDING, &con->state)) { - dout("ceph_write_space %p queueing write work\n", con); - queue_con(con); - } else { - dout("ceph_write_space %p nothing to write\n", con); - } - - /* since we have our own write_space, clear the SOCK_NOSPACE flag */ - clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -} - -/* socket's state has changed */ -static void ceph_state_change(struct sock *sk) -{ - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; - - dout("ceph_state_change %p state = %lu sk_state = %u\n", - con, con->state, sk->sk_state); - - if (test_bit(CLOSED, &con->state)) - return; - - switch (sk->sk_state) { - case TCP_CLOSE: - dout("ceph_state_change TCP_CLOSE\n"); - case TCP_CLOSE_WAIT: - dout("ceph_state_change TCP_CLOSE_WAIT\n"); - if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { - if (test_bit(CONNECTING, &con->state)) - con->error_msg = "connection failed"; - else - con->error_msg = "socket closed"; - queue_con(con); - } - break; - case TCP_ESTABLISHED: - dout("ceph_state_change TCP_ESTABLISHED\n"); - queue_con(con); - break; - } -} - -/* - * set up socket callbacks - */ -static void set_sock_callbacks(struct socket *sock, - struct ceph_connection *con) -{ - struct sock *sk = sock->sk; - sk->sk_user_data = (void *)con; - sk->sk_data_ready = ceph_data_ready; - sk->sk_write_space = ceph_write_space; - sk->sk_state_change = ceph_state_change; -} - - -/* - * socket helpers - */ - -/* - * initiate connection to a remote socket. 
- */ -static struct socket *ceph_tcp_connect(struct ceph_connection *con) -{ - struct sockaddr_storage *paddr = &con->peer_addr.in_addr; - struct socket *sock; - int ret; - - BUG_ON(con->sock); - ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); - if (ret) - return ERR_PTR(ret); - con->sock = sock; - sock->sk->sk_allocation = GFP_NOFS; - -#ifdef CONFIG_LOCKDEP - lockdep_set_class(&sock->sk->sk_lock, &socket_class); -#endif - - set_sock_callbacks(sock, con); - - dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); - - ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), - O_NONBLOCK); - if (ret == -EINPROGRESS) { - dout("connect %s EINPROGRESS sk_state = %u\n", - pr_addr(&con->peer_addr.in_addr), - sock->sk->sk_state); - ret = 0; - } - if (ret < 0) { - pr_err("connect %s error %d\n", - pr_addr(&con->peer_addr.in_addr), ret); - sock_release(sock); - con->sock = NULL; - con->error_msg = "connect error"; - } - - if (ret < 0) - return ERR_PTR(ret); - return sock; -} - -static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - - return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); -} - -/* - * write something. @more is true if caller will be sending more data - * shortly. - */ -static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, int more) -{ - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - - if (more) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - - return kernel_sendmsg(sock, &msg, iov, kvlen, len); -} - - -/* - * Shutdown/close the socket for the given connection. 
- */ -static int con_close_socket(struct ceph_connection *con) -{ - int rc; - - dout("con_close_socket on %p sock %p\n", con, con->sock); - if (!con->sock) - return 0; - set_bit(SOCK_CLOSED, &con->state); - rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); - sock_release(con->sock); - con->sock = NULL; - clear_bit(SOCK_CLOSED, &con->state); - return rc; -} - -/* - * Reset a connection. Discard all incoming and outgoing messages - * and clear *_seq state. - */ -static void ceph_msg_remove(struct ceph_msg *msg) -{ - list_del_init(&msg->list_head); - ceph_msg_put(msg); -} -static void ceph_msg_remove_list(struct list_head *head) -{ - while (!list_empty(head)) { - struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, - list_head); - ceph_msg_remove(msg); - } -} - -static void reset_connection(struct ceph_connection *con) -{ - /* reset connection, out_queue, msg_ and connect_seq */ - /* discard existing out_queue and msg_seq */ - ceph_msg_remove_list(&con->out_queue); - ceph_msg_remove_list(&con->out_sent); - - if (con->in_msg) { - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - - con->connect_seq = 0; - con->out_seq = 0; - if (con->out_msg) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } - con->out_keepalive_pending = false; - con->in_seq = 0; - con->in_seq_acked = 0; -} - -/* - * mark a peer down. drop any open connections. 
- */ -void ceph_con_close(struct ceph_connection *con) -{ - dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ - clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ - clear_bit(KEEPALIVE_PENDING, &con->state); - clear_bit(WRITE_PENDING, &con->state); - mutex_lock(&con->mutex); - reset_connection(con); - con->peer_global_seq = 0; - cancel_delayed_work(&con->work); - mutex_unlock(&con->mutex); - queue_con(con); -} - -/* - * Reopen a closed connection, with a new peer address. - */ -void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) -{ - dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); - set_bit(OPENING, &con->state); - clear_bit(CLOSED, &con->state); - memcpy(&con->peer_addr, addr, sizeof(*addr)); - con->delay = 0; /* reset backoff memory */ - queue_con(con); -} - -/* - * return true if this connection ever successfully opened - */ -bool ceph_con_opened(struct ceph_connection *con) -{ - return con->connect_seq > 0; -} - -/* - * generic get/put - */ -struct ceph_connection *ceph_con_get(struct ceph_connection *con) -{ - dout("con_get %p nref = %d -> %d\n", con, - atomic_read(&con->nref), atomic_read(&con->nref) + 1); - if (atomic_inc_not_zero(&con->nref)) - return con; - return NULL; -} - -void ceph_con_put(struct ceph_connection *con) -{ - dout("con_put %p nref = %d -> %d\n", con, - atomic_read(&con->nref), atomic_read(&con->nref) - 1); - BUG_ON(atomic_read(&con->nref) == 0); - if (atomic_dec_and_test(&con->nref)) { - BUG_ON(con->sock); - kfree(con); - } -} - -/* - * initialize a new connection. 
- */ -void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) -{ - dout("con_init %p\n", con); - memset(con, 0, sizeof(*con)); - atomic_set(&con->nref, 1); - con->msgr = msgr; - mutex_init(&con->mutex); - INIT_LIST_HEAD(&con->out_queue); - INIT_LIST_HEAD(&con->out_sent); - INIT_DELAYED_WORK(&con->work, con_work); -} - - -/* - * We maintain a global counter to order connection attempts. Get - * a unique seq greater than @gt. - */ -static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) -{ - u32 ret; - - spin_lock(&msgr->global_seq_lock); - if (msgr->global_seq < gt) - msgr->global_seq = gt; - ret = ++msgr->global_seq; - spin_unlock(&msgr->global_seq_lock); - return ret; -} - - -/* - * Prepare footer for currently outgoing message, and finish things - * off. Assumes out_kvec* are already valid.. we just add on to the end. - */ -static void prepare_write_message_footer(struct ceph_connection *con, int v) -{ - struct ceph_msg *m = con->out_msg; - - dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; - con->out_kvec[v].iov_base = &m->footer; - con->out_kvec[v].iov_len = sizeof(m->footer); - con->out_kvec_bytes += sizeof(m->footer); - con->out_kvec_left++; - con->out_more = m->more_to_follow; - con->out_msg_done = true; -} - -/* - * Prepare headers for the next outgoing message. - */ -static void prepare_write_message(struct ceph_connection *con) -{ - struct ceph_msg *m; - int v = 0; - - con->out_kvec_bytes = 0; - con->out_kvec_is_msg = true; - con->out_msg_done = false; - - /* Sneak an ack in there first? If we can get it into the same - * TCP packet that's a good thing. 
*/ - if (con->in_seq > con->in_seq_acked) { - con->in_seq_acked = con->in_seq; - con->out_kvec[v].iov_base = &tag_ack; - con->out_kvec[v++].iov_len = 1; - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con->out_kvec[v].iov_base = &con->out_temp_ack; - con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); - con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); - } - - m = list_first_entry(&con->out_queue, - struct ceph_msg, list_head); - con->out_msg = m; - if (test_bit(LOSSYTX, &con->state)) { - list_del_init(&m->list_head); - } else { - /* put message on sent list */ - ceph_msg_get(m); - list_move_tail(&m->list_head, &con->out_sent); - } - - /* - * only assign outgoing seq # if we haven't sent this message - * yet. if it is requeued, resend with it's original seq. - */ - if (m->needs_out_seq) { - m->hdr.seq = cpu_to_le64(++con->out_seq); - m->needs_out_seq = false; - } - - dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", - m, con->out_seq, le16_to_cpu(m->hdr.type), - le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len), - m->nr_pages); - BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); - - /* tag + hdr + front + middle */ - con->out_kvec[v].iov_base = &tag_msg; - con->out_kvec[v++].iov_len = 1; - con->out_kvec[v].iov_base = &m->hdr; - con->out_kvec[v++].iov_len = sizeof(m->hdr); - con->out_kvec[v++] = m->front; - if (m->middle) - con->out_kvec[v++] = m->middle->vec; - con->out_kvec_left = v; - con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + - (m->middle ? 
m->middle->vec.iov_len : 0); - con->out_kvec_cur = con->out_kvec; - - /* fill in crc (except data pages), footer */ - con->out_msg->hdr.crc = - cpu_to_le32(crc32c(0, (void *)&m->hdr, - sizeof(m->hdr) - sizeof(m->hdr.crc))); - con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; - con->out_msg->footer.front_crc = - cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); - if (m->middle) - con->out_msg->footer.middle_crc = - cpu_to_le32(crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len)); - else - con->out_msg->footer.middle_crc = 0; - con->out_msg->footer.data_crc = 0; - dout("prepare_write_message front_crc %u data_crc %u\n", - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - - /* is there a data payload? */ - if (le32_to_cpu(m->hdr.data_len) > 0) { - /* initialize page iterator */ - con->out_msg_pos.page = 0; - if (m->pages) - con->out_msg_pos.page_pos = - le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; - else - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = 0; - con->out_more = 1; /* data + footer will follow */ - } else { - /* no, queue up footer too and be done */ - prepare_write_message_footer(con, v); - } - - set_bit(WRITE_PENDING, &con->state); -} - -/* - * Prepare an ack. - */ -static void prepare_write_ack(struct ceph_connection *con) -{ - dout("prepare_write_ack %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con->out_kvec[0].iov_base = &tag_ack; - con->out_kvec[0].iov_len = 1; - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con->out_kvec[1].iov_base = &con->out_temp_ack; - con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); - con->out_kvec_left = 2; - con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); - con->out_kvec_cur = con->out_kvec; - con->out_more = 1; /* more will follow.. eventually.. */ - set_bit(WRITE_PENDING, &con->state); -} - -/* - * Prepare to write keepalive byte. 
- */ -static void prepare_write_keepalive(struct ceph_connection *con) -{ - dout("prepare_write_keepalive %p\n", con); - con->out_kvec[0].iov_base = &tag_keepalive; - con->out_kvec[0].iov_len = 1; - con->out_kvec_left = 1; - con->out_kvec_bytes = 1; - con->out_kvec_cur = con->out_kvec; - set_bit(WRITE_PENDING, &con->state); -} - -/* - * Connection negotiation. - */ - -static void prepare_connect_authorizer(struct ceph_connection *con) -{ - void *auth_buf; - int auth_len = 0; - int auth_protocol = 0; - - mutex_unlock(&con->mutex); - if (con->ops->get_authorizer) - con->ops->get_authorizer(con, &auth_buf, &auth_len, - &auth_protocol, &con->auth_reply_buf, - &con->auth_reply_buf_len, - con->auth_retry); - mutex_lock(&con->mutex); - - con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); - con->out_connect.authorizer_len = cpu_to_le32(auth_len); - - con->out_kvec[con->out_kvec_left].iov_base = auth_buf; - con->out_kvec[con->out_kvec_left].iov_len = auth_len; - con->out_kvec_left++; - con->out_kvec_bytes += auth_len; -} - -/* - * We connected to a peer and are saying hello. 
- */ -static void prepare_write_banner(struct ceph_messenger *msgr, - struct ceph_connection *con) -{ - int len = strlen(CEPH_BANNER); - - con->out_kvec[0].iov_base = CEPH_BANNER; - con->out_kvec[0].iov_len = len; - con->out_kvec[1].iov_base = &msgr->my_enc_addr; - con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); - con->out_kvec_left = 2; - con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); - con->out_kvec_cur = con->out_kvec; - con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); -} - -static void prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con, - int after_banner) -{ - unsigned global_seq = get_global_seq(con->msgr, 0); - int proto; - - switch (con->peer_name.type) { - case CEPH_ENTITY_TYPE_MON: - proto = CEPH_MONC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_OSD: - proto = CEPH_OSDC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_MDS: - proto = CEPH_MDSC_PROTOCOL; - break; - default: - BUG(); - } - - dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); - - con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; - - if (!after_banner) { - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - } - con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; - con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); - con->out_kvec_left++; - con->out_kvec_bytes += sizeof(con->out_connect); - con->out_kvec_cur = con->out_kvec; - con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); - - prepare_connect_authorizer(con); -} - - -/* - * write as much of pending kvecs to the socket as we can. 
- * 1 -> done - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_kvec(struct ceph_connection *con) -{ - int ret; - - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); - if (ret <= 0) - goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) - break; /* done */ - while (ret > 0) { - if (ret >= con->out_kvec_cur->iov_len) { - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } else { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - ret = 0; - break; - } - } - } - con->out_kvec_left = 0; - con->out_kvec_is_msg = false; - ret = 1; -out: - dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); - return ret; /* done! */ -} - -#ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) -{ - if (!bio) { - *iter = NULL; - *seg = 0; - return; - } - *iter = bio; - *seg = bio->bi_idx; -} - -static void iter_bio_next(struct bio **bio_iter, int *seg) -{ - if (*bio_iter == NULL) - return; - - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); - - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); -} -#endif - -/* - * Write as much message data payload as we can. If we finish, queue - * up the footer. - * 1 -> done, footer is now queued in out_kvec[]. - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_msg_pages(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - unsigned data_len = le32_to_cpu(msg->hdr.data_len); - size_t len; - int crc = con->msgr->nocrc; - int ret; - int total_max_write; - int in_trail = 0; - size_t trail_len = (msg->trail ? 
msg->trail->length : 0); - - dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, - con->out_msg_pos.page_pos); - -#ifdef CONFIG_BLOCK - if (msg->bio && !msg->bio_iter) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif - - while (data_len > con->out_msg_pos.data_pos) { - struct page *page = NULL; - void *kaddr = NULL; - int max_write = PAGE_SIZE; - int page_shift = 0; - - total_max_write = data_len - trail_len - - con->out_msg_pos.data_pos; - - /* - * if we are calculating the data crc (the default), we need - * to map the page. if our pages[] has been revoked, use the - * zero page. - */ - - /* have we reached the trail part of the data? */ - if (con->out_msg_pos.data_pos >= data_len - trail_len) { - in_trail = 1; - - total_max_write = data_len - con->out_msg_pos.data_pos; - - page = list_first_entry(&msg->trail->head, - struct page, lru); - if (crc) - kaddr = kmap(page); - max_write = PAGE_SIZE; - } else if (msg->pages) { - page = msg->pages[con->out_msg_pos.page]; - if (crc) - kaddr = kmap(page); - } else if (msg->pagelist) { - page = list_first_entry(&msg->pagelist->head, - struct page, lru); - if (crc) - kaddr = kmap(page); -#ifdef CONFIG_BLOCK - } else if (msg->bio) { - struct bio_vec *bv; - - bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); - page = bv->bv_page; - page_shift = bv->bv_offset; - if (crc) - kaddr = kmap(page) + page_shift; - max_write = bv->bv_len; -#endif - } else { - page = con->msgr->zero_page; - if (crc) - kaddr = page_address(con->msgr->zero_page); - } - len = min_t(int, max_write - con->out_msg_pos.page_pos, - total_max_write); - - if (crc && !con->out_msg_pos.did_page_crc) { - void *base = kaddr + con->out_msg_pos.page_pos; - u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); - - BUG_ON(kaddr == NULL); - con->out_msg->footer.data_crc = - cpu_to_le32(crc32c(tmpcrc, base, len)); - con->out_msg_pos.did_page_crc = 1; - } - ret = 
kernel_sendpage(con->sock, page, - con->out_msg_pos.page_pos + page_shift, - len, - MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_MORE); - - if (crc && - (msg->pages || msg->pagelist || msg->bio || in_trail)) - kunmap(page); - - if (ret <= 0) - goto out; - - con->out_msg_pos.data_pos += ret; - con->out_msg_pos.page_pos += ret; - if (ret == len) { - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = 0; - if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); - else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); -#ifdef CONFIG_BLOCK - else if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif - } - } - - dout("write_partial_msg_pages %p msg %p done\n", con, msg); - - /* prepare and queue up footer, too */ - if (!crc) - con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con->out_kvec_bytes = 0; - con->out_kvec_left = 0; - con->out_kvec_cur = con->out_kvec; - prepare_write_message_footer(con, 0); - ret = 1; -out: - return ret; -} - -/* - * write some zeros - */ -static int write_partial_skip(struct ceph_connection *con) -{ - int ret; - - while (con->out_skip > 0) { - struct kvec iov = { - .iov_base = page_address(con->msgr->zero_page), - .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) - }; - - ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); - if (ret <= 0) - goto out; - con->out_skip -= ret; - } - ret = 1; -out: - return ret; -} - -/* - * Prepare to read connection handshake, or an ack. 
- */ -static void prepare_read_banner(struct ceph_connection *con) -{ - dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_connect(struct ceph_connection *con) -{ - dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_ack(struct ceph_connection *con) -{ - dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_tag(struct ceph_connection *con) -{ - dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; -} - -/* - * Prepare to read a message. - */ -static int prepare_read_message(struct ceph_connection *con) -{ - dout("prepare_read_message %p\n", con); - BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; - con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; - return 0; -} - - -static int read_partial(struct ceph_connection *con, - int *to, int size, void *object) -{ - *to += size; - while (con->in_base_pos < *to) { - int left = *to - con->in_base_pos; - int have = size - left; - int ret = ceph_tcp_recvmsg(con->sock, object + have, left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - return 1; -} - - -/* - * Read all or part of the connect-side handshake on a new connection - */ -static int read_partial_banner(struct ceph_connection *con) -{ - int ret, to = 0; - - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); - - /* peer's banner */ - ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); - if (ret <= 0) - goto out; - ret = read_partial(con, &to, sizeof(con->actual_peer_addr), - &con->actual_peer_addr); - if (ret <= 0) - goto out; - ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), - &con->peer_addr_for_me); - if (ret <= 0) - goto out; -out: - return ret; -} - -static int read_partial_connect(struct ceph_connection *con) -{ - int ret, to = 0; - - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - - ret = read_partial(con, &to, 
sizeof(con->in_reply), &con->in_reply); - if (ret <= 0) - goto out; - ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), - con->auth_reply_buf); - if (ret <= 0) - goto out; - - dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); -out: - return ret; - -} - -/* - * Verify the hello banner looks okay. - */ -static int verify_hello(struct ceph_connection *con) -{ - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to %s got bad banner\n", - pr_addr(&con->peer_addr.in_addr)); - con->error_msg = "protocol error, bad banner"; - return -1; - } - return 0; -} - -static bool addr_is_blank(struct sockaddr_storage *ss) -{ - switch (ss->ss_family) { - case AF_INET: - return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0; - case AF_INET6: - return - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0; - } - return false; -} - -static int addr_port(struct sockaddr_storage *ss) -{ - switch (ss->ss_family) { - case AF_INET: - return ntohs(((struct sockaddr_in *)ss)->sin_port); - case AF_INET6: - return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); - } - return 0; -} - -static void addr_set_port(struct sockaddr_storage *ss, int p) -{ - switch (ss->ss_family) { - case AF_INET: - ((struct sockaddr_in *)ss)->sin_port = htons(p); - case AF_INET6: - ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); - } -} - -/* - * Parse an ip[:port] list into an addr array. Use the default - * monitor port if a port isn't specified. 
- */ -int ceph_parse_ips(const char *c, const char *end, - struct ceph_entity_addr *addr, - int max_count, int *count) -{ - int i; - const char *p = c; - - dout("parse_ips on '%.*s'\n", (int)(end-c), c); - for (i = 0; i < max_count; i++) { - const char *ipend; - struct sockaddr_storage *ss = &addr[i].in_addr; - struct sockaddr_in *in4 = (void *)ss; - struct sockaddr_in6 *in6 = (void *)ss; - int port; - char delim = ','; - - if (*p == '[') { - delim = ']'; - p++; - } - - memset(ss, 0, sizeof(*ss)); - if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, - delim, &ipend)) - ss->ss_family = AF_INET; - else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, - delim, &ipend)) - ss->ss_family = AF_INET6; - else - goto bad; - p = ipend; - - if (delim == ']') { - if (*p != ']') { - dout("missing matching ']'\n"); - goto bad; - } - p++; - } - - /* port? */ - if (p < end && *p == ':') { - port = 0; - p++; - while (p < end && *p >= '0' && *p <= '9') { - port = (port * 10) + (*p - '0'); - p++; - } - if (port > 65535 || port == 0) - goto bad; - } else { - port = CEPH_MON_PORT; - } - - addr_set_port(ss, port); - - dout("parse_ips got %s\n", pr_addr(ss)); - - if (p == end) - break; - if (*p != ',') - goto bad; - p++; - } - - if (p != end) - goto bad; - - if (count) - *count = i + 1; - return 0; - -bad: - pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); - return -EINVAL; -} - -static int process_banner(struct ceph_connection *con) -{ - dout("process_banner on %p\n", con); - - if (verify_hello(con) < 0) - return -1; - - ceph_decode_addr(&con->actual_peer_addr); - ceph_decode_addr(&con->peer_addr_for_me); - - /* - * Make sure the other end is who we wanted. note that the other - * end may not yet know their ip address, so if it's 0.0.0.0, give - * them the benefit of the doubt. 
- */ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, - sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr.in_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warning("wrong peer, want %s/%d, got %s/%d\n", - pr |