aboutsummaryrefslogtreecommitdiff
path: root/fs/ocfs2/cluster
diff options
context:
space:
mode:
authorZach Brown <zach.brown@oracle.com>2005-12-15 14:31:23 -0800
committerJoel Becker <joel.becker@oracle.com>2006-01-03 11:45:46 -0800
commit98211489d4147e41b11703e4245846d60b3acce4 (patch)
treef3c9c6b8df5bb001db79bc6314d8cbb5e127b45b /fs/ocfs2/cluster
parenta7f6a5fb4bde142b622706e2006ba33f793e13ed (diff)
[PATCH] OCFS2: The Second Oracle Cluster Filesystem
Node messaging via tcp. Used by the dlm and the file system for point to point communication between nodes. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Diffstat (limited to 'fs/ocfs2/cluster')
-rw-r--r--fs/ocfs2/cluster/quorum.c315
-rw-r--r--fs/ocfs2/cluster/quorum.h36
-rw-r--r--fs/ocfs2/cluster/sys.c124
-rw-r--r--fs/ocfs2/cluster/sys.h33
-rw-r--r--fs/ocfs2/cluster/tcp.c1829
-rw-r--r--fs/ocfs2/cluster/tcp.h113
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h174
7 files changed, 2624 insertions, 0 deletions
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
new file mode 100644
index 00000000000..7bba98fbfc1
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.c
@@ -0,0 +1,315 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* This quorum hack is only here until we transition to some more rational
+ * approach that is driven from userspace. Honest. No foolin'.
+ *
+ * Imagine two nodes lose network connectivity to each other but they're still
+ * up and operating in every other way. Presumably a network timeout indicates
+ * that a node is broken and should be recovered. They can't both recover each
+ * other and both carry on without serialising their access to the file system.
+ * They need to decide who is authoritative. Now extend that problem to
+ * arbitrary groups of nodes losing connectivity between each other.
+ *
+ * So we declare that a node which has given up on connecting to a majority
+ * of nodes who are still heartbeating will fence itself.
+ *
+ * There are huge opportunities for races here. After we give up on a node's
+ * connection we need to wait long enough to give heartbeat an opportunity
+ * to declare the node as truly dead. We also need to be careful with the
+ * race between when we see a node start heartbeating and when we connect
+ * to it.
+ *
+ * So nodes that are in this transtion put a hold on the quorum decision
+ * with a counter. As they fall out of this transition they drop the count
+ * and if they're the last, they fire off the decision.
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_QUORUM
+#include "masklog.h"
+#include "quorum.h"
+
+static struct o2quo_state {
+ spinlock_t qs_lock;
+ struct work_struct qs_work;
+ int qs_pending;
+ int qs_heartbeating;
+ unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int qs_connected;
+ unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int qs_holds;
+ unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+} o2quo_state;
+
+/* this is horribly heavy-handed. It should instead flip the file
+ * system RO and call some userspace script. */
+static void o2quo_fence_self(void)
+{
+ /* panic spins with interrupts enabled. with preempt
+ * threads can still schedule, etc, etc */
+ o2hb_stop_all_regions();
+ panic("ocfs2 is very sorry to be fencing this system by panicing\n");
+}
+
+/* Indicate that a timeout occured on a hearbeat region write. The
+ * other nodes in the cluster may consider us dead at that time so we
+ * want to "fence" ourselves so that we don't scribble on the disk
+ * after they think they've recovered us. This can't solve all
+ * problems related to writeout after recovery but this hack can at
+ * least close some of those gaps. When we have real fencing, this can
+ * go away as our node would be fenced externally before other nodes
+ * begin recovery. */
+void o2quo_disk_timeout(void)
+{
+ o2quo_fence_self();
+}
+
+static void o2quo_make_decision(void *arg)
+{
+ int quorum;
+ int lowest_hb, lowest_reachable = 0, fence = 0;
+ struct o2quo_state *qs = &o2quo_state;
+
+ spin_lock(&qs->qs_lock);
+
+ lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
+ if (lowest_hb != O2NM_MAX_NODES)
+ lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
+
+ mlog(0, "heartbeating: %d, connected: %d, "
+ "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
+ qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
+
+ if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
+ qs->qs_heartbeating == 1)
+ goto out;
+
+ if (qs->qs_heartbeating & 1) {
+ /* the odd numbered cluster case is straight forward --
+ * if we can't talk to the majority we're hosed */
+ quorum = (qs->qs_heartbeating + 1)/2;
+ if (qs->qs_connected < quorum) {
+ mlog(ML_ERROR, "fencing this node because it is "
+ "only connected to %u nodes and %u is needed "
+ "to make a quorum out of %u heartbeating nodes\n",
+ qs->qs_connected, quorum,
+ qs->qs_heartbeating);
+ fence = 1;
+ }
+ } else {
+ /* the even numbered cluster adds the possibility of each half
+ * of the cluster being able to talk amongst themselves.. in
+ * that case we're hosed if we can't talk to the group that has
+ * the lowest numbered node */
+ quorum = qs->qs_heartbeating / 2;
+ if (qs->qs_connected < quorum) {
+ mlog(ML_ERROR, "fencing this node because it is "
+ "only connected to %u nodes and %u is needed "
+ "to make a quorum out of %u heartbeating nodes\n",
+ qs->qs_connected, quorum,
+ qs->qs_heartbeating);
+ fence = 1;
+ }
+ else if ((qs->qs_connected == quorum) &&
+ !lowest_reachable) {
+ mlog(ML_ERROR, "fencing this node because it is "
+ "connected to a half-quorum of %u out of %u "
+ "nodes which doesn't include the lowest active "
+ "node %u\n", quorum, qs->qs_heartbeating,
+ lowest_hb);
+ fence = 1;
+ }
+ }
+
+out:
+ spin_unlock(&qs->qs_lock);
+ if (fence)
+ o2quo_fence_self();
+}
+
+static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
+{
+ assert_spin_locked(&qs->qs_lock);
+
+ if (!test_and_set_bit(node, qs->qs_hold_bm)) {
+ qs->qs_holds++;
+ mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
+ "node %u\n", node);
+ mlog(0, "node %u, %d total\n", node, qs->qs_holds);
+ }
+}
+
+static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
+{
+ assert_spin_locked(&qs->qs_lock);
+
+ if (test_and_clear_bit(node, qs->qs_hold_bm)) {
+ mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
+ if (--qs->qs_holds == 0) {
+ if (qs->qs_pending) {
+ qs->qs_pending = 0;
+ schedule_work(&qs->qs_work);
+ }
+ }
+ mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
+ node, qs->qs_holds);
+ }
+}
+
+/* as a node comes up we delay the quorum decision until we know the fate of
+ * the connection. the hold will be droped in conn_up or hb_down. it might be
+ * perpetuated by con_err until hb_down. if we already have a conn, we might
+ * be dropping a hold that conn_up got. */
+void o2quo_hb_up(u8 node)
+{
+ struct o2quo_state *qs = &o2quo_state;
+
+ spin_lock(&qs->qs_lock);
+
+ qs->qs_heartbeating++;
+ mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
+ "node %u\n", node);
+ mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+ set_bit(node, qs->qs_hb_bm);
+
+ mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+ if (!test_bit(node, qs->qs_conn_bm))
+ o2quo_set_hold(qs, node);
+ else
+ o2quo_clear_hold(qs, node);
+
+ spin_unlock(&qs->qs_lock);
+}
+
+/* hb going down releases any holds we might have had due to this node from
+ * conn_up, conn_err, or hb_up */
+void o2quo_hb_down(u8 node)
+{
+ struct o2quo_state *qs = &o2quo_state;
+
+ spin_lock(&qs->qs_lock);
+
+ qs->qs_heartbeating--;
+ mlog_bug_on_msg(qs->qs_heartbeating < 0,
+ "node %u, %d heartbeating\n",
+ node, qs->qs_heartbeating);
+ mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+ clear_bit(node, qs->qs_hb_bm);
+
+ mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+ o2quo_clear_hold(qs, node);
+
+ spin_unlock(&qs->qs_lock);
+}
+
+/* this tells us that we've decided that the node is still heartbeating
+ * even though we've lost it's conn. it must only be called after conn_err
+ * and indicates that we must now make a quorum decision in the future,
+ * though we might be doing so after waiting for holds to drain. Here
+ * we'll be dropping the hold from conn_err. */
+void o2quo_hb_still_up(u8 node)
+{
+ struct o2quo_state *qs = &o2quo_state;
+
+ spin_lock(&qs->qs_lock);
+
+ mlog(0, "node %u\n", node);
+
+ qs->qs_pending = 1;
+ o2quo_clear_hold(qs, node);
+
+ spin_unlock(&qs->qs_lock);
+}
+
+/* This is analagous to hb_up. as a node's connection comes up we delay the
+ * quorum decision until we see it heartbeating. the hold will be droped in
+ * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
+ * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * */
+void o2quo_conn_up(u8 node)
+{
+ struct o2quo_state *qs = &o2quo_state;
+
+ spin_lock(&qs->qs_lock);
+
+ qs->qs_connected++;
+ mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
+ "node %u\n", node);
+ mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
+ set_bit(node, qs->qs_conn_bm);
+
+ mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+ if (!test_bit(node, qs->qs_hb_bm))
+ o2quo_set_hold(qs, node);
+ else
+ o2quo_clear_hold(qs, node);
+
+ spin_unlock(&qs->qs_lock);
+}
+
+/* we've decided that we won't ever be connecting to the node again. if it's
+ * still heartbeating we grab a hold that will delay decisions until either the
+ * node stops heartbeating from hb_down or the caller decides that the node is
+ * still up and calls still_up */
+void o2quo_conn_err(u8 node)
+{
+ struct o2quo_state *qs = &o2quo_state;
+
+ spin_lock(&qs->qs_lock);
+
+ if (test_bit(node, qs->qs_conn_bm)) {
+ qs->qs_connected--;
+ mlog_bug_on_msg(qs->qs_connected < 0,
+ "node %u, connected %d\n",
+ node, qs->qs_connected);
+
+ clear_bit(node, qs->qs_conn_bm);
+ }
+
+ mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+ if (test_bit(node, qs->qs_hb_bm))
+ o2quo_set_hold(qs, node);
+
+ spin_unlock(&qs->qs_lock);
+}
+
+void o2quo_init(void)
+{
+ struct o2quo_state *qs = &o2quo_state;
+
+ spin_lock_init(&qs->qs_lock);
+ INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
+}
+
+void o2quo_exit(void)
+{
+ flush_scheduled_work();
+}
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
new file mode 100644
index 00000000000..6649cc6f67c
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.h
@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_QUORUM_H
+#define O2CLUSTER_QUORUM_H
+
+void o2quo_init(void);
+void o2quo_exit(void);
+
+void o2quo_hb_up(u8 node);
+void o2quo_hb_down(u8 node);
+void o2quo_hb_still_up(u8 node);
+void o2quo_conn_up(u8 node);
+void o2quo_conn_err(u8 node);
+void o2quo_disk_timeout(void);
+
+#endif /* O2CLUSTER_QUORUM_H */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
new file mode 100644
index 00000000000..f1e99461bb0
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.c
@@ -0,0 +1,124 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.c
+ *
+ * OCFS2 cluster sysfs interface
+ *
+ * Copyright (C) 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include "ocfs2_nodemanager.h"
+#include "masklog.h"
+#include "sys.h"
+
+struct o2cb_attribute {
+ struct attribute attr;
+ ssize_t (*show)(char *buf);
+ ssize_t (*store)(const char *buf, size_t count);
+};
+
+#define O2CB_ATTR(_name, _mode, _show, _store) \
+struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
+#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
+
+static ssize_t o2cb_interface_revision_show(char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
+}
+
+O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
+
+static struct attribute *o2cb_attrs[] = {
+ &o2cb_attr_interface_revision.attr,
+ NULL,
+};
+
+static ssize_t
+o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
+static ssize_t
+o2cb_store(struct kobject * kobj, struct attribute * attr,
+ const char * buffer, size_t count);
+static struct sysfs_ops o2cb_sysfs_ops = {
+ .show = o2cb_show,
+ .store = o2cb_store,
+};
+
+static struct kobj_type o2cb_subsys_type = {
+ .default_attrs = o2cb_attrs,
+ .sysfs_ops = &o2cb_sysfs_ops,
+};
+
+/* gives us o2cb_subsys */
+decl_subsys(o2cb, NULL, NULL);
+
+static ssize_t
+o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
+{
+ struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
+ struct subsystem *sbs = to_o2cb_subsys(kobj);
+
+ BUG_ON(sbs != &o2cb_subsys);
+
+ if (o2cb_attr->show)
+ return o2cb_attr->show(buffer);
+ return -EIO;
+}
+
+static ssize_t
+o2cb_store(struct kobject * kobj, struct attribute * attr,
+ const char * buffer, size_t count)
+{
+ struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
+ struct subsystem *sbs = to_o2cb_subsys(kobj);
+
+ BUG_ON(sbs != &o2cb_subsys);
+
+ if (o2cb_attr->store)
+ return o2cb_attr->store(buffer, count);
+ return -EIO;
+}
+
+void o2cb_sys_shutdown(void)
+{
+ mlog_sys_shutdown();
+ subsystem_unregister(&o2cb_subsys);
+}
+
+int o2cb_sys_init(void)
+{
+ int ret;
+
+ o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
+ ret = subsystem_register(&o2cb_subsys);
+ if (ret)
+ return ret;
+
+ ret = mlog_sys_init(&o2cb_subsys);
+ if (ret)
+ subsystem_unregister(&o2cb_subsys);
+ return ret;
+}
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
new file mode 100644
index 00000000000..d66b8ab0045
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.h
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.h
+ *
+ * Function prototypes for o2cb sysfs interface
+ *
+ * Copyright (C) 2005 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_SYS_H
+#define O2CLUSTER_SYS_H
+
+void o2cb_sys_shutdown(void);
+int o2cb_sys_init(void);
+
+#endif /* O2CLUSTER_SYS_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
new file mode 100644
index 00000000000..35d92c01a97
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.c
@@ -0,0 +1,1829 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * ----
+ *
+ * Callers for this were originally written against a very simple synchronus
+ * API. This implementation reflects those simple callers. Some day I'm sure
+ * we'll need to move to a more robust posting/callback mechanism.
+ *
+ * Transmit calls pass in kernel virtual addresses and block copying this into
+ * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting
+ * for a failed socket to timeout. TX callers can also pass in a poniter to an
+ * 'int' which gets filled with an errno off the wire in response to the
+ * message they send.
+ *
+ * Handlers for unsolicited messages are registered. Each socket has a page
+ * that incoming data is copied into. First the header, then the data.
+ * Handlers are called from only one thread with a reference to this per-socket
+ * page. This page is destroyed after the handler call, so it can't be
+ * referenced beyond the call. Handlers may block but are discouraged from
+ * doing so.
+ *
+ * Any framing errors (bad magic, large payload lengths) close a connection.
+ *
+ * Our sock_container holds the state we associate with a socket. It's current
+ * framing state is held there as well as the refcounting we do around when it
+ * is safe to tear down the socket. The socket is only finally torn down from
+ * the container when the container loses all of its references -- so as long
+ * as you hold a ref on the container you can trust that the socket is valid
+ * for use with kernel socket APIs.
+ *
+ * Connections are initiated between a pair of nodes when the node with the
+ * higher node number gets a heartbeat callback which indicates that the lower
+ * numbered node has started heartbeating. The lower numbered node is passive
+ * and only accepts the connection if the higher numbered node is heartbeating.
+ */
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <net/tcp.h>
+
+#include <asm/uaccess.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+#include "quorum.h"
+
+#include "tcp_internal.h"
+
+/*
+ * The linux network stack isn't sparse endian clean.. It has macros like
+ * ntohs() which perform the endian checks and structs like sockaddr_in
+ * which aren't annotated. So __force is found here to get the build
+ * clean. When they emerge from the dark ages and annotate the code
+ * we can remove these.
+ */
+
+#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
+ NIPQUAD(sc->sc_node->nd_ipv4_address), \
+ ntohs(sc->sc_node->nd_ipv4_port)
+
+/*
+ * In the following two log macros, the whitespace after the ',' just
+ * before ##args is intentional. Otherwise, gcc 2.95 will eat the
+ * previous token if args expands to nothing.
+ */
+#define msglog(hdr, fmt, args...) do { \
+ typeof(hdr) __hdr = (hdr); \
+ mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \
+ "key %08x num %u] " fmt, \
+ be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \
+ be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \
+ be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \
+ be32_to_cpu(__hdr->msg_num) , ##args); \
+} while (0)
+
+#define sclog(sc, fmt, args...) do { \
+ typeof(sc) __sc = (sc); \
+ mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \
+ "pg_off %zu] " fmt, __sc, \
+ atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \
+ __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \
+ ##args); \
+} while (0)
+
+static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static struct rb_root o2net_handler_tree = RB_ROOT;
+
+static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
+
+/* XXX someday we'll need better accounting */
+static struct socket *o2net_listen_sock = NULL;
+
+/*
+ * listen work is only queued by the listening socket callbacks on the
+ * o2net_wq. teardown detaches the callbacks before destroying the workqueue.
+ * quorum work is queued as sock containers are shutdown.. stop_listening
+ * tears down all the node's sock containers, preventing future shutdowns
+ * and queued quroum work, before canceling delayed quorum work and
+ * destroying the work queue.
+ */
+static struct workqueue_struct *o2net_wq;
+static struct work_struct o2net_listen_work;
+
+static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
+#define O2NET_HB_PRI 0x1
+
+static struct o2net_handshake *o2net_hand;
+static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
+
+static int o2net_sys_err_translations[O2NET_ERR_MAX] =
+ {[O2NET_ERR_NONE] = 0,
+ [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT,
+ [O2NET_ERR_OVERFLOW] = -EOVERFLOW,
+ [O2NET_ERR_DIED] = -EHOSTDOWN,};
+
+/* can't quite avoid *all* internal declarations :/ */
+static void o2net_sc_connect_completed(void *arg);
+static void o2net_rx_until_empty(void *arg);
+static void o2net_shutdown_sc(void *arg);
+static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_sc_send_keep_req(void *arg);
+static void o2net_idle_timer(unsigned long data);
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
+
+static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
+{
+ int trans;
+ BUG_ON(err >= O2NET_ERR_MAX);
+ trans = o2net_sys_err_translations[err];
+
+ /* Just in case we mess up the translation table above */
+ BUG_ON(err != O2NET_ERR_NONE && trans == 0);
+ return trans;
+}
+
+static struct o2net_node * o2net_nn_from_num(u8 node_num)
+{
+ BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes));
+ return &o2net_nodes[node_num];
+}
+
+static u8 o2net_num_from_nn(struct o2net_node *nn)
+{
+ BUG_ON(nn == NULL);
+ return nn - o2net_nodes;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
+{
+ int ret = 0;
+
+ do {
+ if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
+ ret = -EAGAIN;
+ break;
+ }
+ spin_lock(&nn->nn_lock);
+ ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
+ if (ret == 0)
+ list_add_tail(&nsw->ns_node_item,
+ &nn->nn_status_list);
+ spin_unlock(&nn->nn_lock);
+ } while (ret == -EAGAIN);
+
+ if (ret == 0) {
+ init_waitqueue_head(&nsw->ns_wq);
+ nsw->ns_sys_status = O2NET_ERR_NONE;
+ nsw->ns_status = 0;
+ }
+
+ return ret;
+}
+
+static void o2net_complete_nsw_locked(struct o2net_node *nn,
+ struct o2net_status_wait *nsw,
+ enum o2net_system_error sys_status,
+ s32 status)
+{
+ assert_spin_locked(&nn->nn_lock);
+
+ if (!list_empty(&nsw->ns_node_item)) {
+ list_del_init(&nsw->ns_node_item);
+ nsw->ns_sys_status = sys_status;
+ nsw->ns_status = status;
+ idr_remove(&nn->nn_status_idr, nsw->ns_id);
+ wake_up(&nsw->ns_wq);
+ }
+}
+
+static void o2net_complete_nsw(struct o2net_node *nn,
+ struct o2net_status_wait *nsw,
+ u64 id, enum o2net_system_error sys_status,
+ s32 status)
+{
+ spin_lock(&nn->nn_lock);
+ if (nsw == NULL) {
+ if (id > INT_MAX)
+ goto out;
+
+ nsw = idr_find(&nn->nn_status_idr, id);
+ if (nsw == NULL)
+ goto out;
+ }
+
+ o2net_complete_nsw_locked(nn, nsw, sys_status, status);
+
+out:
+ spin_unlock(&nn->nn_lock);
+ return;
+}
+
+static void o2net_complete_nodes_nsw(struct o2net_node *nn)
+{
+ struct list_head *iter, *tmp;
+ unsigned int num_kills = 0;
+ struct o2net_status_wait *nsw;
+
+ assert_spin_locked(&nn->nn_lock);
+
+ list_for_each_safe(iter, tmp, &nn->nn_status_list) {
+ nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+ o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
+ num_kills++;
+ }
+
+ mlog(0, "completed %d messages for node %u\n", num_kills,
+ o2net_num_from_nn(nn));
+}
+
+static int o2net_nsw_completed(struct o2net_node *nn,
+ struct o2net_status_wait *nsw)
+{
+ int completed;
+ spin_lock(&nn->nn_lock);
+ completed = list_empty(&nsw->ns_node_item);
+ spin_unlock(&nn->nn_lock);
+ return completed;
+}
+
+/* ------------------------------------------------------------ */
+
+static void sc_kref_release(struct kref *kref)
+{
+ struct o2net_sock_container *sc = container_of(kref,
+ struct o2net_sock_container, sc_kref);
+ sclog(sc, "releasing\n");
+
+ if (sc->sc_sock) {
+ sock_release(sc->sc_sock);
+ sc->sc_sock = NULL;
+ }
+
+ o2nm_node_put(sc->sc_node);
+ sc->sc_node = NULL;
+
+ kfree(sc);
+}
+
+static void sc_put(struct o2net_sock_container *sc)
+{
+ sclog(sc, "put\n");
+ kref_put(&sc->sc_kref, sc_kref_release);
+}
+static void sc_get(struct o2net_sock_container *sc)
+{
+ sclog(sc, "get\n");
+ kref_get(&sc->sc_kref);
+}
+static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
+{
+ struct o2net_sock_container *sc, *ret = NULL;
+ struct page *page = NULL;
+
+ page = alloc_page(GFP_NOFS);
+ sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
+ if (sc == NULL || page == NULL)
+ goto out;
+
+ kref_init(&sc->sc_kref);
+ o2nm_node_get(node);
+ sc->sc_node = node;
+
+ INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
+ INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
+ INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
+ INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
+
+ init_timer(&sc->sc_idle_timeout);
+ sc->sc_idle_timeout.function = o2net_idle_timer;
+ sc->sc_idle_timeout.data = (unsigned long)sc;
+
+ sclog(sc, "alloced\n");
+
+ ret = sc;
+ sc->sc_page = page;
+ sc = NULL;
+ page = NULL;
+
+out:
+ if (page)
+ __free_page(page);
+ kfree(sc);
+
+ return ret;
+}
+
+/* ------------------------------------------------------------ */
+
+static void o2net_sc_queue_work(struct o2net_sock_container *sc,
+ struct work_struct *work)
+{
+ sc_get(sc);
+ if (!queue_work(o2net_wq, work))
+ sc_put(sc);
+}
+static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
+ struct work_struct *work,
+ int delay)
+{
+ sc_get(sc);
+ if (!queue_delayed_work(o2net_wq, work, delay))
+ sc_put(sc);
+}
+static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
+ struct work_struct *work)
+{
+ if (cancel_delayed_work(work))
+ sc_put(sc);
+}
+
+static void o2net_set_nn_state(struct o2net_node *nn,
+ struct o2net_sock_container *sc,
+ unsigned valid, int err)
+{
+ int was_valid = nn->nn_sc_valid;
+ int was_err = nn->nn_persistent_error;
+ struct o2net_sock_container *old_sc = nn->nn_sc;
+
+ assert_spin_locked(&nn->nn_lock);
+
+ /* the node num comparison and single connect/accept path should stop
+ * an non-null sc from being overwritten with another */
+ BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
+ mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
+ mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
+
+ /* we won't reconnect after our valid conn goes away for
+ * this hb iteration.. here so it shows up in the logs */
+ if (was_valid && !valid && err == 0)
+ err = -ENOTCONN;
+
+ mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
+ o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
+ nn->nn_persistent_error, err);
+
+ nn->nn_sc = sc;
+ nn->nn_sc_valid = valid ? 1 : 0;
+ nn->nn_persistent_error = err;
+
+ /* mirrors o2net_tx_can_proceed() */
+ if (nn->nn_persistent_error || nn->nn_sc_valid)
+ wake_up(&nn->nn_sc_wq);
+
+ if (!was_err && nn->nn_persistent_error) {
+ o2quo_conn_err(o2net_num_from_nn(nn));
+ queue_delayed_work(o2net_wq, &nn->nn_still_up,
+ msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+ }
+
+ if (was_valid && !valid) {
+ mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
+ SC_NODEF_ARGS(old_sc));
+ o2net_complete_nodes_nsw(nn);
+ }
+
+ if (!was_valid && valid) {
+ o2quo_conn_up(o2net_num_from_nn(nn));
+ /* this is a bit of a hack. we only try reconnecting
+ * when heartbeating starts until we get a connection.
+ * if that connection then dies we don't try reconnecting.
+ * the only way to start connecting again is to down
+ * heartbeat and bring it back up. */
+ cancel_delayed_work(&nn->nn_connect_expired);
+ mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n",
+ o2nm_this_node() > sc->sc_node->nd_num ?
+ "connected to" : "accepted connection from",
+ SC_NODEF_ARGS(sc));
+ }
+
+ /* trigger the connecting worker func as long as we're not valid,
+ * it will back off if it shouldn't connect. This can be called
+ * from node config teardown and so needs to be careful about
+ * the work queue actually being up. */
+ if (!valid && o2net_wq) {
+ unsigned long delay;
+ /* delay if we're withing a RECONNECT_DELAY of the
+ * last attempt */
+ delay = (nn->nn_last_connect_attempt +
+ msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+ - jiffies;
+ if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+ delay = 0;
+ mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
+ queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+ }
+
+ /* keep track of the nn's sc ref for the caller */
+ if ((old_sc == NULL) && sc)
+ sc_get(sc);
+ if (old_sc && (old_sc != sc)) {
+ o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
+ sc_put(old_sc);
+ }
+}
+
+/* see o2net_register_callbacks() */
+static void o2net_data_ready(struct sock *sk, int bytes)
+{
+ void (*ready)(struct sock *sk, int bytes);
+
+ read_lock(&sk->sk_callback_lock);
+ if (sk->sk_user_data) {
+ struct o2net_sock_container *sc = sk->sk_user_data;
+ sclog(sc, "data_ready hit\n");
+ do_gettimeofday(&sc->sc_tv_data_ready);
+ o2net_sc_queue_work(sc, &sc->sc_rx_work);
+ ready = sc->sc_data_ready;
+ } else {
+ ready = sk->sk_data_ready;
+ }
+ read_unlock(&sk->sk_callback_lock);
+
+ ready(sk, bytes);
+}
+
+/* see o2net_register_callbacks() */
+static void o2net_state_change(struct sock *sk)
+{
+ void (*state_change)(struct sock *sk);
+ struct o2net_sock_container *sc;
+
+ read_lock(&sk->sk_callback_lock);
+ sc = sk->sk_user_data;
+ if (sc == NULL) {
+ state_change = sk->sk_state_change;
+ goto out;
+ }
+
+ sclog(sc, "state_change to %d\n", sk->sk_state);
+
+ state_change = sc->sc_state_change;
+
+ switch(sk->sk_state) {
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ o2net_sc_queue_work(sc, &sc->sc_connect_work);
+ break;
+ default:
+ o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+ break;
+ }
+out:
+ read_unlock(&sk->sk_callback_lock);
+ state_change(sk);
+}
+
+/*
+ * we register callbacks so we can queue work on events before calling
+ * th