[NETFILTER]: Add nf_conntrack subsystem.

The existing connection tracking subsystem in netfilter can only handle ipv4. There were basically two choices present to add connection tracking support for ipv6. We could either duplicate all of the ipv4 connection tracking code into an ipv6 counterpart, or (the choice taken by these patches) we could design a generic layer that could handle both ipv4 and ipv6 and thus require only one sub-protocol (TCP, UDP, etc.) connection tracking helper module to be written. In fact nf_conntrack is capable of working with any layer 3 protocol. The existing ipv4 specific conntrack code could also not deal with the peculiarities of doing connection tracking on ipv6, which is also cured here. For example, these issues include: 1) ICMPv6 handling, which is used for neighbour discovery in ipv6; thus some messages such as these should not participate in connection tracking since effectively they are like ARP messages. 2) Fragmentation must be handled differently in ipv6, because the simplistic "defrag, connection track and NAT, refrag" (which the existing ipv4 connection tracking does) approach simply isn't feasible in ipv6. 3) ipv6 extension header parsing must occur at the correct spots before and after connection tracking decisions, and there were no provisions for this in the existing connection tracking design. 4) ipv6 has no need for stateful NAT. The ipv4 specific conntrack layer is kept around, until all of the ipv4 specific conntrack helpers are ported over to nf_conntrack and it is feature complete. Once that occurs, the old conntrack stuff will get placed into the feature-removal-schedule and we will fully kill it off 6 months later. Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> Signed-off-by: Harald Welte <laforge@netfilter.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
author: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> 2005-11-09 16:38:16 -0800
committer: David S. Miller <davem@davemloft.net> 2005-11-09 16:38:16 -0800
commit: 9fb9cbb1082d6b31fb45aa1a14432449a0df6cf1 (patch)
tree: c964a62bdd766eca436c30f51a9e33e2b798b0a6 /net/netfilter
parent: 6730c3c14421b7c924d06e31bb66e0adad225547 (diff)
10 files changed, 5418 insertions, 0 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8296b38bf27..a84f9221e5f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,3 +1,6 @@
+menu "Core Netfilter Configuration"
+	depends on NET && NETFILTER
+
 config NETFILTER_NETLINK
        tristate "Netfilter netlink interface"
        help
@@ -22,3 +25,74 @@ config NETFILTER_NETLINK_LOG
 	  and is also scheduled to replace the old syslog-based ipt_LOG
 	  and ip6t_LOG modules.
 
+config NF_CONNTRACK
+	tristate "Layer 3 Independent Connection tracking (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && IP_NF_CONNTRACK=n
+	default n
+	---help---
+	  Connection tracking keeps a record of what packets have passed
+	  through your machine, in order to figure out how they are related
+	  into connections.
+
+	  Layer 3 independent connection tracking is an experimental scheme
+	  which generalizes ip_conntrack to support other layer 3 protocols.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CT_ACCT
+	bool "Connection tracking flow accounting"
+	depends on NF_CONNTRACK
+	help
+	  If this option is enabled, the connection tracking code will
+	  keep per-flow packet and byte counters.
+
+	  Those counters can be used for flow-based accounting or the
+	  `connbytes' match.
+
+	  If unsure, say `N'.
+
+config NF_CONNTRACK_MARK
+	bool  'Connection mark tracking support'
+	depends on NF_CONNTRACK
+	help
+	  This option enables support for connection marks, used by the
+	  `CONNMARK' target and `connmark' match. Similar to the mark value
+	  of packets, but this mark value is kept in the conntrack session
+	  instead of the individual packets.
+
+config NF_CONNTRACK_EVENTS
+	bool "Connection tracking events"
+	depends on NF_CONNTRACK
+	help
+	  If this option is enabled, the connection tracking code will
+	  provide a notifier chain that can be used by other kernel code
+	  to get notified about changes in the connection tracking state.
+
+	  If unsure, say `N'.
+
+config NF_CT_PROTO_SCTP
+	tristate 'SCTP protocol on new connection tracking support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL && NF_CONNTRACK
+	default n
+	help
+	  With this option enabled, the layer 3 independent connection
+	  tracking code will be able to do state tracking on SCTP connections.
+
+	  If you want to compile it as a module, say M here and read
+	  Documentation/modules.txt.  If unsure, say `N'.
+
+config NF_CONNTRACK_FTP
+	tristate "FTP support on new connection tracking (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && NF_CONNTRACK
+	help
+	  Tracking FTP connections is problematic: special helpers are
+	  required for tracking them, and doing masquerading and other forms
+	  of Network Address Translation on them.
+
+	  This is FTP support on Layer 3 independent connection tracking.
+	  Layer 3 independent connection tracking is an experimental scheme
+	  which generalizes ip_conntrack to support other layer 3 protocols.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+endmenu
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b3b44f8b415..55f019ad2c0 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,3 +5,11 @@ obj-$(CONFIG_NETFILTER) = netfilter.o
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
 obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
+
+nf_conntrack-objs	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o
+
+obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
+obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
+
+# SCTP protocol connection tracking
+obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
new file mode 100644
index 00000000000..9a67c796b38
--- /dev/null
+++ b/net/netfilter/nf_conntrack_core.c
@@ -0,0 +1,1538 @@
+/* Connection state tracking for netfilter.  This is separated from,
+   but required by, the NAT layer; it can also be used by an iptables
+   extension. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
+ *	- new API and handling of conntrack/nat helpers
+ *	- now capable of multiple expectations for one master
+ * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
+ *	- add usage/reference counts to ip_conntrack_expect
+ *	- export ip_conntrack[_expect]_{find_get,put} functions
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *	- generalize L3 protocol dependent part.
+ * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *	- add support for various sizes of conntrack structures.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_core.c
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+
+/* This rwlock protects the main hash table, protocol/helper/expected
+   registrations, conntrack timers*/
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#define NF_CONNTRACK_VERSION	"0.4.1"
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DEFINE_RWLOCK(nf_conntrack_lock);
+
+/* nf_conntrack_standalone needs this */
+atomic_t nf_conntrack_count = ATOMIC_INIT(0);
+
+void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
+LIST_HEAD(nf_conntrack_expect_list);
+struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
+struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
+static LIST_HEAD(helpers);
+unsigned int nf_conntrack_htable_size = 0;
+int nf_conntrack_max;
+struct list_head *nf_conntrack_hash;
+static kmem_cache_t *nf_conntrack_expect_cachep;
+struct nf_conn nf_conntrack_untracked;
+unsigned int nf_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int nf_conntrack_vmalloc;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+struct notifier_block *nf_conntrack_chain;
+struct notifier_block *nf_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
+
+/* Deliver the events cached in @ecache and clear the cache entry.
+ *
+ * The events are passed to nf_conntrack_chain only if the cached conntrack
+ * is confirmed, is not dying, and at least one event is pending.  In every
+ * case the event mask is reset, the reference on the conntrack is dropped
+ * with nf_ct_put() and the entry's ct pointer is cleared.
+ *
+ * Must be called with locally disabled softirqs. */
+static inline void
+__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
+{
+	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+	if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
+	    && ecache->events)
+		notifier_call_chain(&nf_conntrack_chain, ecache->events,
+				    ecache->ct);
+
+	ecache->events = 0;
+	nf_ct_put(ecache->ct);
+	ecache->ct = NULL;
+}
+
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling for freeing the skb.
+ *
+ * Only the current CPU's cache entry is examined, and nothing happens
+ * unless that entry currently refers to @ct.  Softirqs are disabled
+ * around the lookup and delivery. */
+void nf_ct_deliver_cached_events(const struct nf_conn *ct)
+{
+	struct nf_conntrack_ecache *ecache;
+
+	local_bh_disable();
+	ecache = &__get_cpu_var(nf_conntrack_ecache);
+	if (ecache->ct == ct)
+		__nf_ct_deliver_cached_events(ecache);
+	local_bh_enable();
+}
+
+/* Point this CPU's event cache at @ct.
+ *
+ * If the cache still holds events for another (older) conntrack, those are
+ * delivered first.  The cache entry then takes its own reference on @ct.
+ * Calling this while the cache already refers to @ct triggers BUG_ON(). */
+void __nf_ct_event_cache_init(struct nf_conn *ct)
+{
+	struct nf_conntrack_ecache *ecache;
+	
+	/* take care of delivering potentially old events */
+	ecache = &__get_cpu_var(nf_conntrack_ecache);
+	BUG_ON(ecache->ct == ct);
+	if (ecache->ct)
+		__nf_ct_deliver_cached_events(ecache);
+	/* initialize for this conntrack/packet */
+	ecache->ct = ct;
+	nf_conntrack_get(&ct->ct_general);
+}
+
+/* Flush the event cache: for every CPU, drop the conntrack reference held
+ * by the cache entry and reset the entry.  Pending events are discarded,
+ * not delivered.
+ *
+ * Touches other CPUs' data and must not be called while packets are still
+ * passing through the code. */
+static void nf_ct_event_cache_flush(void)
+{
+	struct nf_conntrack_ecache *ecache;
+	int cpu;
+
+	for_each_cpu(cpu) {
+		ecache = &per_cpu(nf_conntrack_ecache, cpu);
+		if (ecache->ct) {
+			nf_ct_put(ecache->ct);
+			/* The put may have released the last reference, so
+			 * do not leave a stale pointer or stale event bits
+			 * behind in the per-CPU entry. */
+			ecache->ct = NULL;
+			ecache->events = 0;
+		}
+	}
+}
+#else
+static inline void nf_ct_event_cache_flush(void) {}
+#endif /* CONFIG_NF_CONNTRACK_EVENTS */
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
+EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
+
+/*
+ * This scheme offers various sizes of "struct nf_conn", depending on
+ * the features (helper, nat, ...) a conntrack needs.
+ */
+
+#define NF_CT_FEATURES_NAMELEN	256
+static struct {
+	/* name of slab cache. printed in /proc/slabinfo.  Heap-allocated by
+	 * nf_conntrack_register_cache() and freed on unregistration. */
+	char *name;
+
+	/* size of slab cache objects; also the number of bytes zeroed when
+	 * a conntrack is allocated from this cache */
+	size_t size;
+
+	/* slab cache pointer */
+	kmem_cache_t *cachep;
+
+	/* number of registrations sharing this slab cache: set to 1 when
+	 * the cache is created, incremented for each matching re-register;
+	 * 0 means the slot is unused */
+	int use;
+
+	/* Optional initialization hook run on each freshly allocated
+	 * conntrack; may be NULL.  A negative return aborts the allocation. */
+	int (*init_conntrack)(struct nf_conn *, u_int32_t);
+
+} nf_ct_cache[NF_CT_F_NUM];
+
+/* protect members of nf_ct_cache except of "use" */
+DEFINE_RWLOCK(nf_ct_cache_lock);
+
+/* This avoids calling kmem_cache_create() with same name simultaneously */
+DECLARE_MUTEX(nf_ct_cache_mutex);
+
+extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
+/* Look up the layer 4 protocol tracker registered for (@l3proto, @protocol).
+ * When no per-protocol table exists for @l3proto, the generic protocol
+ * tracker is returned instead. */
+struct nf_conntrack_protocol *
+nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
+{
+	struct nf_conntrack_protocol **l4protos = nf_ct_protos[l3proto];
+
+	if (likely(l4protos != NULL))
+		return l4protos[protocol];
+
+	return &nf_conntrack_generic_protocol;
+}
+
+static int nf_conntrack_hash_rnd_initted;
+static unsigned int nf_conntrack_hash_rnd;
+
+/* Compute the hash bucket for @tuple in a table of @size buckets.
+ *
+ * The source address is hashed with (l3num << 16 | protonum) as initial
+ * value, the destination address with (src.u.all << 16 | dst.u.all); the
+ * two results are mixed together with the random seed @rnd and reduced
+ * modulo @size.  @size must be non-zero. */
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+				  unsigned int size, unsigned int rnd)
+{
+	unsigned int a, b;
+	a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
+		  ((tuple->src.l3num) << 16) | tuple->dst.protonum);
+	b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
+			(tuple->src.u.all << 16) | tuple->dst.u.all);
+
+	return jhash_2words(a, b, rnd) % size;
+}
+
+/* Hash @tuple into the main conntrack table, using the current table size
+ * (nf_conntrack_htable_size) and the global seed nf_conntrack_hash_rnd. */
+static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
+{
+	return __hash_conntrack(tuple, nf_conntrack_htable_size,
+				nf_conntrack_hash_rnd);
+}
+
+/* Initialization hook for "struct nf_conn" objects that carry room for
+ * helper data: point conntrack->help at the first address inside
+ * conntrack->data that satisfies the alignment of union nf_conntrack_help.
+ * @features is unused.  Always returns 0. */
+static int
+init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
+{
+	const unsigned long align_mask =
+		(unsigned long)(__alignof__(union nf_conntrack_help) - 1);
+	unsigned long start = (unsigned long)conntrack->data;
+
+	/* round the start of the data area up to the next aligned address */
+	conntrack->help = (union nf_conntrack_help *)
+		((start + align_mask) & ~align_mask);
+	return 0;
+}
+
+/* Register the slab cache used for conntracks with feature set @features.
+ *
+ * @features: index into nf_ct_cache; must be in [NF_CT_F_BASIC, NF_CT_F_NUM)
+ * @name:     slab cache name; must be shorter than NF_CT_FEATURES_NAMELEN
+ * @size:     object size of the slab cache
+ * @init:     optional per-conntrack initialization hook (may be NULL)
+ *
+ * If a cache is already registered for @features, it is shared (its use
+ * count is incremented) provided @name, @size and @init all match;
+ * otherwise -EBUSY is returned.  Returns 0 on success, -EINVAL for an
+ * invalid @features or an over-long @name, and -ENOMEM if the name buffer
+ * or the slab cache cannot be allocated.
+ *
+ * May sleep (takes nf_ct_cache_mutex and creates a slab cache), so it must
+ * be called from process context. */
+int nf_conntrack_register_cache(u_int32_t features, const char *name,
+				size_t size,
+				int (*init)(struct nf_conn *, u_int32_t))
+{
+	int ret = 0;
+	char *cache_name;
+	kmem_cache_t *cachep;
+
+	DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%zu\n",
+	       features, name, size);
+
+	if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
+		DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
+			features);
+		return -EINVAL;
+	}
+
+	down(&nf_ct_cache_mutex);
+
+	write_lock_bh(&nf_ct_cache_lock);
+	/* e.g: multiple helpers are loaded */
+	if (nf_ct_cache[features].use > 0) {
+		DEBUGP("nf_conntrack_register_cache: already resisterd.\n");
+		if ((!strncmp(nf_ct_cache[features].name, name,
+			      NF_CT_FEATURES_NAMELEN))
+		    && nf_ct_cache[features].size == size
+		    && nf_ct_cache[features].init_conntrack == init) {
+			DEBUGP("nf_conntrack_register_cache: reusing.\n");
+			nf_ct_cache[features].use++;
+			ret = 0;
+		} else
+			ret = -EBUSY;
+
+		write_unlock_bh(&nf_ct_cache_lock);
+		up(&nf_ct_cache_mutex);
+		return ret;
+	}
+	write_unlock_bh(&nf_ct_cache_lock);
+
+	/*
+	 * The memory space for name of slab cache must be alive until
+	 * cache is destroyed.
+	 *
+	 * We hold only the mutex here (no spinlock) and are about to call
+	 * kmem_cache_create(), so sleeping is fine: use GFP_KERNEL rather
+	 * than dipping into the atomic reserves.
+	 */
+	cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_KERNEL);
+	if (cache_name == NULL) {
+		DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
+		ret = -ENOMEM;
+		goto out_up_mutex;
+	}
+
+	/* strlcpy() returns strlen(name); >= buffer size means truncation */
+	if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
+						>= NF_CT_FEATURES_NAMELEN) {
+		printk("nf_conntrack_register_cache: name too long\n");
+		ret = -EINVAL;
+		goto out_free_name;
+	}
+
+	cachep = kmem_cache_create(cache_name, size, 0, 0,
+				   NULL, NULL);
+	if (!cachep) {
+		printk("nf_conntrack_register_cache: Can't create slab cache "
+		       "for the features = 0x%x\n", features);
+		ret = -ENOMEM;
+		goto out_free_name;
+	}
+
+	/* publish the fully constructed cache to readers */
+	write_lock_bh(&nf_ct_cache_lock);
+	nf_ct_cache[features].use = 1;
+	nf_ct_cache[features].size = size;
+	nf_ct_cache[features].init_conntrack = init;
+	nf_ct_cache[features].cachep = cachep;
+	nf_ct_cache[features].name = cache_name;
+	write_unlock_bh(&nf_ct_cache_lock);
+
+	goto out_up_mutex;
+
+out_free_name:
+	kfree(cache_name);
+out_up_mutex:
+	up(&nf_ct_cache_mutex);
+	return ret;
+}
+
+/* Drop one registration of the slab cache for feature set @features.
+ *
+ * When the last registration goes away the nf_ct_cache slot is cleared,
+ * synchronize_net() is called, and then the slab cache and its name buffer
+ * are released.  An out-of-range @features, or a slot that has no
+ * registration, is ignored.
+ *
+ * May sleep; call from process context.
+ *
+ * FIXME: In the current, only nf_conntrack_cleanup() can call this function. */
+void nf_conntrack_unregister_cache(u_int32_t features)
+{
+	kmem_cache_t *cachep;
+	char *name;
+
+	DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
+
+	/* same validation as nf_conntrack_register_cache(): never index
+	 * nf_ct_cache[] with an unchecked value */
+	if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
+		DEBUGP("nf_conntrack_unregister_cache: invalid features.: 0x%x\n",
+			features);
+		return;
+	}
+
+	/*
+	 * This assures that kmem_cache_create() isn't called before destroying
+	 * slab cache.
+	 */
+	down(&nf_ct_cache_mutex);
+
+	write_lock_bh(&nf_ct_cache_lock);
+	/* Nothing registered: do not drive the count negative and do not
+	 * hand a NULL cache to kmem_cache_destroy(). */
+	if (nf_ct_cache[features].use <= 0) {
+		write_unlock_bh(&nf_ct_cache_lock);
+		up(&nf_ct_cache_mutex);
+		return;
+	}
+	if (--nf_ct_cache[features].use > 0) {
+		/* still shared with another registration */
+		write_unlock_bh(&nf_ct_cache_lock);
+		up(&nf_ct_cache_mutex);
+		return;
+	}
+	cachep = nf_ct_cache[features].cachep;
+	name = nf_ct_cache[features].name;
+	nf_ct_cache[features].cachep = NULL;
+	nf_ct_cache[features].name = NULL;
+	nf_ct_cache[features].init_conntrack = NULL;
+	nf_ct_cache[features].size = 0;
+	write_unlock_bh(&nf_ct_cache_lock);
+
+	synchronize_net();
+
+	kmem_cache_destroy(cachep);
+	kfree(name);
+
+	up(&nf_ct_cache_mutex);
+}
+
+/* Build the original-direction tuple for a packet.
+ *
+ * @tuple is blanked, its l3num set to @l3num, and then filled in by the
+ * layer 3 handler (@l3proto, header at @nhoff) and by the layer 4 handler
+ * (@protocol, header at @dataoff).  Returns 0 if the layer 3 handler
+ * returns 0; otherwise returns whatever the layer 4 handler returns. */
+int
+nf_ct_get_tuple(const struct sk_buff *skb,
+		unsigned int nhoff,
+		unsigned int dataoff,
+		u_int16_t l3num,
+		u_int8_t protonum,
+		struct nf_conntrack_tuple *tuple,
+		const struct nf_conntrack_l3proto *l3proto,
+		const struct nf_conntrack_protocol *protocol)
+{
+	NF_CT_TUPLE_U_BLANK(tuple);
+
+	tuple->src.l3num = l3num;
+	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
+		return 0;
+
+	tuple->dst.protonum = protonum;
+	tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+	return protocol->pkt_to_tuple(skb, dataoff, tuple);
+}
+
+/* Compute in @inverse the tuple for the opposite direction of @orig.
+ *
+ * @inverse is blanked; l3num and protonum are copied from @orig, the
+ * direction flag is flipped, and the layer 3 (@l3proto) and layer 4
+ * (@protocol) handlers invert their respective parts.  Returns 0 if the
+ * layer 3 handler returns 0; otherwise returns the layer 4 handler's
+ * result. */
+int
+nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
+		   const struct nf_conntrack_tuple *orig,
+		   const struct nf_conntrack_l3proto *l3proto,
+		   const struct nf_conntrack_protocol *protocol)
+{
+	NF_CT_TUPLE_U_BLANK(inverse);
+
+	inverse->src.l3num = orig->src.l3num;
+	if (l3proto->invert_tuple(inverse, orig) == 0)
+		return 0;
+
+	inverse->dst.dir = !orig->dst.dir;
+
+	inverse->dst.protonum = orig->dst.protonum;
+	return protocol->invert_tuple(inverse, orig);
+}
+
+/* nf_conntrack_expect helper functions */
+/* Unlink @exp from the global expectation list, decrement its master's
+ * count of outstanding expectations and drop one reference on @exp.
+ *
+ * The caller must hold nf_conntrack_lock for writing and must already have
+ * stopped the expectation's timer (asserted below). */
+static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+{
+	ASSERT_WRITE_LOCK(&nf_conntrack_lock);
+	/* check this expectation's own timer; the previous "exp_timeout"
+	 * named no existing object and only compiled while NF_CT_ASSERT
+	 * expanded to nothing */
+	NF_CT_ASSERT(!timer_pending(&exp->timeout));
+	list_del(&exp->list);
+	NF_CT_STAT_INC(expect_delete);
+	exp->master->expecting--;
+	nf_conntrack_expect_put(exp);
+}
+
+/* Timer callback for an expectation; @ul_expect is the
+ * struct nf_conntrack_expect pointer cast to unsigned long.
+ *
+ * Unlinks the expectation under the nf_conntrack_lock write lock, then
+ * drops a further reference after releasing the lock (presumably the one
+ * held on behalf of the timer -- confirm against the code that arms it). */
+static void expectation_timed_out(unsigned long ul_expect)
+{
+	struct nf_conntrack_expect *exp = (void *)ul_expect;
+
+	write_lock_bh(&nf_conntrack_lock);
+	nf_ct_unlink_expect(exp);
+	write_unlock_bh(&nf_conntrack_lock);
+	nf_conntrack_expect_put(exp);
+}
+
+/* Find an expectation matching @tuple whose master is already confirmed.
+ *
+ * A permanent expectation (NF_CT_EXPECT_PERMANENT) stays on the global
+ * list and is returned with its use count incremented.  Any other match is
+ * returned only if its timer could be stopped with del_timer(), in which
+ * case it is also unlinked from the global list.  Returns NULL if nothing
+ * suitable is found.
+ *
+ * NOTE(review): the only caller visible here holds nf_conntrack_lock for
+ * writing; nf_ct_unlink_expect() requires that. */
+static struct nf_conntrack_expect *
+find_expectation(const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_expect *i;
+
+	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
+	/* If master is not in hash table yet (ie. packet hasn't left
+	   this machine yet), how can other end know about expected?
+	   Hence these are not the droids you are looking for (if
+	   master ct never got confirmed, we'd hold a reference to it
+	   and weird things would happen to future packets). */
+		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+		    && nf_ct_is_confirmed(i->master)) {
+			if (i->flags & NF_CT_EXPECT_PERMANENT) {
+				atomic_inc(&i->use);
+				return i;
+			} else if (del_timer(&i->timeout)) {
+				nf_ct_unlink_expect(i);
+				return i;
+			}
+		}
+	}
+	return NULL;
+}
+
+/* Delete all expectations whose master is @ct.
+ *
+ * Only expectations whose timer can still be stopped with del_timer() are
+ * removed here; for the others the timer handler is expected to do the
+ * cleanup.  Each removed expectation is unlinked and then put once more
+ * (presumably releasing the timer's reference -- confirm).
+ *
+ * NOTE(review): both callers visible here hold nf_conntrack_lock for
+ * writing. */
+static void remove_expectations(struct nf_conn *ct)
+{
+	struct nf_conntrack_expect *i, *tmp;
+
+	/* Optimization: most connection never expect any others. */
+	if (ct->expecting == 0)
+		return;
+
+	/* _safe variant: entries are unlinked while walking the list */
+	list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
+		if (i->master == ct && del_timer(&i->timeout)) {
+			nf_ct_unlink_expect(i);
+			nf_conntrack_expect_put(i);
+ 		}
+	}
+}
+
+/* Remove @ct from the main hash table (both the original-direction and the
+ * reply-direction chain) and destroy all of its pending expectations.
+ * The caller must hold nf_conntrack_lock for writing. */
+static void
+clean_from_lists(struct nf_conn *ct)
+{
+	unsigned int ho, hr;
+	
+	DEBUGP("clean_from_lists(%p)\n", ct);
+	ASSERT_WRITE_LOCK(&nf_conntrack_lock);
+
+	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+	LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
+
+	/* Destroy all pending expectations */
+	remove_expectations(ct);
+}
+
+/* Destructor installed as ct_general.destroy for every conntrack.
+ *
+ * Emits IPCT_DESTROY, marks the conntrack dying, runs the optional layer 3
+ * and layer 4 destroy hooks and the nf_conntrack_destroyed callback, then
+ * (under the nf_conntrack_lock write lock) removes remaining expectations
+ * and, for a never-confirmed conntrack, unlinks it from the unconfirmed
+ * list.  Finally drops the reference on the master, if any, and returns
+ * the object to its slab cache.
+ *
+ * Takes nf_conntrack_lock itself, so it must not be called with that lock
+ * held. */
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+	struct nf_conn *ct = (struct nf_conn *)nfct;
+	struct nf_conntrack_l3proto *l3proto;
+	struct nf_conntrack_protocol *proto;
+
+	DEBUGP("destroy_conntrack(%p)\n", ct);
+	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
+	NF_CT_ASSERT(!timer_pending(&ct->timeout));
+
+	nf_conntrack_event(IPCT_DESTROY, ct);
+	set_bit(IPS_DYING_BIT, &ct->status);
+
+	/* To make sure we don't get any weird locking issues here:
+	 * destroy_conntrack() MUST NOT be called with a write lock
+	 * to nf_conntrack_lock!!! -HW */
+	l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
+	if (l3proto && l3proto->destroy)
+		l3proto->destroy(ct);
+
+	proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
+				 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+	if (proto && proto->destroy)
+		proto->destroy(ct);
+
+	if (nf_conntrack_destroyed)
+		nf_conntrack_destroyed(ct);
+
+	write_lock_bh(&nf_conntrack_lock);
+	/* Expectations will have been removed in clean_from_lists,
+	 * except TFTP can create an expectation on the first packet,
+	 * before connection is in the list, so we need to clean here,
+	 * too. */
+	remove_expectations(ct);
+
+	/* We overload first tuple to link into unconfirmed list. */
+	if (!nf_ct_is_confirmed(ct)) {
+		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	}
+
+	NF_CT_STAT_INC(delete);
+	write_unlock_bh(&nf_conntrack_lock);
+
+	if (ct->master)
+		nf_ct_put(ct->master);
+
+	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
+	nf_conntrack_free(ct);
+}
+
+/* Timer callback for a conntrack (also called directly by early_drop());
+ * @ul_conntrack is the struct nf_conn pointer cast to unsigned long.
+ *
+ * Removes the conntrack from the hash table and drops its expectations
+ * under the write lock, then puts one reference (the one taken when the
+ * conntrack was confirmed and its timer started). */
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;
+
+	write_lock_bh(&nf_conntrack_lock);
+	/* Inside lock so preempt is disabled on module removal path.
+	 * Otherwise we can get spurious warnings. */
+	NF_CT_STAT_INC(delete_list);
+	clean_from_lists(ct);
+	write_unlock_bh(&nf_conntrack_lock);
+	nf_ct_put(ct);
+}
+
+/* Match predicate for hash chain walks: non-zero if entry @i carries a
+ * tuple equal to @tuple and does not belong to @ignored_conntrack
+ * (pass NULL to ignore nothing). */
+static inline int
+conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
+		    const struct nf_conntrack_tuple *tuple,
+		    const struct nf_conn *ignored_conntrack)
+{
+	ASSERT_READ_LOCK(&nf_conntrack_lock);
+	return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
+		&& nf_ct_tuple_equal(tuple, &i->tuple);
+}
+
+/* Look up @tuple in the main hash table, skipping entries that belong to
+ * @ignored_conntrack.  Returns the matching hash entry or NULL.
+ *
+ * No reference is taken on the result; the caller must hold
+ * nf_conntrack_lock.  Updates the "found" / "searched" statistics. */
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
+		    const struct nf_conn *ignored_conntrack)
+{
+	struct nf_conntrack_tuple_hash *h;
+	unsigned int hash = hash_conntrack(tuple);
+
+	ASSERT_READ_LOCK(&nf_conntrack_lock);
+	list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
+		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+			NF_CT_STAT_INC(found);
+			return h;
+		}
+		NF_CT_STAT_INC(searched);
+	}
+
+	return NULL;
+}
+
+/* Find a connection corresponding to a tuple.
+ *
+ * Locked wrapper around __nf_conntrack_find(): on a hit the conntrack's
+ * use count is incremented before the read lock is released, so the caller
+ * owns a reference and must put it when done.  Returns NULL on a miss. */
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
+		      const struct nf_conn *ignored_conntrack)
+{
+	struct nf_conntrack_tuple_hash *h;
+
+	read_lock_bh(&nf_conntrack_lock);
+	h = __nf_conntrack_find(tuple, ignored_conntrack);
+	if (h)
+		atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
+	read_unlock_bh(&nf_conntrack_lock);
+
+	return h;
+}
+
+/* Confirm a connection given skb; places it in hash table.
+ *
+ * Packets in the reply direction are accepted without further action.
+ * Otherwise, under the write lock, both hash chains are checked for an
+ * entry with the same original or reply tuple.  If neither exists, the
+ * conntrack is moved from the unconfirmed list into both chains, its timer
+ * is started, an extra reference is taken, IPS_CONFIRMED_BIT is set and the
+ * relevant events are cached; NF_ACCEPT is returned.  If a clashing entry
+ * exists, insert_failed is counted and NF_DROP is returned. */
+int
+__nf_conntrack_confirm(struct sk_buff **pskb)
+{
+	unsigned int hash, repl_hash;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = nf_ct_get(*pskb, &ctinfo);
+
+	/* ipt_REJECT uses nf_conntrack_attach to attach related
+	   ICMP/TCP RST packets in other direction.  Actual packet
+	   which created connection will be IP_CT_NEW or for an
+	   expected connection, IP_CT_RELATED. */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	/* We're not in hash table, and we refuse to set up related
+	   connections for unconfirmed conns.  But packet copies and
+	   REJECT will give spurious warnings here. */
+	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
+
+	/* No external references means noone else could have
+	   confirmed us. */
+	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+	DEBUGP("Confirming conntrack %p\n", ct);
+
+	write_lock_bh(&nf_conntrack_lock);
+
+	/* See if there's one in the list already, including reverse:
+	   NAT could have grabbed it without realizing, since we're
+	   not in the hash.  If there is, we lost race. */
+	if (!LIST_FIND(&nf_conntrack_hash[hash],
+		       conntrack_tuple_cmp,
+		       struct nf_conntrack_tuple_hash *,
+		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
+	    && !LIST_FIND(&nf_conntrack_hash[repl_hash],
+			  conntrack_tuple_cmp,
+			  struct nf_conntrack_tuple_hash *,
+			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+		/* Remove from unconfirmed list */
+		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
+		list_prepend(&nf_conntrack_hash[hash],
+			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+		list_prepend(&nf_conntrack_hash[repl_hash],
+			     &ct->tuplehash[IP_CT_DIR_REPLY]);
+		/* Timer relative to confirmation time, not original
+		   setting time, otherwise we'd get timer wrap in
+		   weird delay cases. */
+		ct->timeout.expires += jiffies;
+		add_timer(&ct->timeout);
+		/* extra reference, put again in death_by_timeout() */
+		atomic_inc(&ct->ct_general.use);
+		set_bit(IPS_CONFIRMED_BIT, &ct->status);
+		NF_CT_STAT_INC(insert);
+		write_unlock_bh(&nf_conntrack_lock);
+		if (ct->helper)
+			nf_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_NF_NAT_NEEDED
+		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+			nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+		nf_conntrack_event_cache(master_ct(ct) ?
+					 IPCT_RELATED : IPCT_NEW, *pskb);
+		return NF_ACCEPT;
+	}
+
+	NF_CT_STAT_INC(insert_failed);
+	write_unlock_bh(&nf_conntrack_lock);
+	return NF_DROP;
+}
+
+/* Returns true if a connection corresponding to the tuple already exists
+   in the hash table (required for NAT).  Entries that belong to
+   @ignored_conntrack do not count. */
+int
+nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
+			 const struct nf_conn *ignored_conntrack)
+{
+	int taken;
+
+	read_lock_bh(&nf_conntrack_lock);
+	taken = (__nf_conntrack_find(tuple, ignored_conntrack) != NULL);
+	read_unlock_bh(&nf_conntrack_lock);
+
+	return taken;
+}
+
+/* Predicate used by early_drop(): non-zero if the conntrack owning hash
+   entry @i does not have IPS_ASSURED_BIT set.
+   There's a small race here where we may free a just-assured
+   connection.  Too bad: we're in trouble anyway. */
+static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
+{
+	const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
+
+	return test_bit(IPS_ASSURED_BIT, &ct->status) ? 0 : 1;
+}
+
+/* Try to evict one not-yet-assured conntrack from hash chain @chain to make
+ * room for a new one.  Returns 1 if a conntrack was dropped, 0 otherwise.
+ *
+ * A temporary reference is taken under the read lock so the victim cannot
+ * go away before we are done.  It is killed only if we win the race to
+ * stop its timer (del_timer() succeeded); otherwise the timer handler is
+ * already taking care of it. */
+static int early_drop(struct list_head *chain)
+{
+	/* Traverse backwards: gives us oldest, which is roughly LRU */
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct = NULL;
+	int dropped = 0;
+
+	read_lock_bh(&nf_conntrack_lock);
+	h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
+	if (h) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		atomic_inc(&ct->ct_general.use);
+	}
+	read_unlock_bh(&nf_conntrack_lock);
+
+	if (!ct)
+		return dropped;
+
+	if (del_timer(&ct->timeout)) {
+		death_by_timeout((unsigned long)ct);
+		dropped = 1;
+		NF_CT_STAT_INC(early_drop);
+	}
+	nf_ct_put(ct);
+	return dropped;
+}
+
+/* Match predicate for the helper list: non-zero if @rtuple matches helper
+ * @i's tuple under the helper's mask. */
+static inline int helper_cmp(const struct nf_conntrack_helper *i,
+			     const struct nf_conntrack_tuple *rtuple)
+{
+	return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
+}
+
+/* Return the first registered helper whose tuple/mask matches @tuple, or
+ * NULL if none does.  Both callers visible here pass the reply tuple and
+ * hold nf_conntrack_lock. */
+static struct nf_conntrack_helper *
+nf_ct_find_helper(const struct nf_conntrack_tuple *tuple)
+{
+	return LIST_FIND(&helpers, helper_cmp,
+			 struct nf_conntrack_helper *,
+			 tuple);
+}
+
+/* Allocate and minimally initialise a conntrack for the tuple pair
+ * (@orig, @repl).
+ *
+ * Return value (callers must handle all three cases):
+ *   - ERR_PTR(-ENOMEM) if the table is full (nf_conntrack_max reached) and
+ *     no entry could be dropped early from @orig's hash chain;
+ *   - NULL if no slab cache is registered for the required feature set,
+ *     the slab allocation fails, or the cache's init hook fails;
+ *   - otherwise a zeroed conntrack with use count 1, destroy callback and
+ *     both tuples set, and an initialised but not yet started timer.
+ *
+ * The feature set is what @l3proto reports for @orig, plus NF_CT_F_HELP if
+ * a helper matches @repl.  On success nf_conntrack_count is incremented. */
+static struct nf_conn *
+__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+		     const struct nf_conntrack_tuple *repl,
+		     const struct nf_conntrack_l3proto *l3proto)
+{
+	struct nf_conn *conntrack = NULL;
+	u_int32_t features = 0;
+
+	/* seed the hash lazily, on the first allocation */
+	if (!nf_conntrack_hash_rnd_initted) {
+		get_random_bytes(&nf_conntrack_hash_rnd, 4);
+		nf_conntrack_hash_rnd_initted = 1;
+	}
+
+	if (nf_conntrack_max
+	    && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
+		unsigned int hash = hash_conntrack(orig);
+		/* Try dropping from this hash chain. */
+		if (!early_drop(&nf_conntrack_hash[hash])) {
+			if (net_ratelimit())
+				printk(KERN_WARNING
+				       "nf_conntrack: table full, dropping"
+				       " packet.\n");
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	/*  find features needed by this conntrack. */
+	features = l3proto->get_features(orig);
+	read_lock_bh(&nf_conntrack_lock);
+	if (nf_ct_find_helper(repl) != NULL)
+		features |= NF_CT_F_HELP;
+	read_unlock_bh(&nf_conntrack_lock);
+
+	DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
+
+	/* held until "out" so the cache slot cannot change underneath us */
+	read_lock_bh(&nf_ct_cache_lock);
+
+	if (!nf_ct_cache[features].use) {
+		DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
+			features);
+		goto out;
+	}
+
+	conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
+	if (conntrack == NULL) {
+		DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
+		goto out;
+	}
+
+	memset(conntrack, 0, nf_ct_cache[features].size);
+	conntrack->features = features;
+	if (nf_ct_cache[features].init_conntrack &&
+	    nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
+		DEBUGP("nf_conntrack_alloc: failed to init\n");
+		kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
+		conntrack = NULL;
+		goto out;
+	}
+
+	atomic_set(&conntrack->ct_general.use, 1);
+	conntrack->ct_general.destroy = destroy_conntrack;
+	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+	/* Don't set timer yet: wait for confirmation */
+	init_timer(&conntrack->timeout);
+	conntrack->timeout.data = (unsigned long)conntrack;
+	conntrack->timeout.function = death_by_timeout;
+
+	atomic_inc(&nf_conntrack_count);
+out:
+	read_unlock_bh(&nf_ct_cache_lock);
+	return conntrack;
+}
+
+/* Allocate a conntrack for the tuple pair (@orig, @repl), using the layer 3
+ * protocol handler registered for @orig's l3num.  The return value is
+ * passed through unchanged from __nf_conntrack_alloc(). */
+struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+				   const struct nf_conntrack_tuple *repl)
+{
+	return __nf_conntrack_alloc(orig, repl,
+				    nf_ct_find_l3proto(orig->src.l3num));
+}
+
+/* Return @conntrack to the slab cache selected by its own features field
+ * and decrement the global conntrack count.  Counterpart of a successful
+ * __nf_conntrack_alloc(). */
+void nf_conntrack_free(struct nf_conn *conntrack)
+{
+	u_int32_t features = conntrack->features;
+	NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
+	DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
+	       conntrack);
+	kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
+	atomic_dec(&nf_conntrack_count);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+   failed due to stress.  Otherwise it really is unclassifiable. */
+static struct nf_conntrack_tuple_hash *
+init_conntrack(const struct nf_conntrack_tuple *tuple,
+	       struct nf_conntrack_l3proto *l3proto,
+	       struct nf_conntrack_protocol *protocol,
+	       struct sk_buff *skb,
+	       unsigned int dataoff)
+{
+	struct nf_conn *conntrack;
+	struct nf_conntrack_tuple repl_tuple;
+	struct nf_conntrack_expect *exp;
+
+	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
+		DEBUGP("Can't invert tuple.\n");
+		return NULL;
+	}
+
+	conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
+	if (conntrack == NULL || IS_ERR(conntrack)) {
+		DEBUGP("Can't allocate conntrack.\n");
+		return (struct nf_conntrack_tuple_hash *)conntrack;
+	}
+
+	if (!protocol->new(conntrack, skb, dataoff)) {
+		nf_conntrack_free(conntrack);
+		DEBUGP("init conntrack: can't track with proto module\n");
+		return NULL;
+	}
+
+	write_lock_bh(&nf_conntrack_lock);
+	exp = find_expectation(tuple);
+
+	if (exp) {
+		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
+			conntrack, exp);
+		/* Welcome, Mr. Bond.  We've been expecting you... */
+		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
+		conntrack->master = exp->master;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+		conntrack->mark = exp->master->mark;
+#endif
+		nf_conntrack_get(&conntrack->master->ct_general);
+		NF_CT_STAT_INC(expect_new);
+	} else {
+		conntrack->helper = nf_ct_find_helper(&repl_tuple);
+
+		NF_CT_STAT_INC(new);
+        }
+
+	/* Overload tuple linked list to put us in unconfirmed list. */
+	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
+	write_unlock_bh(&nf_conntrack_lock);
+
+	if (exp) {
+		if (exp->expect
author	Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>	2005-11-09 16:38:16 -0800
committer	David S. Miller <davem@davemloft.net>	2005-11-09 16:38:16 -0800
commit	9fb9cbb1082d6b31fb45aa1a14432449a0df6cf1 (patch)
tree	c964a62bdd766eca436c30f51a9e33e2b798b0a6 /net/netfilter
parent	6730c3c14421b7c924d06e31bb66e0adad225547 (diff)